Wednesday, December 30, 2009

PLSV

Here is the Perl script for PLSV that I wrote a while back.
I used SGD as the gradient method.
It is about 320 lines long.
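
A quick sketch of the model, as I read it from the PLSV paper and as the script below implements it: each document n has a coordinate x_n (xai in the code) and each topic z a coordinate phi_z in the 2-dimensional visualization space. A document's topic proportions are a softmax over squared Euclidean distances,

P(z | x_n, Phi) = exp(-||x_n - phi_z||^2 / 2) / sum_z' exp(-||x_n - phi_z'||^2 / 2)

and each word is generated by P(w | x_n, Phi, Theta) = sum_z P(z | x_n, Phi) * theta_{z,w}. Learning is MAP-EM: the E-step computes the responsibilities P(z | n, m), and the M-step re-estimates Theta in closed form with Dirichlet smoothing alpha and moves x_n and phi_z by SGD, with gamma and beta acting as Gaussian priors on the coordinates.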


#!/usr/bin/perl
# Probabilistic Latent Semantic Visualization
# Copyright (c) 2009, Kei Uchiumi

use warnings;
use strict;

# Usage
# perl plsv.pl corpus

our $dimension = 2;
our $topicsize = 2;
our $alpha = 0.01;
our $beta = 0.0001;
our $ganma = 0.0001 * $topicsize;
our $docnum = 0; # document size N
our $iteration = 50;

# for sgd parameters
our $rate = 0.1; # learning rate
our $input = shift;

my %W;

open(F, "$input") or die "Couldn't open $input $!";
while (<F>)
{
chomp;
my $line = $_;
my @tokens = split(/\s/,$line);
&storeword(\%W, \@tokens);
$docnum++;
}
close(F);

our $wordsize = (keys %W);
$beta = $beta * $docnum;

# init parameters
our %theta;
our %phi;
our %xai;

# init phi
for (my $i = 0; $i < $topicsize; $i++)
{
my @position;
for (my $d = 0; $d < $dimension; $d++)
{
#$position[$d] = rand;
$position[$d] = 1;
}
$phi{$i} = \@position;
}
# init xai
for (my $i = 0; $i < $docnum; $i++)
{
my @position;
for (my $d = 0; $d < $dimension; $d++)
{
$position[$d] = 0;
}
$xai{$i} = \@position;
}
# init theta
for (my $i = 0; $i < $topicsize; $i++)
{
my @words;
my $denominator = 0;
for (my $j = 0; $j < $wordsize; $j++)
{
$words[$j] = -log(1.0 - rand);
$denominator += $words[$j];
}
for (my $j = 0; $j < $wordsize; $j++)
{
$words[$j] = $words[$j] / $denominator;
}
$theta{$i} = \@words;
}

our %prob_zpx;
our %prob_zpnm;

# learning start
for (my $i = 0; $i < $iteration; $i++)
{
&expectation($input);
&maximization($input);
}

# output
use Data::Dumper;
print "Result\n";
print "Phi\n";
print Dumper(%phi);
print "Xai\n";
print Dumper(%xai);
print "Theta\n";
print Dumper(%theta);

# functions
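# xaiupdate: one SGD step on the coordinate of document $docid with respect to topic $topic.
# $grad is P(z|x_n) - P(z|n,m); the gamma term comes from the Gaussian prior on xai.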
sub xaiupdate
{
my $docid = shift;
my $topic = shift;
my $grad = shift;

my $x = $xai{$docid};
my $p = $phi{$topic};

for (my $i = 0; $i < $dimension; $i++)
{
my $diff = $grad * ($x->[$i] - $p->[$i]) - $ganma * $x->[$i];
$x->[$i] += $rate * $diff;
}
return;
}

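# phiupdate: one SGD step on the coordinate of topic $topic, with the roles of
# xai and phi swapped and beta as the Gaussian prior on phi.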
sub phiupdate
{
my $docid = shift;
my $topic = shift;
my $grad = shift;

my $x = $xai{$docid};
my $p = $phi{$topic};

for (my $i = 0; $i < $dimension; $i++)
{
my $diff = $grad * ($p->[$i] - $x->[$i]) - $beta * $p->[$i];
$p->[$i] += $rate * $diff;
}
return;
}

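# update: M-step for the coordinates -- one SGD pass over every
# (document, word position, topic) triple using the E-step responsibilities.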
sub update
{
my $input = shift;

my $docid = 0;
open(F,"$input") or die "Couldn't open $input $!";
while (<F>)
{
chomp;
my $line = $_;
my @tokens = split(/\s/,$line);

my $p_zpnm = $prob_zpnm{$docid};
for (my $i = 0; $i < @tokens; $i++)
{
my $p_znm = $p_zpnm->{$i};
for (my $j = 0; $j < $topicsize; $j++)
{
my $p_zpx = $prob_zpx{$docid}->[$j];
my $p_z = $p_znm->[$j];
my $grad = $p_zpx - $p_z;
&xaiupdate($docid,$j,$grad);
&phiupdate($docid,$j,$grad);
}
}
$docid++;
}
close(F);
}

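# thetaupdate: MAP re-estimate of theta{$topic}[$word] from the responsibilities
# P(z|n,m), smoothed by alpha (Dirichlet prior).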
sub thetaupdate
{
my $input = shift;
my $topic = shift;
my $word = shift;

my $numerator = 0;
my $denominator = 0;
my $docid = 0;
open(F,"$input") or die "Couldn't open $input $!";
while (<F>)
{
chomp;
my $line = $_;
my @tokens = split(/\s/,$line);

my $p_zpnm = $prob_zpnm{$docid};
for (my $i = 0; $i < @tokens; $i++)
{
my $p_znm = $p_zpnm->{$i};
if ($tokens[$i] eq $word)
{
$numerator += $p_znm->[$topic];
}
$denominator += $p_znm->[$topic];
}
$docid++;
}
close(F);

return ($numerator+$alpha)/($denominator+$alpha*$wordsize);
}

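# maximization: M-step -- re-estimate Theta, then update xai and phi by SGD.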
sub maximization
{
my $input = shift;
# theta update
for (my $i = 0; $i < $topicsize; $i++)
{
for (my $j = 0; $j < $wordsize; $j++)
{
$theta{$i}->[$j] = &thetaupdate($input,$i,$j);
}
}
# xai, phi update
&update($input);

return;
}

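# euclid: squared Euclidean distance between the coordinates of document $docid and topic $topic.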
sub euclid
{
my $topic = shift;
my $docid = shift;

my $docpositions = $xai{$docid};
my $topicpositions = $phi{$topic};

my $d = 0;
for (my $i = 0; $i < $dimension; $i++)
{
my $diff = $docpositions->[$i] - $topicpositions->[$i];
$d += $diff * $diff;
}

return $d;
}

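# dist: P(z|x_n, Phi), a softmax over -1/2 times the squared distances to the topic coordinates.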
sub dist
{
my $topic = shift;
my $docid = shift;

my $denominator = 0;
for (my $i = 0; $i < $topicsize; $i++)
{
$denominator += exp(-1/2 * &euclid($i, $docid));
}
my $numerator = exp(-1/2 * &euclid($topic, $docid));

return $numerator/$denominator;
}

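# posterior: responsibility P(z|n,m) of topic $topic for word $word in document $docid,
# proportional to P(z|x_n) * theta_{z,w}.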
sub posterior
{
my $docid = shift;
my $topic = shift;
my $word = shift;

my $p_zpx = $prob_zpx{$docid};
my $denominator = 0;
for (my $i = 0; $i < $topicsize; $i++)
{
$denominator += $p_zpx->[$i] * $theta{$i}->[$word];
}
my $numerator = $p_zpx->[$topic] * $theta{$topic}->[$word];

return $numerator/$denominator;
}

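# expectation: E-step -- compute P(z|x_n) for every document and P(z|n,m) for every word occurrence.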
sub expectation
{
my $input = shift;
for (my $i = 0; $i < $docnum; $i++)
{
my @probs;
for (my $j = 0; $j < $topicsize; $j++)
{
my $prob = &dist($j,$i);
$probs[$j] = $prob;
}
$prob_zpx{$i} = \@probs;
}

my $docid = 0;
open(F,"$input") or die "Couldn't open $input $!";
while (<F>)
{
chomp;
my $line = $_;
my @tokens = split(/\s/,$line);

my %probs_znm;
for (my $i = 0; $i < @tokens; $i++)
{
my @probs;
for (my $j = 0; $j < $topicsize; $j++)
{
my $p = &posterior($docid, $j, $tokens[$i]);
$probs[$j] = $p;
}
$probs_znm{$i} = \@probs;
}

$prob_zpnm{$docid} = \%probs_znm;
$docid++;
}
close(F);
return;
}

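# storeword: register each token in the vocabulary hash (used only to count the vocabulary size).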
sub storeword
{
my $wh = shift;
my $ta = shift;

foreach my $w (@$ta)
{
unless (defined $wh->{$w})
{
$wh->{$w} = 1;
}
}
return;
}



An example input file is shown below.
Each line corresponds to one document.
The numbers are word IDs; they are assumed to be consecutive integers starting at 0, since the script uses them directly as indices into Theta.

0 1 2 3
0 1 2 3
4 5 6 7
4 5 6 7


Example run

# perl plsv.pl sample.txt
Result
Phi
$VAR1 = '1';
$VAR2 = [
'-0.581827405837318',
'-0.581827405837318'
];
$VAR3 = '0';
$VAR4 = [
'1.50610033709651',
'1.50610033709651'
];
Xai
$VAR1 = '1';
$VAR2 = [
'-0.42461538023816',
'-0.42461538023816'
];
$VAR3 = '3';
$VAR4 = [
'1.29019177243395',
'1.29019177243395'
];
$VAR5 = '0';
$VAR6 = [
'-0.393904728506928',
'-0.393904728506928'
];
$VAR7 = '2';
$VAR8 = [
'1.29484945144959',
'1.29484945144959'
];
Theta
$VAR1 = '1';
$VAR2 = [
'0.248699888255414',
'0.24869988511089',
'0.248699888504977',
'0.248699890098641',
'0.00130761840347527',
'0.00129605716166152',
'0.00129717365378877',
'0.00129959881115254'
];
$VAR3 = '0';
$VAR4 = [
'0.00128080308998399',
'0.00128080623499826',
'0.00128080284038186',
'0.00128080124646881',
'0.248711689079392',
'0.248723252125832',
'0.248722135459428',
'0.248719709923515'
];
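
On this toy corpus the embedding comes out as expected: documents 0 and 1 (words 0-3) are placed near topic 1, documents 2 and 3 (words 4-7) near topic 0, and each topic's Theta concentrates on the four words of its own documents.

Since the point of PLSV is to plot these coordinates, here is a minimal sketch of how one could dump them for plotting. It assumes it is pasted at the end of the script above (so %xai and %phi are in scope); the file name coords.tsv is just an example.

# Write document and topic coordinates as tab-separated (label, x, y) lines,
# e.g. for gnuplot's "with labels" style or a spreadsheet scatter plot.
open(my $out, '>', 'coords.tsv') or die "Couldn't open coords.tsv $!";
foreach my $d (sort { $a <=> $b } keys %xai)
{
    print $out join("\t", "doc$d", @{$xai{$d}}), "\n";
}
foreach my $z (sort { $a <=> $b } keys %phi)
{
    print $out join("\t", "topic$z", @{$phi{$z}}), "\n";
}
close($out);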