勾配法には SGD（確率的勾配降下法）を使用した。
コードは約320行。
#!/usr/bin/perl
# Probabilistic Latent Semantic Visualization
# Copyright (c) 2009, Kei Uchiumi
use warnings;
use strict;
# Usage
# perl plsv.pl corpus
# Input: one document per line, whitespace-separated numeric word IDs
# (IDs are assumed to be dense 0..V-1, since $wordsize is taken as the
# vocabulary count and later used as an array index bound).
our $dimension = 2;               # dimensionality of the visualization space
our $topicsize = 2;               # number of topics Z
our $alpha = 0.01;                # Dirichlet smoothing for theta (topic-word)
our $beta = 0.0001;               # prior strength for topic coordinates phi
our $ganma = 0.0001 * $topicsize; # prior strength for document coordinates xai ("gamma")
our $docnum = 0;                  # document size N
our $iteration = 50;              # number of EM iterations
# for sgd parameters
our $rate = 0.1; # learning rate
our $input = shift;
my %W;
# First pass over the corpus: collect the vocabulary and count documents.
# NOTE: the readline "<$fh>" was lost from the original listing (HTML
# escaping ate the angle brackets); restored here with a lexical
# filehandle and 3-arg open.
open my $fh, '<', $input or die "Couldn't open $input $!";
while (my $line = <$fh>)
{
chomp $line;
my @tokens = split(/\s/,$line);
storeword(\%W, \@tokens);
$docnum++;
}
close $fh;
our $wordsize = (keys %W);
# Scale the phi prior with corpus size.
$beta = $beta * $docnum;
# Model parameters (package globals; shared by the subs below).
our %theta; # topic  -> arrayref of word probabilities
our %phi;   # topic  -> arrayref of coordinates in the visualization space
our %xai;   # docid  -> arrayref of coordinates in the visualization space
# Initialize phi: every topic starts at (1, 1, ...).
foreach my $z (0 .. $topicsize - 1)
{
my @coords;
foreach my $d (0 .. $dimension - 1)
{
#$coords[$d] = rand;
$coords[$d] = 1;
}
$phi{$z} = \@coords;
}
# Initialize xai: every document starts at the origin.
foreach my $doc (0 .. $docnum - 1)
{
my @coords;
foreach my $d (0 .. $dimension - 1)
{
$coords[$d] = 0;
}
$xai{$doc} = \@coords;
}
# Initialize theta: exponential draws normalized to sum to one
# (equivalent to sampling from a symmetric Dirichlet(1)).
foreach my $z (0 .. $topicsize - 1)
{
my @weights;
my $total = 0;
foreach my $w (0 .. $wordsize - 1)
{
$weights[$w] = -log(1.0 - rand);
$total += $weights[$w];
}
foreach my $w (0 .. $wordsize - 1)
{
$weights[$w] /= $total;
}
$theta{$z} = \@weights;
}
our %prob_zpx;  # P(z|x): docid -> arrayref over topics
our %prob_zpnm; # P(z|x,w): docid -> { token index -> arrayref over topics }
# Run EM: alternate the E-step (posteriors) and M-step (theta + SGD on coordinates).
foreach my $round (1 .. $iteration)
{
expectation($input);
maximization($input);
}
# Dump the learned parameters.
use Data::Dumper;
print "Result\n";
print "Phi\n";
print Dumper(%phi);
print "Xai\n";
print Dumper(%xai);
print "Theta\n";
print Dumper(%theta);
# functions
# One SGD step on document $docid's coordinate with respect to topic $topic.
# $grad is the gradient factor P(z|x) - P(z|x,w); the $ganma term shrinks
# the coordinate toward the origin (Gaussian prior on xai).
sub xaiupdate
{
my ($docid, $topic, $grad) = @_;
my $doc_pos   = $xai{$docid};
my $topic_pos = $phi{$topic};
foreach my $d (0 .. $dimension - 1)
{
my $step = $grad * ($doc_pos->[$d] - $topic_pos->[$d])
         - $ganma * $doc_pos->[$d];
$doc_pos->[$d] += $rate * $step;
}
return;
}
# One SGD step on topic $topic's coordinate with respect to document $docid.
# Mirror image of xaiupdate: the difference is taken phi - xai and the
# shrinkage term uses $beta (Gaussian prior on phi).
sub phiupdate
{
my ($docid, $topic, $grad) = @_;
my $doc_pos   = $xai{$docid};
my $topic_pos = $phi{$topic};
foreach my $d (0 .. $dimension - 1)
{
my $step = $grad * ($topic_pos->[$d] - $doc_pos->[$d])
         - $beta * $topic_pos->[$d];
$topic_pos->[$d] += $rate * $step;
}
return;
}
# One SGD sweep over the corpus, moving both document (xai) and topic (phi)
# coordinates using the posteriors computed in the last E-step.
# NOTE: the readline "<$fh>" was lost from the original listing; restored
# with a lexical filehandle and 3-arg open.
sub update
{
my $input = shift;
my $docid = 0;
open my $fh, '<', $input or die "Couldn't open $input $!";
while (my $line = <$fh>)
{
chomp $line;
my @tokens = split(/\s/,$line);
my $p_zpnm = $prob_zpnm{$docid};
for (my $i = 0; $i < @tokens; $i++)
{
my $p_znm = $p_zpnm->{$i};
for (my $j = 0; $j < $topicsize; $j++)
{
my $p_zpx = $prob_zpx{$docid}->[$j];
my $p_z = $p_znm->[$j];
# Gradient factor: P(z|x) - P(z|x,w) for this token/topic pair.
my $grad = $p_zpx - $p_z;
xaiupdate($docid,$j,$grad);
phiupdate($docid,$j,$grad);
}
}
$docid++;
}
close $fh;
return;
}
# Re-estimate theta[$topic][$word]: the expected count of $word under
# $topic divided by the expected total count for $topic, with additive
# ($alpha) Dirichlet smoothing.
# NOTE: the readline "<$fh>" was lost from the original listing; restored
# with a lexical filehandle and 3-arg open.
sub thetaupdate
{
my $input = shift;
my $topic = shift;
my $word = shift;
my $numerator = 0;
my $denominator = 0;
my $docid = 0;
open my $fh, '<', $input or die "Couldn't open $input $!";
while (my $line = <$fh>)
{
chomp $line;
my @tokens = split(/\s/,$line);
my $p_zpnm = $prob_zpnm{$docid};
for (my $i = 0; $i < @tokens; $i++)
{
my $p_znm = $p_zpnm->{$i};
if ($tokens[$i] eq $word)
{
$numerator += $p_znm->[$topic];
}
$denominator += $p_znm->[$topic];
}
$docid++;
}
close $fh;
return ($numerator+$alpha)/($denominator+$alpha*$wordsize);
}
# M-step: refresh theta for every (topic, word) pair from the current
# posteriors, then take one SGD sweep over the coordinates.
# Note: thetaupdate re-reads the whole corpus per (topic, word) pair,
# so this is O(Z * V * corpus) per iteration.
sub maximization
{
my $input = shift;
foreach my $z (0 .. $topicsize - 1)
{
foreach my $w (0 .. $wordsize - 1)
{
$theta{$z}->[$w] = thetaupdate($input, $z, $w);
}
}
update($input);
return;
}
# Squared Euclidean distance between document $docid and topic $topic in
# the visualization space. Despite the name, no square root is taken —
# callers (dist) want the squared distance for the Gaussian kernel.
sub euclid
{
my ($topic, $docid) = @_;
my $doc_pos   = $xai{$docid};
my $topic_pos = $phi{$topic};
my $sq = 0;
foreach my $d (0 .. $dimension - 1)
{
my $delta = $doc_pos->[$d] - $topic_pos->[$d];
$sq += $delta * $delta;
}
return $sq;
}
# P(z|x): probability of topic $topic for document $docid — a softmax
# over topics of exp(-||xai - phi||^2 / 2).
sub dist
{
my ($topic, $docid) = @_;
my $total = 0;
foreach my $z (0 .. $topicsize - 1)
{
$total += exp(-1/2 * euclid($z, $docid));
}
my $score = exp(-1/2 * euclid($topic, $docid));
return $score / $total;
}
# P(z|x,w): posterior topic probability for word ID $word in document
# $docid, combining the distance-based P(z|x) with the topic-word
# probability theta via Bayes' rule.
sub posterior
{
my ($docid, $topic, $word) = @_;
my $p_zpx = $prob_zpx{$docid};
my $norm = 0;
foreach my $z (0 .. $topicsize - 1)
{
$norm += $p_zpx->[$z] * $theta{$z}->[$word];
}
my $joint = $p_zpx->[$topic] * $theta{$topic}->[$word];
return $joint / $norm;
}
# E-step: first compute P(z|x) for every document from the current
# coordinates, then P(z|x,w) for every token from P(z|x) and theta.
# NOTE: the readline "<$fh>" was lost from the original listing; restored
# with a lexical filehandle and 3-arg open.
sub expectation
{
my $input = shift;
for (my $i = 0; $i < $docnum; $i++)
{
my @probs;
for (my $j = 0; $j < $topicsize; $j++)
{
my $prob = dist($j,$i);
$probs[$j] = $prob;
}
$prob_zpx{$i} = \@probs;
}
my $docid = 0;
open my $fh, '<', $input or die "Couldn't open $input $!";
while (my $line = <$fh>)
{
chomp $line;
my @tokens = split(/\s/,$line);
my %probs_znm;
for (my $i = 0; $i < @tokens; $i++)
{
my @probs;
for (my $j = 0; $j < $topicsize; $j++)
{
my $p = posterior($docid, $j, $tokens[$i]);
$probs[$j] = $p;
}
$probs_znm{$i} = \@probs;
}
$prob_zpnm{$docid} = \%probs_znm;
$docid++;
}
close $fh;
return;
}
# Register every token in @$ta as a vocabulary entry (value 1) in %$wh.
# Entries that are already defined are left untouched, so after repeated
# calls the hash holds exactly one key per distinct word seen.
sub storeword
{
my ($wh, $ta) = @_;
foreach my $w (@$ta)
{
$wh->{$w} = 1 unless defined $wh->{$w};
}
return;
}
入力ファイルの例は以下の通り。
1行が1文書に相当。
数値は単語ID。
0 1 2 3
0 1 2 3
4 5 6 7
4 5 6 7
使用例
# perl plsv.pl sample.txt
Result
Phi
$VAR1 = '1';
$VAR2 = [
'-0.581827405837318',
'-0.581827405837318'
];
$VAR3 = '0';
$VAR4 = [
'1.50610033709651',
'1.50610033709651'
];
Xai
$VAR1 = '1';
$VAR2 = [
'-0.42461538023816',
'-0.42461538023816'
];
$VAR3 = '3';
$VAR4 = [
'1.29019177243395',
'1.29019177243395'
];
$VAR5 = '0';
$VAR6 = [
'-0.393904728506928',
'-0.393904728506928'
];
$VAR7 = '2';
$VAR8 = [
'1.29484945144959',
'1.29484945144959'
];
Theta
$VAR1 = '1';
$VAR2 = [
'0.248699888255414',
'0.24869988511089',
'0.248699888504977',
'0.248699890098641',
'0.00130761840347527',
'0.00129605716166152',
'0.00129717365378877',
'0.00129959881115254'
];
$VAR3 = '0';
$VAR4 = [
'0.00128080308998399',
'0.00128080623499826',
'0.00128080284038186',
'0.00128080124646881',
'0.248711689079392',
'0.248723252125832',
'0.248722135459428',
'0.248719709923515'
];
0 件のコメント:
コメントを投稿