## ## Spam Recognition System -- fully-functional prototype in perl ## (to be recoded with additional functionality in C or Objective-C) ## ## Based on algorithms described by Paul Graham in: ## http://www.paulgraham.com/spam.html ## ## $Id: lib.pl,v 1.3 2002/08/19 20:47:23 dgc Exp $ ## dgc@uchicago.edu ## use IO::File; use DB_File; $NBIAS = 2; # bias toward non-spam $PBIAS = 1; # bias toward spam $RADAR = 1; # min occurrences $SCOPE = 1; # how many words to chain $BASE64= 0; # hash lines which appear to be base-64? $NSEL = 15; # how many words to select when evaluating against ratings $UNSEEN= 0.2; # rating for unknown words $DBDIR = $ENV{HOME} . "/.srs"; $DBNAME= "srs"; $DELIM = ":"; $wordchars = "A-Za-z0-9'\$-"; sub newcontext { return { stats => {}, chain => [], breadth => $SCOPE, nmsgs => 0, ignoring=> 0, }; } sub mapcontext { my $file; my $i; my ($Pcontext, $Ncontext); my (%hp, %hn); $Pcontext = newcontext(); $Ncontext = newcontext(); tie(%hp, 'DB_File', "$DBDIR/$DBNAME.P.db", O_RDWR|O_CREAT, 0600, $DB_HASH); $Pcontext->{stats} = \%hp; $Pcontext->{nmsgs} = $hp{"${DELIM}nmsgs"}; tie(%hn, 'DB_File', "$DBDIR/$DBNAME.N.db", O_RDWR|O_CREAT, 0600, $DB_HASH); $Ncontext->{stats} = \%hn; $Ncontext->{nmsgs} = $hn{"${DELIM}nmsgs"}; return ($Pcontext, $Ncontext); } sub unmapcontext { my (@contexts) = @_; my ($context); for $context (@contexts) { $context->{stats}->{"${DELIM}nmsgs"} = $context->{nmsgs}; untie(%{$context->{stats}}); } } sub maprate { my %h; tie(%h, 'DB_File', "$DBDIR/$DBNAME.R.db", O_RDONLY, 0600, $DB_HASH); return \%h; } sub maprate_rw { my ($file) = @_; my %h; unlink "$DBDIR/$DBNAME.R.db"; tie(%h, 'DB_File', "$DBDIR/$DBNAME.R.db", O_RDWR|O_CREAT, 0600, $DB_HASH); return \%h; } sub unmaprate { my ($h) = @_; untie(%{$h}); } sub exammap { my ($m) = @_; map { printf("%0.04f %s\n", $m->{$_}, $_); } sort { $m->{$a} <=> $m->{$b} || $a cmp $b; } keys %{$m}; } sub hashtext { my ($C, $text) = @_; my ($word, $key); my (@chain, @tmp); @chain = @{$C->{chain}}; #while ($text =~ m/^[^$wordchars]*([$wordchars]+)(.*)/o) { while ($text =~ m/^[^A-Za-z0-9'\$-]*([A-Za-z0-9'\$-]+)(.*)/o) { if ($BASE64 && /^Content-Transfer-Encoding: base64/i) { $C->{ignoring} = 1; next; } if ($C->{ignoring}) { if (/^--/) { $C->{ignoring} = 0; } else { next; } } $word = $1; $text = $2; print "WORD: $word\n" if ($DEBUG); @chain = splice(@chain, 0, $C->{breadth}-1); unshift(@chain, $word); print "CHAIN: ", join($DELIM, @chain), "\n" if ($DEBUG); for $i (1 .. $#chain+1) { @tmp = @chain; splice(@tmp, $i); $key = join($DELIM, @tmp); print "chain $i: $key\n" if ($DEBUG); if (!exists($C->{stats})) { $C->{stats}->{$key} = 1; } else { $C->{stats}->{$key}++; } } } $C->{chain} = [@chain]; } sub scancorpus { my ($C, $fh) = @_; my $nmsgs = 0; while ($line = $fh->getline) { if ($line =~ /^From /) { ++$nmsgs; } $line = lc($line); &hashtext($C, $line); } $C->{nmsgs} += $nmsgs; } sub scanfiles { my ($C, @files) = @_; my $file; my $fh = new IO::File; for $file (@files) { if ($file eq "-") { &scancorpus($C, \*STDIN); } else { if ($fh->open($file, "r")) { &scancorpus($C, $fh); $fh->close; } else { return undef; } } } return 1; } ## normalize sub normalize { my ($n) = @_; $n = int($n * 1000); $n = 999 if ($n > 999); $n = 001 if ($n < 001); return $n / 1000; } ## compute chance that a msg containing word is spam sub rateword { my ($P, $N, $w) = @_; my ($n_neg, $n_pos, $freq_neg, $freq_pos, $chance); $n_pos = $P->{stats}->{$w} * $PBIAS; # occurrences in spam $n_neg = $N->{stats}->{$w} * $NBIAS; # occurrences in nonspam # disregard words that occur few times (under the radar) if ($n_neg + $n_pos < $RADAR) { return 0; } # frequencies of word in spam, nonspam $freq_pos = $n_pos / $P->{nmsgs}; if ($freq_pos > 1) { $freq_pos = 1; } $freq_neg = $n_neg / $N->{nmsgs}; if ($freq_neg > 1) { $freq_neg = 1; } # chance that msg containing word is spam $chance = ($freq_pos / ($freq_neg + $freq_pos)); # normalize return normalize($chance); } sub rate { my ($R, $P, $N) = @_; map { $R->{$_} = &rateword($P, $N, $_) unless ($R->{$_}); } (keys %{$P->{stats}}, keys %{$N->{stats}}); return $R; } ## find the "interestingness" of a word sub deviance { my ($R, $word) = @_; if ($R->{$word} > 0.5) { return $R->{$word} - 0.5; } else { return 0.5 - $R->{$word}; } } ## pick the most interesting words sub selectwords { my ($C, $R) = @_; my $i = $NSEL; my @order = (); my $ratings = {}; my $Rprime = {}; map { $Rprime->{$_} = $R->{$_} || $UNSEEN; } keys %{$C->{stats}}; @order = sort { &deviance($Rprime, $b) <=> deviance($Rprime, $a); } keys %{$C->{stats}}; map { $ratings->{$_} = $Rprime->{$_} } splice(@order, 0, $NSEL); return $ratings; } ## Compute conditional prob sub rateselection { my ($R, $P, $N, @sel) = @_; my ($r, $p, $n); my @k; my $w; $r = 0.5; @k = keys %{$P->{stats}}; $p = $#k; @k = keys %{$N->{stats}}; $n = $#k; $p1 = 1; map { $p1 *= $R->{$_}; } @sel; $p2 = 1; map { $p2 *= 1 - $R->{$_}; } @sel; return normalize($p1 / ($p1 + $p2)); } sub lockdb { } sub unlockdb { } sub zerodb { my (%empty); lockdb(); unlink("$DBDIR/$DBNAME.P.db"); unlink("$DBDIR/$DBNAME.N.db"); tie(%empty, 'DB_File', "$DBDIR/$DBNAME.P.db", O_RDWR|O_CREAT, 0600, $DB_HASH); $empty{"${DELIM}nmsgs"} = 1; untie(%empty); tie(%empty, 'DB_File', "$DBDIR/$DBNAME.N.db", O_RDWR|O_CREAT, 0600, $DB_HASH); $empty{"${DELIM}nmsgs"} = 1; untie(%empty); $R = maprate_rw(); unmaprate($R); unlockdb(); } 1; __DATA__ g = good{word} || 0; g *= (bias = 2) b = bad{word} || 0; if (g + b >= (below_radar = 5)) { freq_bad = b / nbad freq_bad = min(1, freq_bad) freq_good = g / ngood freq_good = min(1, freq_good) chance = freq_bad / (freq_good + freq_bad) chance = 0.999 if (chance > 0.999); chance = 0.001 if (chance < 0.001); chance = int(chance * 1000) / 1000; } else { return 0; } (let ((g (* 2 (or (gethash word good) 0))) (b (or (gethash word bad) 0))) (unless (< (+ g b) 5) (max .01 (min .99 (float (/ (min 1 (/ b nbad)) (+ (min 1 (/ g ngood)) (min 1 (/ b nbad))))))))) (let ((prod (apply #'* probs))) (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) probs))))) P(B|A) P(A) P(A|B) = ----------- P(B) conditional * prior posterior = ------------------- likelihood M = msg is spam W = word indicates spam P(W|M) P(M) P(M|W) = ------------- P(W) M msg is spam W msg contains word M|W msg is spam, given that it contains word? W|M msg contains word, given that it is spam r a spam msg contains word I know P(M|W). I want P(M|W1,W2,W3).