## ## Spam Recognition System -- fully-functional prototype in perl ## (to be recoded with additional functionality in C or Objective-C) ## ## Based on algorithms described by Paul Graham in: ## http://www.paulgraham.com/spam.html ## ## $Id: lib.pl,v 1.4 2002/08/22 02:53:07 dgc Exp $ ## dgc@uchicago.edu ## use IO::File; use NDBM_File; use DB_File; $NBIAS = 2; # bias toward non-spam $PBIAS = 1; # bias toward spam $RADAR = 1; # min occurrences $SCOPE = 1; # how many words to chain $F_B64 = 1; # enable base-64 filter? $NSEL = 15; # how many words to select when evaluating against ratings $UNSEEN= 0.2; # rating for unknown words $VRCHUNK= 5000; # chunk size for verbose rating #$DBFMT = 'DB_File'; #$DBTYPE = $DB_BTREE; $DBFMT = 'NDBM_File'; $DB = $ENV{SRS} || $ENV{HOME} . "/.srs/srs"; $DELIM = ":"; ($DBDIR = $DB) =~ s!/[^/]+$!!; ($DBNAME = $DB) =~ s!.*/!!; $DB_P = "$DB.P.db"; $DB_N = "$DB.N.db"; $DB_R = "$DB.R.db"; $DB_P = "$DB.P"; $DB_N = "$DB.N"; $DB_R = "$DB.R"; sub newcontext { return { stats => {}, chain => [], nmsgs => 0, }; } sub mapcontext { my $file; my $i; my ($Pcontext, $Ncontext); my (%hp, %hn); $Pcontext = newcontext(); $Ncontext = newcontext(); tie(%hp, $DBFMT, "$DB_P", O_RDWR|O_CREAT, 0600); $Pcontext->{stats} = \%hp; $Pcontext->{nmsgs} = $hp{"${DELIM}nmsgs"}; tie(%hn, $DBFMT, "$DB_N", O_RDWR|O_CREAT, 0600); $Ncontext->{stats} = \%hn; $Ncontext->{nmsgs} = $hn{"${DELIM}nmsgs"}; return ($Pcontext, $Ncontext); } sub unmapcontext { my (@contexts) = @_; my ($context); for $context (@contexts) { $context->{stats}->{"${DELIM}nmsgs"} = $context->{nmsgs}; untie(%{$context->{stats}}); } } sub maprate { my %h; tie(%h, $DBFMT, "$DB_R", O_RDONLY, 0600); return \%h; } sub maprate_rw { my ($file) = @_; my %h; tie(%h, $DBFMT, "$DB_R", O_RDWR|O_CREAT, 0600); return \%h; } sub unmaprate { my ($h) = @_; untie(%{$h}); } sub exammap { my ($m) = @_; map { printf("%0.04f %s\n", $m->{$_}, $_); } sort { $m->{$a} <=> $m->{$b} || $a cmp $b; } keys %{$m}; } sub hashtext { my ($C, $text) = @_; my ($word, $key); my (@chain, @tmp); my (@new); @chain = @{$C->{chain}}; ## move html/sgml/xml markup in mid-word to outside. yikes. $text =~ s/^(.*[^\s])(<[^>]*>)([^\s].*)/\1\3 \2/og; ## make normal forms *look* normal if ($text =~ m!(.*)(https?:/[^\s]+)(.*)!o) { ($tmp = $2) =~ s/[^A-Za-z0-9]+//g; $text = "$1$2$3"; } ## extract tokens while ($text =~ m/^[^-A-Za-z0-9'\$_]*([-A-Za-z0-9'\$_]+)(.*)/o) { $word = $1; $text = $2; ## remove punctuation from tokens $word =~ s/[^A-Za-z0-9]+//og; next if ($word eq ""); print "WORD: $word\n" if ($DEBUG); @chain = splice(@chain, 0, $SCOPE-1); unshift(@chain, $word); print "CHAIN: ", join($DELIM, @chain), "\n" if ($DEBUG); for $i (1 .. $#chain+1) { @tmp = @chain; splice(@tmp, $i); # each key will be 'N:word:[word:[...]]' where N # is the number of words. This will help us in # choosing the best set of keys to worry about. # The $DELIM char is important for substring # matching. $key = join($DELIM, $i, @tmp, undef); push(@new, $key); print "chain $i: $key\n" if ($DEBUG); $C->{stats}->{$key}++; } } $C->{chain} = [@chain]; return @new; } sub merge { my ($into, $from) = @_; map { $into->{stats}->{$_} += $from->{stats}->{$_}; } keys %{$from->{stats}}; $into->{nmsgs} += $from->{nmsgs}; } sub scancorpus { my ($C, $file, $fh, $v) = @_; my $nmsgs = 0; my @new = (); my $hdr = 0; my $ignore = 0; while ($line = $fh->getline) { if ($line =~ /^From /o) { $hdr = 1; ++$nmsgs; printf("Added msg %d from %s\n", $nmsgs, $file) if ($v); } if ($hdr && $line =~ /^\r?\n?$/o) { $hdr = 0; } ## ignore? if ($ignore) { if (!$hdr && $F_B64 && $line =~ /^--/o) { $ignore = 0; next; } } next if ($ignore); # normalizations on headers if ($hdr) { # elide embedded dots, as in IP addrs and hostnames $line =~ s/([^\s])\.([^\s])/\1\2/og; } # on non-hdrs if (!$hdr && $F_B64 && $line =~ /^Content-Transfer-Encoding: base64/oi) { $ignore = 1; next; } push(@new, hashtext($C, lc($line))); } $C->{nmsgs} += $nmsgs; return @new; } sub scanfiles { my ($C, $v, @files) = @_; my $file; my $fh = new IO::File; my $n; my @new = (); for $file (@files) { $n = $C->{nmsgs}; if ($file eq "-") { push(@new, scancorpus($C, "stdin", \*STDIN, $v)); } else { if ($fh->open($file, "r")) { push(@new, scancorpus($C, $file, $fh, $v)); $fh->close; } else { return undef; } } printf ("Added %d msgs from %s\n", $C->{nmsgs}-$n, $file) if ($v); } return @new; } ## normalize sub normalize { my ($n) = @_; $n = int($n * 1000); $n = 999 if ($n > 999); $n = 001 if ($n < 001); return $n / 1000; } ## compute chance that a msg containing word is spam sub rateword { my ($P, $N, $w) = @_; my ($n_neg, $n_pos, $freq_neg, $freq_pos, $chance); $n_pos = $P->{stats}->{$w} * $PBIAS; # occurrences in spam $n_neg = $N->{stats}->{$w} * $NBIAS; # occurrences in nonspam # disregard words that occur few times (under the radar) if ($n_neg + $n_pos < $RADAR) { return 0; } # frequencies of word in spam, nonspam $freq_pos = $n_pos / $P->{nmsgs}; if ($freq_pos > 1) { $freq_pos = 1; } $freq_neg = $n_neg / $N->{nmsgs}; if ($freq_neg > 1) { $freq_neg = 1; } # chance that msg containing word is spam $chance = ($freq_pos / ($freq_neg + $freq_pos)); # normalize return normalize($chance); } sub rate { my ($R, $P, $N, @list) = @_; my $n = 0; map { $R->{$_} = &rateword($P, $N, $_) unless ($R->{$_}); printf ("Rated %d of %d...\n", $n, $#list) unless (++$n % $VRCHUNK); } @list; @tmp = keys %{$R}; printf ("Rated %d among %d.\n", $n, $#tmp); return $R; } ## find the "interestingness" of a word sub deviance { my ($R, $word) = @_; if ($R->{$word} > 0.5) { return $R->{$word} - 0.5; } else { return 0.5 - $R->{$word}; } } ## return substrings ## !!! needs work sub substrings { my ($depth, @list) = @_; my $i; my @out = @list; return @out; } ## pick the most interesting words sub selectwords { my ($C, $R) = @_; my $i = $NSEL; my @order = (); my $ratings = {}; my $Rprime = {}; ## Fill in ratings for unseen words. ## ## Remove string/substring entries from ordered list. ## For high ratings, favor conjugate strings. ## For low ratings, favor simple strings. map { $Rprime->{$_} = $R->{$_} || $UNSEEN; # if ($SCOPE > 1) { # if ($Rprime->{$_} > 0.500) { # ## highly-rated word; throw out its components #$x = $_; # map { #print "Deleting $_ because $x = ", $Rprime->{$x}, "\n"; # delete $C->{stats}->{$_}; # } substrings(split($DELIM, $_)); # } else { # ## low-rated; throw out anything made of it #$x = $_; # map { #print "Deleting $_ because $x = ", $Rprime->{$x}, "\n"; # delete $C->{stats}->{$_}; # } grep (/^[2-9].*:$_:/, keys %{$C->{stats}}); # } # } } keys %{$C->{stats}}; ## Sort by reverse rating @order = sort { &deviance($Rprime, $b) <=> deviance($Rprime, $a); } keys %{$C->{stats}}; ## Select the $NSEL most deviant words. map { $ratings->{$_} = $Rprime->{$_} } splice(@order, 0, $NSEL); return $ratings; } ## Compute conditional prob sub rateselection { my ($R, $P, $N, @sel) = @_; my ($r, $p, $n); my @k; my $w; $r = 0.5; @k = keys %{$P->{stats}}; $p = $#k; @k = keys %{$N->{stats}}; $n = $#k; $p1 = 1; map { $p1 *= $R->{$_}; } @sel; $p2 = 1; map { $p2 *= 1 - $R->{$_}; } @sel; return normalize($p1 / ($p1 + $p2)); } sub lockdb { } sub unlockdb { } sub zerodb { my (%empty); lockdb(); unlink("$DB_P"); unlink("$DB_P.dir"); unlink("$DB_P.pag"); unlink("$DB_N"); unlink("$DB_N.dir"); unlink("$DB_N.pag"); unlink "$DB_R"; unlink "$DB_R.dir"; unlink "$DB_R.pag"; tie(%empty, $DBFMT, "$DB_P", O_RDWR|O_CREAT, 0600); $empty{"${DELIM}nmsgs"} = 1; untie(%empty); tie(%empty, $DBFMT, "$DB_N", O_RDWR|O_CREAT, 0600); $empty{"${DELIM}nmsgs"} = 1; untie(%empty); $R = maprate_rw(); unmaprate($R); unlockdb(); } 1; __DATA__ g = good{word} || 0; g *= (bias = 2) b = bad{word} || 0; if (g + b >= (below_radar = 5)) { freq_bad = b / nbad freq_bad = min(1, freq_bad) freq_good = g / ngood freq_good = min(1, freq_good) chance = freq_bad / (freq_good + freq_bad) chance = 0.999 if (chance > 0.999); chance = 0.001 if (chance < 0.001); chance = int(chance * 1000) / 1000; } else { return 0; } (let ((g (* 2 (or (gethash word good) 0))) (b (or (gethash word bad) 0))) (unless (< (+ g b) 5) (max .01 (min .99 (float (/ (min 1 (/ b nbad)) (+ (min 1 (/ g ngood)) (min 1 (/ b nbad))))))))) (let ((prod (apply #'* probs))) (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) probs))))) P(B|A) P(A) P(A|B) = ----------- P(B) conditional * prior posterior = ------------------- likelihood M = msg is spam W = word indicates spam P(W|M) P(M) P(M|W) = ------------- P(W) M msg is spam W msg contains word M|W msg is spam, given that it contains word? W|M msg contains word, given that it is spam r a spam msg contains word I know P(M|W). I want P(M|W1,W2,W3).