#!/opt/bin/perl
##
## Spam Recognition System -- fully-functional prototype in perl
## (to be recoded with additional functionality in C or Objective-C)
##
## Based on algorithms described by Paul Graham in:
## http://www.paulgraham.com/spam.html
##
## $Id: check.pl,v 1.4 2002/08/22 02:53:07 dgc Exp $
## dgc@uchicago.edu
##
## check.pl -- rate each file named on the command line (or stdin) and
## print "<name>: <rating>".  With -v, also print the tokens the rating
## was based on and a summary of the knowledge base.
##

use strict;
use warnings;

## Package globals (not lexicals) so that code loaded from lib.pl can
## still see them by name.
## NOTE(review): whether lib.pl actually reads $pdir / $DEBUG is not
## visible from this file -- confirm before turning them into lexicals.
our ($pdir, $DEBUG);

## Directory holding this script, so lib.pl can be found next to it.
## When $0 has no directory part (e.g. "perl check.pl") the substitution
## does not match; fall back to "." instead of leaving the script name
## itself in @INC.
($pdir = $0) =~ s!/[^/]+$!! or $pdir = '.';
push(@INC, $pdir);

## Load the shared library.  A failure is reported but deliberately not
## fatal here, matching the best-effort behaviour of the original
## string eval; a missing function will still abort in main().
eval { require "lib.pl"; 1 }
    or warn "$0: problem loading lib.pl: $@";

$DEBUG = 0;

## usage() -- print a one-line usage summary on STDERR.
sub usage {
    print STDERR "usage: $0 [-v] file [...]\n";
}

## main() -- program entry point.
##
## Takes no arguments; reads @ARGV:
##   -v      (first argument only) verbose output
##   file... files to rate; "-" (the default when no file is given) is
##           reported as "stdin"
##
## For every file it prints "<name>: <rating>" on STDOUT.  Files that
## scanfiles() rejects are reported on STDERR and skipped.
##
## All helper functions called here (lockdb, mapcontext, maprate,
## newcontext, scanfiles, selectwords, rateselection, unmapcontext,
## unmaprate, unlockdb) come from lib.pl; their exact semantics are not
## visible in this file.
sub main {
    my $verbose = 0;

    ## Indexed by (value <=> 0.5) + 1: below 0.5, exactly 0.5, above 0.5.
    my @indicators = qw(-- == ++);

    lockdb();
    my ($P, $N) = mapcontext();
    my $R = maprate();

    ## Guard on @ARGV so an empty argument list is not compared.
    if (@ARGV && $ARGV[0] eq "-v") {
        $verbose = 1;
        shift @ARGV;
    }

    @ARGV = qw(-) unless @ARGV;

    for my $file (@ARGV) {
        my $C = newcontext();
        unless (scanfiles($C, 0, $file)) {
            ## Diagnostics go to STDERR so they do not mix with ratings.
            print STDERR "$0: can't open \"$file\": $!\n";
            next;
        }

        my $Rprime = selectwords($C, $R);
        my @sel    = keys %{$Rprime};
        my $rating = rateselection($Rprime, $P, $N, @sel);

        my $pretty = ($file eq "-") ? "stdin" : $file;
        print "$pretty: $rating\n";

        next unless $verbose;

        #print "\tBased upon:\n";

        ## Highest value first, then highest count in this input, then
        ## alphabetical by token.
        my @ordered = sort {
            $Rprime->{$b} <=> $Rprime->{$a}
                || $C->{stats}->{$b} <=> $C->{stats}->{$a}
                || $a cmp $b;
        } @sel;

        for my $token (@ordered) {
            printf("\t%s %3d x %0.3f %s\n",
                $indicators[($Rprime->{$token} <=> 0.5) + 1],
                $C->{stats}->{$token},
                $Rprime->{$token},
                $token);
        }
        print "\n";

        my $n = scalar keys %{$R};

        ## NOTE(review): the here-document opener and the "Positive
        ## records" line were reconstructed from a damaged copy of this
        ## file, by analogy with the "Negative records" line; the exact
        ## original line breaks inside the text are a best guess.
        print <<END_TEXT;
Positive records: $P->{nmsgs}
Negative records: $N->{nmsgs}
Tokens indexed: $n

* "++" indicates tokens which suggest unsolicited bulk e-mail content.
* "--" indicates tokens which suggest ordinary mail.
* A rating of 0.500 indicates that a token counts equally for and
  against a message. It suggests that you need more data in your
  knowledge base.

END_TEXT
    }

    unmapcontext($P, $N);
    unmaprate($R);
    unlockdb();
}

main();