#! /usr/bin/perl # Read a mail folder and compute statistics of # the classification of messages in it by # annoyance-filter. If a second file name is # supplied after the required mail folder name, # a CSV log will be written with a record for # each message classified containing the following # fields: # # 0 Classification (Mail, Junk, or Indeterminate) # 1 Junk probability # 2 From header line # 3 To header line # 4 Subject header line # 5 Date header line # # The header line fields are all quoted, with embedded # quotes doubled to force them. Control characters # in header fields which might confuse CSV parsing # are deleted. # by John Walker http://www.fourmilab.ch/ # September 2002 $acmd = "./annoyance-filter --read dict.bin --trans - --test - |" . "egrep '^X-Annoyance-Filter' | head -2 >score.tmp"; if ($#ARGV < 0) { print("Usage: perl testfolder.pl mail_folder_file [CSV _log_file]\n"); exit(2); } $totalMail = 0; $totalJunk = 0; $totalMessages = 0; $sumProbability = 0; $writeLog = 0; $inf = $ARGV[0]; if ($inf =~ m/\.gz$/) { open(IN, "zcat $inf |") || die "Cannot open gzipped input file $inf"; } else { open(IN, "<$inf") || die "Cannot open input file $inf"; } if ($#ARGV > 0) { open(CSV, ">$ARGV[1]") || die "Cannot create CSV log file $ARGV[1]"; $writeLog = 1; } while (($l = )) { if ($l =~ m/^From /) { last; } } if (!$l) { print("No messages in mail folder!\n"); exit(0); } $eof = 0; $pattern = ''; # Initialise search pattern while (!$eof) { # Read next message from mail folder. At # this point $l contains the first ("From ") # line of the message. $nlines = 0; undef @message; $msize = length($l); &trim_end_of_line; $message[$nlines++] = $l; # Read the balance of the message into the # @message array. Quit when the "From " line # of the next message is encountered or the # end of the folder is encountered. while ($l = ) { if ($l =~ m/^From /) { last; } else { &trim_end_of_line; $message[$nlines++] = $l; $msize += length($l); } } if ($nlines > 0) { &dispose_of_message; } $eof = !($l); # if ($bail++ > 20) { $eof = 1; } # Quick bail-out for testing } if ($writeLog) { close(CSV); } # Compute and display aggregate statistics print("Total messages: $totalMessages\n"); print(" Mail: $totalMail\n"); print(" Junk: $totalJunk\n"); printf("Mean junk probability: %.4f\n", $sumProbability / $totalMessages); # Dispose of the message in the @message # array. sub dispose_of_message { local($i, $from, $to, $subject, $disp, $afrom, $act, $score, $class); # Parse message header for "interesting" items for ($i = 0; $i < $nlines; $i++) { if ((length($message[$i]) == 0) || ($message[$i] =~ m/^\s*$/)) { #print("Bail on line $i\n"); last; } if ($message[$i] =~ m/^From\s/) { if (!($message[$i] =~ m/^From\s+-\s+/)) { $from = $message[$i]; } else { $afrom = $message[$i]; } } elsif ((!defined($from)) && ($message[$i] =~ m/^From:\s/)) { $from = $message[$i]; } elsif ($message[$i] =~ m/^To:\s/) { $to = $message[$i]; } elsif ($message[$i] =~ m/^Subject:\s/) { $subject = $message[$i]; } elsif ($message[$i] =~ m/^Date:\s/) { $date = $message[$i]; } } if (!defined($from)) { $from = $afrom; } # Open a pipe to annoyance-filter and feed the message # to it. Since we cannot read the output from # the run via a pipe, it's redirected to a temporary # file from which we snarf it after closing the pipe. open(OF, "|$acmd") || die "Cannot create pipeline $acmd"; for ($i = 0; $i < $nlines; $i++) { print(OF "$message[$i]\n"); } close(OF); $score = `grep Probability score.tmp`; $score =~ s/(^\S+: )(\S+)//; $score = $2; $class = `grep Classification score.tmp`; $class =~ s/(^\S+: )(\S+)//; $class = $2; system("rm -f score.tmp"); if ($writeLog) { print(CSV "$class,$score," . &csvQuote($from) . "," . &csvQuote($to) . "," . &csvQuote($subject) . "," . &csvQuote($date) . "\n"); } print("$subject\n"); print("Score: $score $class\n"); $totalMessages++; $sumProbability += $score; if ($class eq 'Mail') { $totalMail++; } else { $totalJunk++; } } # Trim end of line sequences from line sub trim_end_of_line { $l =~ s/(.*)([\r\n]+)$/\1/; } # Quote string for output in CSV sub csvQuote { local($s) = @_[0]; $s =~ s/"/""/g; $s =~ tr/\t\r\n//d; return "\"$s\""; }