#!/usr/bin/perl
#
# so-display spamfile hamfile
# combineddatasource | so-display 
#
# Compute "S/O ratios" for data. S/O stands for Spam/Overall, and denotes the
# probability that a hit for that datum is spam (in the Bayesian style).
#
# In the second form, the combined data source (read from standard input) should
# contain lines in the format "X data", where "X" is either "h" or "s" for ham or
# spam, and "data" is what will be collated and reported.
#
# In the first form, "spamfile" and "hamfile" contain data entries, one per line.
# 
# Feb 11 2003 jm

use strict;
use warnings;

# Substituted for any zero denominator so the divisions below stay defined
# when one class (or both) contributed no entries.
my $EPSILON = 0.000001;

# count_entries($path, \%counts, \%seen)
#
# Reads $path one entry per line (trailing newline removed) and increments
# both $counts->{entry} and $seen->{entry} for every line.  A path of "-"
# reads from STDIN.  Dies with the system error if the file cannot be opened,
# so a mistyped file name cannot silently produce an empty data set.
sub count_entries {
  my ($path, $counts, $seen) = @_;

  my $in;
  if ($path eq '-') {
    $in = \*STDIN;
  } else {
    open($in, '<', $path)
      or die "so-display: cannot open $path: $!\n";
  }

  while (my $entry = <$in>) {
    chomp $entry;
    $seen->{$entry}++;
    $counts->{$entry}++;
  }

  close $in unless $path eq '-';
  return;
}

my $spamdata = shift @ARGV;
my $hamdata  = shift @ARGV;

# With no file arguments at all, tagged lines are read via <> (which falls
# back to STDIN because @ARGV is empty).
my $combined = defined $spamdata ? 0 : 1;

if (!$combined && !defined $hamdata) {
  die "usage: so-display spamfile hamfile\n"
    . "       combineddatasource | so-display\n";
}

my %spam  = ();   # datum => number of spam occurrences
my %ham   = ();   # datum => number of ham occurrences
my %found = ();   # datum => total occurrences; its keys are every datum seen

if ($combined) {
  while (my $line = <>) {
    chomp $line;

    # Strip the leading "X " tag.  The substitution result must be checked:
    # if it fails, $1 does not describe this line and the tag has not been
    # removed from the data, so the line cannot be classified.
    unless ($line =~ s/^(\S+)\s+//) {
      warn "so-display: ignoring malformed line " . $. . ": $line\n";
      next;
    }
    my $tag = $1;

    # Only "s" marks spam; every other tag is counted as ham.
    if ($tag eq 's') { $spam{$line}++; } else { $ham{$line}++; }
    $found{$line}++;
  }

} else {
  count_entries($spamdata, \%spam, \%found);
  count_entries($hamdata,  \%ham,  \%found);
}

# Give every datum an explicit count in both classes and total each class.
my $stot = 0;
my $htot = 0;
foreach my $id (keys %found) {
  $ham{$id}  ||= 0;
  $spam{$id} ||= 0;
  $htot += $ham{$id};
  $stot += $spam{$id};
}
$htot ||= $EPSILON;
$stot ||= $EPSILON;

# S/O ratio: the datum's share of all spam entries divided by the sum of its
# spam share and its ham share.
my %so = ();
foreach my $id (keys %found) {
  my $ham_share  = $ham{$id} / $htot;
  my $spam_share = $spam{$id} / $stot;
  my $both       = ($ham_share + $spam_share) || $EPSILON;
  $so{$id} = $spam_share / $both;
}

# Report in ascending ratio order; ties go to the lower spam count, then the
# higher ham count, and finally the datum text so the output order is stable
# from run to run.
printf("%6s  %6s  %6s   %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
foreach my $id (sort {
		     $so{$a} <=> $so{$b}
		  || $spam{$a} <=> $spam{$b}
		  || $ham{$b} <=> $ham{$a}
		  || $a cmp $b
		} keys %so)
{
  printf("%6.3f  %6.3f  %6.3f   %s\n",
	$so{$id}, ($spam{$id}*100) / $stot, ($ham{$id}*100) / $htot, $id);
}

exit;
