#! /usr/bin/perl -w
use strict;
use Statistics::Distributions qw(uprob udistr);
use Carp qw(confess);

print <<EOT;
This estimates the difference in the averages of two sets of data using the
z-test.  It expects to get a tab-delimited dataset (actually any whitespace
delimiter will do) with 2 columns.  The first column contain A or B depending
on which data set it is for, and the second column should have non-negative
numbers.

Any rows that do not fit this format will generate an error.

It will perform the calculation when you hit end of file.  If you are
entering data by hand, you can use control-d to indicate end of file.

Please enter data now.
EOT

my @a;
my @b;

while (my $line = <>) {
  chomp($line);
  my @row = split /\s+/, $line;
  if (2 != @row) {
    print "WARNING: wrong number of columns at line $.\n\n";
    next;
  }

  if ($row[0] eq 'A' or $row[0] eq 'a') {
    push @a, $row[1];
  }
  elsif ($row[0] eq 'B' or $row[0] eq 'b') {
    push @b, $row[1];
  }
  else {
    print "WARNING: one line $. '$row[0]' should have been A or B\n";
  }
}

if (@a < 3) {
  die "Insufficient data found for version A\n";
}
elsif (@b < 3) {
  die "Insufficient data found for version B\n";
}

my $var_a = variance(@a);
if (0 == $var_a) {
  die "No variation found within version A\n";
}

my $var_b = variance(@b);
if (0 == $var_b) {
  die "No variation found within version B\n";
}

# Sanity checks
my @a_bottom = sort {$a <=> $b} @a;
pop @a_bottom;
my $var_a_bottom = variance(@a_bottom);

my $a_top_variance_weight = 1 - $var_a_bottom/$var_a;
if ($a_top_variance_weight > 0.01) {
  print "WARNING: The estimated variance of A may be off\n";
  printf "  (a_weight = %.1f%% > 1%%)\n\n", $a_top_variance_weight * 100;
}

my @b_bottom = sort {$a <=> $b} @b;
pop @b_bottom;
my $var_b_bottom = variance(@b_bottom);

my $b_top_variance_weight = 1 - $var_b_bottom/$var_b;
if ($b_top_variance_weight > 0.01) {
  print "WARNING: The estimated variance of B may be off\n";
  printf "  (b_weight = %.1f%% > 1%%)\n\n", $b_top_variance_weight * 100;
}

my $m_a = mean(@a);
printf "Mean of A: %.3f\n", $m_a;

my $m_b = mean(@b);
printf "Mean of B: %.3f\n", $m_b;

my $diff = $m_a - $m_b;
printf "A is %.3f better than B\n", $diff;

# Note, @a in this calculation becomes the count of things in @a.
# Likewise for @b.
my $m_var = $var_a/@a + $var_b/@b;

my $p = 2 * uprob( abs($diff) / sqrt($m_var) );

printf "p = %.3f\n", $p;

for my $c (qw(80 90 95 99)) {
  my $p = 1 - $c/100;
  # We're calculating the upper bound of a $c% confidence interval for
  # a normal distribution with mean 0 and variance $m_var.
  #
  # The 2 is because it is 2-tailed.  (Half the left out stuff is in
  # one tail, half in the other.)
  #
  my $b = udistr( $p/2 ) * sqrt($m_var);
  printf "$c%% Confidence for A - B: (%.3f, %.3f)\n", $diff - $b, $diff + $b;
}

# Calculates the arithmetic average of a list of numbers.
sub mean {
  if (not @_) {
    confess("No data found, cannot take mean");
  }
  return sum(@_)/@_;
}

# Calculates the sum of a list of numbers.
sub sum {
  my $s = shift;
  $s += $_ for @_;
  return $s;
}

# Estimates the variance of a list of numbers.
sub variance {
  if (@_ < 2) {
    confess("Not enough data to calculate a variance");
  }
  my $m = mean(@_);
  return sum( map {($_ - $m)**2} @_ ) / $#_;
}
