#! /usr/bin/perl -w
use strict;
use Statistics::Distributions qw(uprob udistr);
use Carp qw(confess);
print <) {
chomp($line);
my @row = split /\s+/, $line;
if (2 != @row) {
print "WARNING: wrong number of columns at line $.\n\n";
next;
}
if ($row[0] eq 'A' or $row[0] eq 'a') {
push @a, $row[1];
}
elsif ($row[0] eq 'B' or $row[0] eq 'b') {
push @b, $row[1];
}
else {
print "WARNING: one line $. '$row[0]' should have been A or B\n";
}
}
if (@a < 3) {
die "Insufficient data found for version A\n";
}
elsif (@b < 3) {
die "Insufficient data found for version B\n";
}
my $var_a = variance(@a);
if (0 == $var_a) {
die "No variation found within version A\n";
}
my $var_b = variance(@b);
if (0 == $var_b) {
die "No variation found within version B\n";
}
# Sanity checks
my @a_bottom = sort {$a <=> $b} @a;
pop @a_bottom;
my $var_a_bottom = variance(@a_bottom);
my $a_top_variance_weight = 1 - $var_a_bottom/$var_a;
if ($a_top_variance_weight > 0.01) {
print "WARNING: The estimated variance of A may be off\n";
printf " (a_weight = %.1f%% > 1%%)\n\n", $a_top_variance_weight * 100;
}
my @b_bottom = sort {$a <=> $b} @b;
pop @b_bottom;
my $var_b_bottom = variance(@b_bottom);
my $b_top_variance_weight = 1 - $var_b_bottom/$var_b;
if ($b_top_variance_weight > 0.01) {
print "WARNING: The estimated variance of B may be off\n";
printf " (b_weight = %.1f%% > 1%%)\n\n", $b_top_variance_weight * 100;
}
my $m_a = mean(@a);
printf "Mean of A: %.3f\n", $m_a;
my $m_b = mean(@b);
printf "Mean of B: %.3f\n", $m_b;
my $diff = $m_a - $m_b;
printf "A is %.3f better than B\n", $diff;
# Note, @a in this calculation becomes the count of things in @a.
# Likewise for @b.
my $m_var = $var_a/@a + $var_b/@b;
my $p = 2 * uprob( abs($diff) / sqrt($m_var) );
printf "p = %.3f\n", $p;
for my $c (qw(80 90 95 99)) {
my $p = 1 - $c/100;
# We're calculating the upper bound of a $c% confidence interval for
# a normal distribution with mean 0 and variance $m_var.
#
# The 2 is because it is 2-tailed. (Half the left out stuff is in
# one tail, half in the other.)
#
my $b = udistr( $p/2 ) * sqrt($m_var);
printf "$c%% Confidence for A - B: (%.3f, %.3f)\n", $diff - $b, $diff + $b;
}
# Calculates the arithmetic average of a list of numbers.
sub mean {
if (not @_) {
confess("No data found, cannot take mean");
}
return sum(@_)/@_;
}
# Calculates the sum of a list of numbers.
sub sum {
my $s = shift;
$s += $_ for @_;
return $s;
}
# Estimates the variance of a list of numbers.
sub variance {
if (@_ < 2) {
confess("Not enough data to calculate a variance");
}
my $m = mean(@_);
return sum( map {($_ - $m)**2} @_ ) / $#_;
}