#! /usr/bin/perl -w
use strict;
use Statistics::Distributions;
use Getopt::Long;

my $step = 0;

my $count_by_a_yes_by_b_yes = {};
my $next_count_by_a_yes_by_b_yes;
my $correct = 0;
my $incorrect = 0;

my $outstanding = 100;
my $stop_at_portion_outstanding = 0.1;
my $p_a_yes = 0.5;
my $p_b_yes = 0.51;
my $threshold_prob = 0.01;

# You can set all of the parameters on the command line.
# For the talk, I set total to 100000, and a_success,
# b_success and cutoff to whatever was appropriate for
# the simulation.
#
# It outputs a header, then the simulation results.  The
# fields in those results are:
#
#   o Steps:  The number of people in the test is actually
#     double this because we add one person to A and one
#     person to B at each step.
#   o Outstanding:  How many tests are still running.  Because
#     of the way I avoid tracking individual tests as long as I
#     can, you can have a fractional number outstanding.
#   o Correct:  How many tests finished with B better than A.
#   o Incorrect:  How many tests finished with A better than B.
#
# Be warned, some of the simulations took me over a day to run.
#
GetOptions(
  "a_success=f" => \$p_a_yes,
  "b_success=f" => \$p_b_yes,
  "cutoff=f" => \$threshold_prob,
  "total=i" => \$outstanding,
  "end_at=f" => \$stop_at_portion_outstanding,
) or die "Cannot process options";

my $threshold_g_test
  = Statistics::Distributions::chisqrdistr(1, $threshold_prob);
$count_by_a_yes_by_b_yes->{0}{0} = $outstanding;
my $stop_at_x_outstanding = $outstanding * $stop_at_portion_outstanding;
my $last_outstanding = 0;

#print "threshold: $threshold_g_test\n";

print "trials: $outstanding\n";
printf "prob(a): %.02f\n", $p_a_yes;
printf "prob(b): %.02f\n", $p_b_yes;
printf "threshold: %.03f\n", $threshold_prob;
print "  Step  Outstanding  Correct  Incorrect\n";

while ($outstanding > $stop_at_x_outstanding) {

  $next_count_by_a_yes_by_b_yes = {};

  # Distribute them to the next.
  for my $b_yes (keys %$count_by_a_yes_by_b_yes) {
    my $count_by_a_yes = $count_by_a_yes_by_b_yes->{$b_yes};
    my @a_yeses = keys %$count_by_a_yes;
    for my $a_yes (keys %$count_by_a_yes) {
      my $count = $count_by_a_yes->{$a_yes};
      my $a_success = $count * $p_a_yes;
      my $a_fail = $count - $a_success;

      my $both_success = $a_success * $p_b_yes;
      my $a_only_success = $a_success - $both_success;
      my $b_only_success = $a_fail * $p_b_yes;
      my $no_success = $a_fail - $b_only_success;

      my $both_n = int($both_success);
      my $both_frac = $both_success - $both_n;

      my $a_only_n = int($a_only_success);
      my $a_only_frac = $a_only_success - $a_only_n;

      my $b_only_n = int($b_only_success);
      my $b_only_frac = $b_only_success - $b_only_n;

      my $no_n = int($no_success);
      my $no_frac = $no_success - $no_n;

      my $to_distribute = $both_frac + $a_only_frac + $b_only_frac + $no_frac;
      # We really need no roundoff errors here.
      $to_distribute = int($to_distribute + 0.5);

#      print ("$both_success: $a_only_success: $b_only_success; $no_success: $to_distribute\n");
#      print ("$both_n: $a_only_n; $b_only_n: $no_n\n");
#      print ("$both_frac: $a_only_frac; $b_only_frac: $no_frac\n");
#      <STDIN>;

      # Distribute each one randomly and fairly.
      for (1..$to_distribute) {
        my $choice = rand(1);

	$choice -= $both_frac/$to_distribute;
	if ($choice < 0) {
	  $both_n++;
	  next;
	}

	$choice -= $a_only_frac/$to_distribute;
	if ($choice < 0) {
	  $a_only_n++;
	  next;
	}

	$choice -= $b_only_frac/$to_distribute;
	if ($choice < 0) {
	  $b_only_n++;
	  next;
	}

	$no_n++;
      }

      $next_count_by_a_yes_by_b_yes->{$b_yes}{$a_yes} += $no_n
        if $no_n;
      $next_count_by_a_yes_by_b_yes->{$b_yes + 1}{$a_yes} += $b_only_n
        if $b_only_n;
      $next_count_by_a_yes_by_b_yes->{$b_yes}{$a_yes + 1} += $a_only_n
        if $a_only_n;
      $next_count_by_a_yes_by_b_yes->{$b_yes + 1}{$a_yes + 1} += $both_n
        if $both_n;
    }
  }

  $step++;
  # Now pass through, figure out who we've decided on
  my @b_yeses = sort {$a <=> $b} keys %$next_count_by_a_yes_by_b_yes;
  for my $b_yes (@b_yeses) {
    my $count_by_a_yes = $next_count_by_a_yes_by_b_yes->{$b_yes};
    my @a_yeses = sort {$a <=> $b} keys %$count_by_a_yes;

    filter_out_bottom_decisions(
      $b_yes, $count_by_a_yes, @a_yeses,
    );

    # And reversing filters at the top. :-)
    filter_out_bottom_decisions(
      $b_yes, $count_by_a_yes, reverse @a_yeses,
    );
  }


  $count_by_a_yes_by_b_yes = $next_count_by_a_yes_by_b_yes;
  if ($outstanding != $last_outstanding) {
    printf("% 6d% 13d% 9d% 11d\n"
      , $step, $outstanding, $correct, $incorrect
    );
  }
  $last_outstanding = $outstanding;
}


sub calculate_g_test {
  my ($a_yes, $a_no, $b_yes, $b_no) = @_;

  my $a = $a_yes + $a_no;
  my $b = $b_yes + $b_no;
 
  my $yes = $a_yes + $b_yes;
  my $no = $a_no + $b_no;

  my $total = $a + $b;

  my $e_a_yes = $a * $yes / $total;
  my $e_a_no  = $a * $no  / $total;
  my $e_b_yes = $b * $yes / $total;
  my $e_b_no  = $b * $no  / $total;

  if (
    $a_yes < 10 or
    $a_no < 10 or
    $b_yes < 10 or
    $b_no < 10
  ) {
#    print "@_ n/a\n";
    return "n/a";
  }

  my $g_test =
      2*$a_yes*log($a_yes/$e_a_yes)
    + 2*$a_no*log($a_no/$e_a_no)
    + 2*$b_yes*log($b_yes/$e_b_yes)
    + 2*$b_no*log($b_no/$e_b_no);

#  print "@_ $g_test\n";
}

sub filter_out_bottom_decisions {
  my $b_yes = shift;
  my $count_by_a_yes = shift;
  my @a_yeses = @_;
  for my $a_yes (@a_yeses) {
    my $g_test = calculate_g_test(
      $a_yes, $step - $a_yes,
      $b_yes, $step - $b_yes,
    );

    next if $g_test eq "n/a";

    if ($g_test > $threshold_g_test) {
      my $count = delete $count_by_a_yes->{$a_yes} || 0;
      $outstanding -= $count;
      if ($a_yes > $b_yes) {
        $incorrect += $count;
      }
      else {
        $correct += $count;
      }
    }
    else {
      # We're into insignificant territory now
      return;
    }
  }
}
