#! /usr/bin/env perl
use strict;
use warnings;

use Data::Dumper;
$Data::Dumper::Indent = 1;
use Getopt::Long;
use List::Util qw(sum);
use Pod::Usage qw(pod2usage);
use Statistics::Distributions qw(udistr uprob);

my $error_rate;
my $help;
my $n;
my $output_file;
my $win;

my $options_processed = GetOptions(
  'error-rate=f'    => \$error_rate,
  'help'            => \$help,
  'n=i'             => \$n,
  'output-file=s'   => \$output_file,
  'win=f'           => \$win,
);

# Obvious error checking.
if ($help or not $options_processed) {
  pod2usage(-verbose => 2);
}

if ($error_rate and $error_rate < 0) {
  die "--error-rate $error_rate cannot be < 0";
}
if ($n and $n < 0) {
  die "--n $n cannot be < 0";
}
if ($win and $win < 0) {
  die "--win $win cannot be < 0";
}

# We can't have any 3 of these...
my @specified;
if ($error_rate) {
  push @specified, '--error-rate';
}
if ($n) {
  push @specified, '--n';
}
if ($win) {
  push @specified, '--win';
}

if (2 != @specified) {
  print STDERR "Must specify 2 of --error-rate, --n, and --win\n";
  die "Actually specified: " . join " ", @specified;
}

# To have a final stop, we need this to be odd.
if ($n and not $n % 2) {
  $n--;
}

my $result;
if ($win and $error_rate) {
  # This n is large enough that we'll probably run out of memory first.
  # Wherever it stops is where  we should stop.
  $result = run_simulation(
    n => 1_000_000_000,
    win => $win,
    error_rate => $error_rate,
  );
}
elsif ($n and $error_rate) {
  $result = fit_param(
    error_rate => $error_rate,
    n => $n,
    win => udistr($error_rate) / sqrt($n),
    range_param => "win",
  );
}
else {
  $result = fit_param(
    error_rate => uprob(2 * $win / sqrt($n))/$n,
    n => $n,
    win => $win,
    range_param => "error_rate",
  );

}

if (not $result) {
  die "Failed to implement for combination: " . join " ", @specified;;
}
else {
  my $stop = delete $result->{stop};
  if ($output_file) {
    output($output_file, $stop);
  }
  print Dumper($result);
}

sub run_simulation {
  my %param = @_;
  my $error_rate = delete $param{error_rate} or die "No error_rate";
  my $n = delete $param{n} or die "No n";
  my $win = delete $param{win} or die "No win";

  # We arbitrarily decide that A is better, then make all cutoffs
  # symmetrical.
  my $p_a = (1 + $win) / (2 + $win);
  my $p_b = 1 - $p_a;
  my $cumulative_errors = 0;
  my $cumulative_right = 0;
  my $last_error = 0;
  my $last_decision = 0;
  my @left = (1);
  my @stop; # Array of possible values.
  my $fix_ratio = 1; # Hack to avoid cumulative roundoff errors.

  for my $i (1..$n) {
    my @new_left;
    for my $j (0..$#left) {
      $left[$j] *= $fix_ratio;
      $new_left[$j+1] += $p_a * $left[$j];
      $new_left[$j] += $p_b * $left[$j];
    }

    my $this_error = 0;
    while (@new_left and $new_left[0] + $this_error < $error_rate) {
      # $#new_left is the index of the last element of @new_left, which
      # after we center the array and take into account that possible values
      # are separated by 2, is the width that we are cutting off.
      push @stop, "$#new_left,$i";
      # The bottom value we got wrong.
      $this_error += $new_left[0];
      $cumulative_errors += shift @new_left;
      # The top value we got right.
      $cumulative_right += pop @new_left if @new_left;
    }

    if ($this_error) {
      # We possibly made a decision.
      $last_error = $this_error;
      $last_decision = $i;
    }

    @left = @new_left;

    last if not @left; # Can't keep going in this case.

    my $total_running = sum(@left);
    my $total_should_run = 1 - $cumulative_errors - $cumulative_right;
    # And our hack to keep roundoff errors from building up too much.
    $fix_ratio = $total_should_run / $total_running if $total_running;
  }

  # Tally up what a pure stop would cost.
  my $remaining_errors = 0;
  while (@left) {
    $remaining_errors += shift @left;
    pop @left;
  }

  return {
    right => $cumulative_right,
    errors => $cumulative_errors,
    stop => \@stop,
    error_rate => $error_rate,
    n => $n,
    win => $win,
    remaining_errors => $remaining_errors,
    last_error => $last_error,
    last_decision => $last_decision,
  }
}

sub output {
  my ($file, $stop) = @_;
  open(OUT, ">", $file) or die "Can't write '$file': $!";
  for my $threshold (@$stop) {
    my ($m, $n) = split /,/, $threshold;
    my $std = $m/sqrt($n);
    my $p = 2 * uprob($std);
    print OUT join "\t", $n, $m, $std, $p;
    print OUT "\n";
  }
}

sub evaluate_fit {
  my $run = shift;
  #delete $run->{stop};
  #print Dumper($run);
  if ($run->{remaining_errors}) {
    return $run->{remaining_errors};
  }
  else {
    my $missed = ($run->{n} - $run->{last_decision}) * $run->{error_rate}
                 + ($run->{error_rate} - $run->{last_error});
    if ($missed < 0.1 * $run->{error_rate}) {
      return 0;
    }
    else {
      return - $missed;
    }
  }
}

sub fit_param {
  my %param = @_;
  my $range_param = delete $param{range_param} or die "Need a range_param";
  unless ($param{$range_param}) {
    die "Guess for $range_param mising.";
  }
  my @range_values = ($param{$range_param} / 2, $param{$range_param});

  my $max_iterations = delete $param{max_iterations} || 30;

  $param{$range_param} = $range_values[0];
  my @runs = run_simulation(%param);
  $param{$range_param} = $range_values[1];
  push @runs, run_simulation(%param);

  # Figure which end gives a higher fit, whether we're in range, etc.
  my $higher_fit = 0;
  if (0 == evaluate_fit($runs[0])) {
    # We were lucky our guess was off by 2!
    return $runs[0];
  }
  elsif (0 == evaluate_fit($runs[1])) {
    # We guessed right!
    return $runs[1];
  }
  elsif (evaluate_fit($runs[0]) < evaluate_fit($runs[1])) {
    $higher_fit = 1;
  }

  # Make sure the one that gets a higher fit is above 0.
  while (evaluate_fit($runs[$higher_fit]) < 0) {
    $range_values[$higher_fit] *= $higher_fit ? 2 : 0.5;
    $param{$range_param} = $range_values[$higher_fit];
    $runs[$higher_fit] = run_simulation(%param);
  }

  # Make sure the one that gets a lower fit is below 0.
  while (0 < evaluate_fit($runs[1 - $higher_fit])) {
    $range_values[1 - $higher_fit] *= $higher_fit ? 0.5 : 2;
    $param{$range_param} = $range_values[1 - $higher_fit];
    $runs[1 - $higher_fit] = run_simulation(%param);
  }


  for (1..$max_iterations) {
    my $mid = $range_values[0]/2 + $range_values[1]/2;
    $param{$range_param} = $mid;
    my $mid_run = run_simulation(%param);
    if (0 == evaluate_fit($mid_run)) {
      return $mid_run;
    }
    elsif (evaluate_fit($mid_run) < 0) {
      $range_values[1 - $higher_fit] = $mid;
      $runs[1 - $higher_fit] = $mid_run;
    }
    else {
      $range_values[$higher_fit] = $mid;
      $runs[$higher_fit] = $mid_run;
    }
  }
  # We didn't get close enough?  Oh well...return the one that finished first.
  return $runs[1 - $higher_fit];
}

__END__

=head1 NAME

find-fixed-conversion-limits - Find stats procedure for fixed data windows.

=head1 SYNOPSIS

  ./find-fixed-conversion-limits --win 0.02 --n 10000 --file output.txt

=head1 DESCRIPTION

Uses numerical simulations to find a procedure to identify the better version
as well as you can within a fixed number of conversions.

It prints summary information about the procedure that it discovers, and will
optionally print the procedure itself to a file.

If you've specified any two of the options --error-rate, --n, and
--win, then it will solve for the last.  Specifying all three is an error.

=head1 OPTIONS

=head2 --error-rate RATE

The average rate of errors per conversion that you're willing to make.
The willingness to make errors accumulates, so at a particular cutoff
threshold you may make errors at several times this rate.

This cannot be specified if you've also specified both --n and --win.

=head2 --help

Display this documentation.

=head2 --n MAX-CONVERSIONS

How many conversions you are willing to go to.  The very last stopping
number has to be at an odd number of conversions, so that you know which
way to go.  Therefore this is silently rounded down if an even number is
given.

This cannot be specified if you've also specified --error-rate, and
--win.

=head2 --output-file FILE

If specified, the stopping rules, and basic statistics about them are
written to the output file.

=head2 --win WIN

The amount the conversion rate for B one version is better than the other.

This cannot be specified if you've also specified of --error-rate and
--n.

=head1 BUGS

None reported.

=head1 AUTHOR

Ben Tilly <btilly@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright 2012 by btilly@gmail.com.

This is free software, you can redistribute it and/or modify it under the
same terms as the Perl 5 programming language itself.