#!/usr/bin/perl -w 

# Dataset to analyse.  synccheck() compares this with the file name that
# follows "OwenData/" in each "Test run at" header and ignores runs for any
# other dataset; it also selects the heuristic labels and names the
# "indep-<dataset>.dat" output file further down.
$desired = "forest.bin";

# Inspect the current input line ($_) for a "Test run at" header.
#
# Globals read:    $_, $desired, $state, $proj
# Globals written: $state, $proj
#
# Returns 1 when the line is a header for the desired dataset; in that case
# $proj is set to the text between "-groupbits " and " -memorylimit" and
# $state becomes 2.  Returns 0 for every other line.  A header for a
# different dataset is reported on STDERR and resets $state to 1.
sub synccheck {
    return 0
        unless /Test run at .*OwenData\/(.*) -groupbits (.*) -memorylimit.*/;

    # Copy the captures at once so later statements cannot disturb them.
    my ($dataset, $groupbits) = ($1, $2);

    # A header seen while still inside a run means the previous projection
    # never reached its end.
    print "[Note] projection $proj ended early, in state $state\n"
        unless $state == 1;

    if ($dataset eq $desired) {
        $proj  = $groupbits;
        $state = 2;  #got sync, want data!
        return 1;
    }

    print STDERR "[Warning] See entry for wrong dataset, $dataset\n";
    $state = 1;
    return 0;
}



# take care, timeouts can occur anywhere.

$state = 1;  #seeking sync

# Parser states:
#   1  seeking a "Test run at" header for the desired dataset
#   2  header seen, waiting for "Using regular chunks of size 2"
#   3  reading the chunk-size-2 table into %A2
#   4  between tables: waiting for "Using sqrt", "Using regular chunks of
#      size 4", or the "Independence sum" line (which stores into %IS and
#      returns to state 1)
#   5  reading the sqrt table into %Asqrt
#   6  reading the chunk-size-4 table into %A4
#
# Destination hash for each table-reading state.
my %table_for = ( 3 => \%A2, 5 => \%Asqrt, 6 => \%A4 );

# Read table rows from the input until a blank line.
#
#   $table  hash reference that receives the rows
#   $label  state number, used only in diagnostics
#
# A row matching /\d+=\s*(\d+).*- (.*)/ is stored as
# $table->{"$proj;<second capture>"} = <first capture>; any other non-blank
# line is reported on STDERR and skipped.  Like the surrounding loop, this
# reads through the global $_.
#
# Returns 1 when the terminating blank line was found, 0 when the input ran
# out first.
sub _read_table {
    my ($table, $label) = @_;

    while (<>) {
        chomp;
        return 1 if /^\s*$/;

        if ( /\d+=\s*(\d+).*- (.*)/ ) {
            $table->{"$proj;$2"} = $1;
        }
        else {
            print STDERR "unexpected line in state $label = $_\n";
        }
    }
    return 0;
}

while (<>) {
    chomp;

    # Every line that reaches this loop is checked for a new header first,
    # because a run may stop part-way and be followed by the next one.
    synccheck();
    next if $state == 1;

    if ($state == 2) {
        next unless /Using regular chunks of size 2/;  #always have this
        $state = 3;  #getting data chunking 2
        next;
    }

    if ($state == 4) {
        if    ( /^Using sqrt/ )                     { $state = 5; }
        elsif ( /^Using regular chunks of size 4/ ) { $state = 6; }
        elsif ( /Independence sum = (\S+) / ) {
            $IS{"$proj"} = $1;
            $state = 1;
        }
        next;
    }

    if (my $table = $table_for{$state}) {
        # The current line (the first one after the "Using ..." line) is not
        # parsed as a row; rows are taken from the lines that follow it.
        # Headers are not looked for inside a table (assume no interruptions).
        if (_read_table($table, $state)) {
            $state = 4;  #seeking sqrt or 4 or indep sum
            next;
        }

        # Input ended inside a table.  Leave the loop explicitly: <> reports
        # end-of-file only once, and asking it again would start reading
        # STDIN.  The analyses below still run; they iterate over %IS, which
        # only holds projections that reached their "Independence sum" line.
        print STDERR "problem in state $state: input ended inside a table\n";
        last;
    }

    # Not reachable with the states listed above; kept as a safety net.
    print STDERR "oops 7: unexpected state $state\n";
}


# Sample standard deviation (n-1 denominator) of the numbers passed in @_.
#
# Returns 0 when fewer than two values are supplied, since the sample
# deviation is not defined there.
#
# The deviation is computed in two passes (mean first, then the squared
# distances from it).  The one-pass form  sum(x^2) - sum(x)^2/n  can come out
# slightly negative through rounding when the values are nearly equal, and
# Perl's sqrt() dies on a negative argument.
sub stddev {
    my @values = @_;
    my $count  = scalar @values;
    return 0 if $count < 2;  #nonsense

    my $sum = 0;
    foreach my $value (@values) { $sum += $value; }
    my $mean = $sum / $count;

    my $squared_dev = 0;
    foreach my $value (@values) { $squared_dev += ($value - $mean)**2; }

    return sqrt( $squared_dev / ($count - 1) );
}


# Arithmetic mean of the numbers passed in @_.
#
# Returns 0 for an empty list instead of dying with "Illegal division by
# zero"; this mirrors stddev(), which also answers 0 when it has too few
# values to work with.
sub avg {
    my @values = @_;
    my $count  = scalar @values;
    return 0 if $count == 0;

    my $sum = 0;
    foreach my $value (@values) { $sum += $value; }
    return $sum / $count;
}

# Labels used as the second half of the "projection;label" keys in the
# %A2 / %Asqrt / %A4 tables.
($IM, $FS, $GS, $DF) = (
    "Weighted Matching iterated BETTER over all dimensions",
    "Slice Sorting (Less-than ordering)",
    "Greedy Iteration (Less-than ordering)",
    "default normalization",
);

# The IteratedSliceCluster labels differ per dataset (yuck); each entry is
# [ chunks-of-4 label, chunks-of-2 label ].  For a dataset that is not
# listed, $IS4 and $IS2 are left unset.
my %slice_labels_for = (
    "census.bin" => [
        "IteratedSliceCluster chunks of 4 x 4 x 4 x 4 x 4 x 4",
        "IteratedSliceCluster chunks of 2 x 2 x 2 x 2 x 2 x 2",
    ],
    "forest.bin" => [
        "IteratedSliceCluster chunks of 4 x 4 x 4",
        "IteratedSliceCluster chunks of 2 x 2 x 2",
    ],
    "weather.bin" => [
        "IteratedSliceCluster chunks of 4 x 4 x 4 x 4 x 4",
        "IteratedSliceCluster chunks of 2 x 2 x 2 x 2 x 2",
    ],
);
if (my $labels = $slice_labels_for{$desired}) {
    ($IS4, $IS2) = @$labels;
}

# analysing keys %IS good: see only things that actually completed.

# analysis 1.  relationship between independence sum and ratio sort/wgt match
#on census, indep sum < 0.5 erratic behaviour, >0.5 both similar.
#
# Writes one "<independence sum> <ratio>" line per key of %IS, where ratio is
# the blocksize-2 value for slice sorting divided by the one for weighted
# matching.
# for gnuplot, say    plot "indep-census.bin.dat" with points
my $scatter_file = "indep-$desired.dat";
if (open(my $scat, '>', $scatter_file)) {
    foreach my $is (keys %IS) {
        my $sorted  = $A2{"$is;$FS"};
        my $matched = $A2{"$is;$IM"};

        # A missing or zero divisor would abort the whole script, and a
        # missing numerator would plot a bogus ratio of 0, so skip both.
        unless (defined $sorted and $matched) {
            print STDERR "[Warning] projection $is lacks blocksize-2 data for the scatter plot, skipped\n";
            next;
        }
        print $scat "$IS{$is} ", $sorted / $matched, "\n";
    }
    close($scat)
        or print STDERR "[Warning] error closing $scatter_file: $!\n";
}
else {
    # Report and carry on, so the remaining analyses are still printed.
    print STDERR "[Warning] cannot write $scatter_file: $!\n";
}


# analysis 2. Number of cases on 4x4 when Iterated Slice Clustering(4) beats Freq Sort
# Only run for census; a projection is counted only when the blocksize-4
# table has an entry for both heuristics.
if ($desired eq "census.bin") {
    my @comparable =
        grep { defined $A4{"$_;$IS4"} and defined $A4{"$_;$FS"} } keys %IS;

    $total = scalar(@comparable);
    $beats = scalar(grep { $A4{"$_;$IS4"} < $A4{"$_;$FS"} } @comparable);

    print "[OUTPUT] On census for blocksize 4, IS4 is superior to FS $beats times out of $total\n";
}

# analysis 3.  Each heuristic's stddev and mean (and number samples) for blocksize 2

if ($desired eq "census.bin") {
    @heurs = ($FS,$GS,$IS2,$IS4,$IM);
} else {
    @heurs = ($FS,$GS,$IM);
}

# Print, for every heuristic in @heurs, the number of samples, the mean and
# the standard deviation of  value(heuristic) / value(default normalization)
# taken over the keys of %IS.
#
#   $blocksize  text printed after "blocksize" in the heading
#   $table      hash reference holding the "projection;label" values
#
# A projection without an entry for the heuristic is left out, as before.
# One whose default-normalization entry is missing or zero is reported on
# STDERR and left out too, instead of aborting with a division by zero.
sub _report_ratios {
    my ($blocksize, $table) = @_;

    print "$desired and blocksize $blocksize\n";

    foreach my $h (@heurs) {
        my @ratios = ();
        foreach my $is (keys %IS) {
            my $value = $table->{"$is;$h"};
            next unless defined $value;

            my $baseline = $table->{"$is;$DF"};
            unless ($baseline) {
                print STDERR "[Warning] projection $is has no usable '$DF' value for blocksize $blocksize, skipped\n";
                next;
            }
            push @ratios, $value / $baseline;
        }

        # Do not ask for the mean of nothing; report 0 for an empty sample.
        my $mean = @ratios ? avg(@ratios) : 0;
        print "For $h datapts = ",scalar(@ratios)," mean = ",$mean," and stddev = ", stddev(@ratios),"\n";
    }
}

_report_ratios("2", \%A2);

# analysis 4.  Based on blocksize sqrt.

_report_ratios("sqr", \%Asqrt);
