package Binning;
$|=1;
use warnings;
use strict;
use rob_misc;
use Sort::Key qw(nsort);
use Carp qw(confess);
# Rob's module to do data binning for 1 and 2D data

# see docs at the end ...

require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# This allows declaration	use Binning ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
our %EXPORT_TAGS = ( 'all' => [ qw(
	
) ] );

our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

our @EXPORT = qw(
	
);

our $VERSION = '0.03';


###### methods and subroutines follow

sub new
{
    # function to make a new binning object
    my ($class)=@_;
    my $self={
	
	_data => [], # data array : actually a multi-d array of 
	# either:
	# x,1,n,width or x,y,n,width (pre-bin)

	_xrange => [],
	_yrange => [],

	_binned_data=>[], # x,y,dx,dy,n (post-bin, dx=bin width)
	
	_widths_calculated=>0, # 1 if the data widths are calculated

	# the data hash and its counter
	_datahash => {},
	_datahashcounter => {},

	# hash containing the binning options
	_binning_options => {},
	
	# input type:
	# 1 : scattered data, do not preserve areas under the input curve
	# 2 : a distribution, preserve the area under the curve
	# (see set_input_type)
	_input_type => 1,

	# compression options
	_compress => 0,
	_comrpession_string => undef,

	# verbose option
	_vb => 1
    };
    bless($self,$class);
    return $self;
}

sub destroy
{
    my $self=shift;
  
    # first remove the hashes
    %{$self->datahash}=() if($self->datahash);
    %{$self->datahashcounter}=() if($self->datahashcounter);

    # now go through and blow away the arrays of arrays
    undef(@$_) foreach (@{$self->{_data}},@{$self->{_binned_data}});
      
    # brutal:
    undef(${$self}{$_}) foreach (keys %{$self});
    undef($self);
}
sub DESTROY
{
    my $self=shift;
    $self->destroy();
}

sub load_data
{
    # load data from a file: this is the memory intensive way, load everything in
    # and then deal with it
    my $self=shift;
    my $filename=shift;
    my @data;
    print "Loading data from $filename ... "if($self->vb);
    print "Load data from $filename \n";
    open(FP,"<$filename")||confess("cannot open $filename for input in Binning.pm");
    my $c=0;
    my $code='while(<FP>)
    {
	next if /^\#/o; # skip comments
	chomp;
        next if /^\s*$/o;
        $data[$c++]=$_;
print "Set data[$c]=$_\n";
        # vb3
     }';

    $code=~s/\# _vb3/print \"Load DATA \$c: \$_\\n\";/o if($self->vb>=3);
    eval $code;
    close FP;
    my $n=$#data;
    $self->set_data(\@data);
    print "RETURN $#data\n";
    undef(@data);
    return $n;
}

sub load_data_onthefly
{
    # load data from a file: this is the per-line way: less memory intensive
    # but probably slower
    my $self=shift;
    my $filename=shift;
    print "Loading data on the fly from $filename ... "if($self->vb);
    my $datahash=$self->datahash;
    my $datahashcounter=$self->datahashcounter;
    my $ymin=1e300;
    my $ymax=-1e300;
    open(FP,"<$filename")||confess("cannot open $filename for input in Binning.pm");
    my $code='
    while($_=<FP>)
    {
	next if /^\#/o; # skip comments
	my @y=split('."' '".',$_);
	# compress   
	$y[1]=1 if(! defined($y[1]) );
 	$datahash{$y[0]}+=$y[1];
	$datahashcounter{$y[0]}++;
        # _vb3
	$ymin=MIN($ymin,$y[1]);
	$ymax=MAX($ymax,$y[1]);
    }
 ';
    
    $code=~s/\# _vb3/print \"DATA at x=\$y\[0\] now \$datahash\{\$y\[0\]\}\\n\";/o if($self->vb>=3);
    $code=~s/\# compress/\$y\[0\]=sprintf \"\$self->compression_string\",\$y\[0\];/o if($self->compress);
    print "EVAL: ",$code if($self->vb);
    eval $code;
    close FP;
    $self->set_data(undef);
    $self->{_yrange}[0]=$ymin;
    $self->{_yrange}[1]=$ymax;
}

sub datahash
{
    return $_[0]->{_datahash};
}
sub datahashcounter
{
    return $_[0]->{_datahashcounter};
}

sub save_binned_data
{
    # save the binned data to a file
    my $self=shift;
    my $filename=shift;
    my $comment=shift;
    print "Saving binned data..."if($self->vb);
    open(FP,">$filename")||confess("cannot open $filename for output in Binning.pm");
    my $d=$self->binned_data;
    print FP '#',$comment,"\n"if(defined($comment));
    
    foreach (@$d)
    {
	print FP join (' ',@{$_},"\n");
    }
    close FP;
    print "done\n"if($self->vb);
}

sub set_data
{
    my $self=shift;
    my $p=shift; 
    # set data in _data from an array pointer by *copying* the data
    print "Set data ... "if($self->vb);
    my $datahash=$self->datahash;
    my $datahashcounter=$self->datahashcounter;
   
    # If $p is defined then it is an array from which we should load
    # data. It may not be defined if data is loaded on-the-fly.
    if(defined($p))
    {
	# nb data is a 2D array, whether we input a series of numbers, 
	# or an array
  	print "Use data at $p ..."if($self->vb);
	
	# code loop which sets the datahashes: 
	# using eval to run it is is marginally
	# quicker
	
	# the split is the slow part: it cripples
	# us compared to the old bin_data.pl script
	# which makes more assumptions about consistency
	# of the data format

	# note: it seems that deleteing the array elements
	# on the fly (with delete $$p[$n++]) is faster than
	# the final undef and seems better at freeing memory:
	# perhaps becaue perl reuses the space freed by the delete
	# action. Nice!
	my $ymin=1e300;
	my $ymax=-1e300;
 
	my $code='
        
        my $n=0;
	foreach $_ (@$p)
        {
	    my @y;
	    if(ref($_))
	    {
		@y=@$_;
	    }
	    else
	    {
		@y=split('."' '".',$_);
	    }
            delete $$p[$n++]; 
	    # compress
            # _vb3
	    $y[1]=1 if(! defined($y[1]) );
            $$datahash{$y[0]}+=$y[1];
	    $$datahashcounter{$y[0]}++;
	    $ymin=MIN($ymin,$y[1]);
 	    $ymax=MAX($ymax,$y[1]);
 	}
	';

	# perhaps compress according to the compression string
	$code=~s/\# _vb3/print \"Set data \$n: \@y\\n\";/ if($self->vb>=3);

	$code=~s/\# compress/\$y\[0\]=1.0*sprintf \"\$self->compression_string\",\$y\[0\];/o if($self->compress);
	print "EVAL: ",$code if($self->vb);
	eval $code;
	undef(@$p);  # clean up (saves lots of RAM!)
	$self->{_yrange}[0]=$ymin;
	$self->{_yrange}[1]=$ymax;
    }

    print "mem use = ",mem_usage(1,$$),' MB ...' if($self->vb);


    my $da=$self->data_array; # save pointer

    # add data to the data array from the data hash, 
    # make sure it is sorted lowest to highest
    
    # list of x values
    print "Sorting ... "if($self->vb);

    # the fastest sorter is nsort, but this is just a <=>
    # type of sort. 
    my @k=keys %$datahash;
    @k=nsort @k;
    print "$#k keys ...Set up data_array (pointer $da) ..."if($self->vb);

    # from which we get the min and max
    $self->{_xrange}[0]=$k[0];
    $self->{_xrange}[1]=$k[$#k];
	 
    # now add the data to the data_array : 
    my $n=0;
    foreach $_ (@k)
    {
	push(@$da,[$_,$$datahash{$_},$$datahashcounter{$_}]);

	# by deleting the hash elements, we free up RAM and apparently
	# speed up further hash lookups
	delete($$datahash{$_});
	delete($$datahashcounter{$_});
    }

    print "postpush mem use = ",mem_usage(1,$$),' MB ...',"N=",$#{$self->data_array}," cf $#k ... Release RAM ..." if($self->vb);
    
    # release RAM asap (what we can! data_array is still huge)
    undef(@k);
    undef(%{$datahash});
    undef(%{$datahashcounter});
    
    print "post undef mem use = ",mem_usage(1,$$),' MB ... 'if($self->vb);
    
    # to preserve area we require the widths to be calculated for the input data 

    print 'Calc widths...'if($self->vb);
    $self->calc_widths;
    print "\n"if($self->vb);

    if($self->vb)
    {
	print "XRANGE ",join(' ',$self->xrange());
	print "YRANGE ",join(' ',$self->yrange());
    }
}

sub input_type
{
    my $self=shift;
    return($self->{_input_type});
}

sub set_input_type
{
    my $self=shift;
    my $opt=shift;
    if($opt eq 'scatter')
    {
	# input is scattered data, we should set each to have an equal width
	$self->{_input_type}=1;
    }
    elsif($opt eq 'distribution')
    {
	# in this case each point is part of a distribution and the
	# widths should be calculated
	$self->{_input_type}=2;
    }
}

sub calc_widths
{
    my $self=shift;
    # make an array of widths for an array, usually the pre-bin data,
    # for the first data point just use the difference between
    # the first and second (of course we lose information
    # by doing this, so have to fudge either the first or last point)
    my $totwidth=0.0;
    my $prev;
    my $d=shift; # pass in pointer
    $d||=$self->data_array; # reasonable default

    my $n=$#{$d};
    $self->{_widths_calculated}=1;
    
    print "Calc widths : n=$n, pointer=$d ... "if($self->vb);
    
    if($n==-1)
    {
	dumperr("Binning: Error in calc_widths! no data? (n=$n, pointer=$d)\n");
    }
   
    foreach (1..($n-1))
    {
	$$d[$_][3]=0.5* ($$d[$_+1][0]-$$d[$_-1][0]);
    }
    
    # guess end point widths based on nearest neighbours
    if($n>0)
    { 
	$$d[0][3]=$$d[1][0]-$$d[0][0];
	$$d[$n][3]=$$d[$n][0]-$$d[$n-1][0];
    }
    print "ok\n"if($self->vb);
}

sub dumperr
{
    # output error and quit
    print $_[0];
    exit;
}

sub data_array
{
    # return the original data as a 2D array : NB returns 2d array pointer
    return ($_[0]->{_data});
}

sub flatten_array
{
    # flatten 2d array so it is human-readble
    my $p=shift;
    my @y;
    
    foreach (@$p)
    {
	push(@y,'[');
	foreach (@{$_})
	{
	    push(@y,$_);
	}
	push(@y,'],');
    }
    $y[$#y]=~s/,//o if($#y>-1); # remove trailing comma
    return join(' ',@y);
}

sub flat_data_array
{
    # return the original data as a flat array suitable for human readability
    # (of course we cannot just trim the 2d array to 1d!)
    return(flatten_array($_[0]->data_array));
}

sub binned_data
{
    # return pointer to binned data
    return $_[0]->{_binned_data};
}

sub flat_binned_data
{
    # return human-readable binned data 
    return(flatten_array($_[0]->binned_data));
}

sub bin_data
{
    # bin the _data and leave the result in _binned_data
    my $self=shift;
    my $opts=shift;

    # erase binned_data on entry
    @{$self->binned_data}=();

    print "Pre-bin mem usage ",mem_usage(1,$$)," MB\n"if($self->vb);

    if($self->vb)
    {
	# print options
	print "Bin data $self\n";
	map
	{
	    print "Opt $_ = ${$opts}{$_}\n";
	}keys %$opts;
    }

    # bin data
    $self->bin_function($opts);
    
    # correct for compression artifacts
    $self->correct_for_compression() if($self->compress);
    
    print "Post-bin mem usage ",mem_usage(1,$$)," MB\n"if($self->vb);
}


sub bin_function
{
    # this is the gut of the binning code
    my $self=shift;
    my $opts=shift; #options hash, determines how we bin
                
    # perhaps based on the bin count? (flex binning)
    my $minbinn;
    my $binythresh;
    my $useabs=0;

    # bin start and end points
    my $binxmin;
    my $binxmax;

    # fixed-bin algorithm variables
    my $minx;
    my $maxx;
    my $dx;
    my $nextx;
 
    # oft-used pointer to the data_array
    my $d=$self->data_array;
 
    if(defined($$opts{method})&&($$opts{method} eq 'fixed'))
    {
	# perhaps use supplied min and max values
	$minx||=$$opts{min};
	$maxx||=$$opts{max};

	# otherwise use min and max of the data
	$minx=($self->xrange)[0] if(!defined($minx));
	$maxx=($self->xrange)[1] if(!defined($maxx));

	# calculate bin width	
	if(defined($maxx)&&defined($minx)&&(defined($$opts{nbins})))
	{
	    # determined by nbins
	    $dx=($maxx-$minx)/$$opts{nbins};
	}
	elsif(defined($$opts{binwidth}))
	{
	    # set manually
	    $dx=$$opts{binwidth};
	}
	else
	{
	    dumperr("ERROR: what is dx in fixed bin options?\n");
	}

	# put zero in each bin
	for(my $x=$minx;$x<=$maxx;$x+=$dx)
	{
	    
	}

	$nextx=$minx+$dx;
	print "Set orig nextx $nextx\n"if($self->vb);
    }
    else
    {
	if($$opts{threshold} =~/^count fraction (\S+)$/o)
	{
	    # count a fraction of points in each bin 
	    $minbinn=int($#{$d}*$1);
	}
	elsif($$opts{threshold} =~/^count (\d+)$/o)
	{
	    # count an absolute number of points in each bin
	    $minbinn=$1;
	}
	# or set a fraction of the total y or the total count 
	# (where y is the count)
	elsif($$opts{threshold} =~/^fraction of total (\S+)$/o)
	{
	    $binythresh=$1*$self->totaly();
	}
	# or use the absolute value
	elsif($$opts{threshold} =~/^absolute fraction of total (\S+)$/o)
	{
	    $binythresh=$1*$self->totalabsy();
	    $useabs=1;
	}
	# or set a fraction of the maximum y
	elsif($$opts{threshold} =~/^fraction of maximum (\S+)$/o)
	{
	    $binythresh=$1*$self->maxy();
	}
	# or set a fraction of the maximum y but use the absolute value
	elsif($$opts{threshold} =~/^absolute fraction of maximum (\S+)$/o)
	{
	    $binythresh=$1*$self->maxabsy();
	    print "Set binytresh $binythresh from ",$self->maxabsy,"\n";
	    $useabs=1;
	}
	# or use an absolute threshold
	elsif($$opts{threshold} =~/^absolute (\S+)$/o)
	{
	    $binythresh=$1;
	    $useabs=1;
	}
	else
	{
	    dumperr( "ERROR : I have no idea how to bin in flex-bin mode\n");
	}
    }
    
    if($self->vb)
    {
	print "DX=$dx between $minx and $maxx\n"if(defined($dx));
	print "Minbinn $minbinn\n" if(defined($minbinn));
	print "binYtresh $binythresh\n"if(defined($binythresh)); 
    }

    # variables used by all the binning routines:
    # binx and biny are integrated, binn is the number
    # of points in a bin, and binwidth its current width
    my $binx=0.0;
    my $biny=0.0;
    my $binn=0;
    my $binwidth=0.0;
    my $binminx;
    my $prevbinminx;
    my $datan=0;
    my $prevdatan;
    my $absbiny=0.0; # as biny but integral of the abs(y) 
    my $prevx;
    my $bd=$self->binned_data;

    # useful functions which really want to share the namespace
    # NOTE: they share variables, hence they have to be declared
    # as references ... grrr (but it's fast enough)
    my $shove_ref = sub
    {
	# function to push data onto the binned_data array (used a lot)
	my $x;
	if($$opts{halfx})
	{
	    # output at the half-way point (good for plotting)
	    $x=$binminx + 0.5*$binwidth;
	}
	else
	{
	    # default to the mean over the number of points 
	    if($datan>0)
	    {
		$x=$binx/$datan;
	    }
	    else
	    {
		$x=$binx; # fallback
	    }
	}

	$biny/=$binwidth;  # always divide by bin width

	
	printf "xshove: x=%g : y=%g (binwidth=%g), datan=%d (binwidth=$binwidth)\n",
	       $x,$biny,$binwidth,$datan,$binwidth if($self->vb);

	if(defined($$opts{method})&&
	   ($$opts{method} eq 'fixed')&&
	   (defined($prevx)))
	{
	    # fill with zeros
	    my $fx=$x-(0.5*$dx); # almost x
	    for(my $f=$prevx+$dx;$f<$fx;$f+=$dx)
	    {
		push(@{$bd},
		     [$f, # take the average x value 
		      0, # use the normalized width
		      $binwidth, # width of the bin, from the lowest point to the highest
		      0, # poisson error
		      0 # number
		     ]);
	    }
	}

	push(@{$bd},
	     [$x, # take the average x value 
	      $biny, # use the normalized width
	      $binwidth, # width of the bin, from the lowest point to the highest
	      poisson($biny,$binn), # poisson error
	      $binn # number
	      ]);

	# reset the bin information
	    	    
	# shift the boundary
	$nextx+=$dx if(defined($nextx));
	$prevbinminx=$binminx;
	$binminx+=$binwidth;
	$prevx=$x;
	$binx=$absbiny=$biny=$binwidth=0.0;
	$prevdatan=$datan;
	$binn=$datan=0;
    };

    # loop over the data, binning when desired
    my $nmax=$#{$d}+1; # max loop number
    my $n=0;
    my @prev;

    $binminx=$minx if(defined($minx)); # set fixed bin minimum
        
    foreach (0..$#{$d})
    {
	$n++; # count up...
	@_=@{$$d[$_]}; # set the data in @_ (fast)
	
	# RAW DATA:
	# 0 = x, 1 = y, 2 = n, 3 = dx

	if($self->vb)
	{
	    print "DATA x=$_[0] ";
	    print "cf nextx=$nextx "if(defined($nextx));
	    print "\n";
	}

	# check for fixed-width binning
	if(defined($nextx)&&($_[0]>=$nextx))
	{
	    print "Fixed : shove previous\n"if($self->vb);
	    # this x exceeds the next bin, so dump the bin
	    # before adding to it
	    $binwidth=$dx; # fix the bin width (fixed method!)
	    &$shove_ref();
	    # careful to set the next binminx in case of missing points
	    $nextx+=$dx while($nextx<$_[0]);
	    $binminx=$nextx-$dx;
	    print "Post-shove binminx=$binminx\n"if($self->vb);
	}

	# save the bin min x value
	if(!defined($binminx))
	{
	    if(defined($prevbinminx))
	    {
		$binminx=$prevbinminx;
	    }
	    else
	    {
		$binminx= $_[0] - $_[3]*0.5 ;
	    }
	}

	# add to total x (if required), y and n values
	$binx+=$_[0];	

	# note: in the case where there is no y value given,
	# a value of '1' is set in its place
	if($self->input_type==1)
	{
	    # scatter plot: integral is irrelevant
	    $biny+=$_[1];
	    #print "ADD $_[1] to $biny\n";
	    $absbiny+=abs($_[1]);
	}
	elsif($self->input_type==2)
	{
	    # we store the integral under the curve: this should be preserved!
	    my $tmp=$_[1]*$_[3];
	    #print "ADD y=$_[1]*$_[3]= $tmp to $biny\n";
	    $biny+=$tmp;
	    $absbiny+=abs($tmp);
	}
	
	# add up the bin width
	$binwidth+=$_[3];

	# save the number of objects in this bin
	$binn+=$_[2];

	# save the number of datapoints which contributed to this bin
	$datan++;

	# next bin?
	if(# check if y >= threshold
	   ($useabs && defined($binythresh) && ($absbiny >=$binythresh))||
	   ((!$useabs) && defined($binythresh) && ($biny >= $binythresh))||
	   # otherwise n >= threshold
	   (defined($minbinn)&&($binn>=$minbinn)))
	{
	    # add to the binned data
	    &$shove_ref();
	}
	elsif($n==$nmax)
	{
	    if(defined($nextx))
	    {
		print "Fixed : shove last\n"if($self->vb);
		# this x exceeds the next bin, so dump the bin
		# before adding to it
		$binwidth=$dx; # fix the bin width (fixed method!)
	    }
	    else
	    {
		# push data onto final binned_array
		# item to make an extra-populated final bin
		@_=@{pop(@{$bd})};
		
		# set the previous binminx to the current
		$binminx = $prevbinminx;
		
		# add previous x contribution
		$binx += $_[0];

		if($self->input_type==1)
		{
		    # scatter plot: integral is irrelevant
		    $biny+=$_[1];
		    $absbiny+=abs($_[1]);
		}
		elsif($self->input_type==2)
		{
		    # we store the integral under the curve: this should be preserved!
		    $biny+=$_[1]*$_[2];
		    $absbiny+=abs($_[1]*$_[2]);
		}

		# add up the previous bin's width
		$binwidth+=$_[2];

		# add the number of points
		$binn+=$_[4];

		# add previous data points to the current
		$datan+=$prevdatan;
	   }
	    # call the usual shove function
	    &$shove_ref();  
	}
	@prev=@_;
    }
}

sub compress
{
    return($_[0]->{_compress});
}
sub compression_string
{
    return($_[0]->{_compression_string});
}

sub correct_for_compression
{
    my $self=shift;
    # sometimes the compression routine does not work 
    # because the scalings go all awry, so correct for this
    # by forcing the max y values to be the same
    my $maxy=$self->maxabsy;
    my $maxbiny=0.0;
    
    foreach (@{$self->binned_data})
    {
	$maxbiny=MAX($maxbiny,abs(${$_}[1]) );
    }
    return if ($maxbiny==0.0);
    my $ratio=$maxy/$maxbiny;
    if($self->compression_string=~/\.(\d+)f/o)
    {
    	$self->arraymult($self->binned_data,1,$ratio) ;
    	$self->arraymult($self->binned_data,3,$ratio) ;
    }
}

sub poisson
{
    # return the poisson error on y from n measurements
    my $y=shift;
    my $n=shift;
    return ($y * (MAX(1,$n))**-0.5);
}

# total and max/min functions for the data
sub totalx
{
    return($_[0]->totalval(0));
}

sub totalabsx
{
    return($_[0]->totalabsval(0));
}

sub totaly
{
    return($_[0]->totalval(1));
}


sub totalabsy
{
    return($_[0]->totalabsval(1));
}
sub maxx
{
    return($_[0]->{_xrange}[1]);
}
sub maxy
{
    return($_[0]->{_yrange}[1]);
}

sub maxabsy
{
    # maximum deviation from zero
    my @yr=@{$_[0]->{_yrange}};
    return MAX(abs($yr[0]),abs($yr[1]));
}

sub minx
{
    return($_[0]->{_xrange}[0]);
}
sub miny
{
    return($_[0]->{_yrange}[0]);
}
sub xrange
{
    # return array cotaining min,max of x data
    return @{$_[0]->{_xrange}};
}
sub yrange
{
    # return array containing min,max of y data
    return @{$_[0]->{_yrange}};
}

sub dx
{
    # return the difference between xmax and xmin
    my $self=shift;
    return(($self->xrange)[1]- ($self->xrange)[0]);
}

sub dy
{
    # return the difference between ymax and ymin
    my $self=shift;
    return(($self->yrange)[1]-($self->yrange)[0]);
}


sub totalval
{
    # return the sum of all the x(n=0) or y(n=1) data
    my $self=shift;
    my $n=shift;
    my $tot=0.0;
    
    my $d=$self->data_array;
    foreach (@$d)
    {
	$tot += ${$_}[$n];
    }
    return $tot;
}

sub totalabsval
{
    # return the absolute sum of all the x(n=0) or y(n=1) data
    my $self=shift;
    my $n=shift;
    my $tot=0.0;
    
    my $d=$self->data_array;
    foreach (@$d)
    {
	$tot += abs(${$_}[$n]);
    }
    return $tot;
}

sub arraymax
{
    # find max val from 2d array, look in position $n 
    my $self=shift;
    my $p=shift; # 2d array pointer
    my $n=shift; # position
    my $max=-1e300;
    
    foreach (@$p)
    {
	$max=MAX($max,${$_}[$n]);
    }
    return $max;
}

sub arraydivide
{
    # divide a whole array $p at position $n by $f 
    my $self=shift;
    $_[2]=1.0/$_[2];
    $self->arraymult(@_);
}

sub arraymult
{
    # multiply a whole array $p at position $n by $f 
    my $self=shift;
    my $p=shift; # 2d array pointer
    my $n=shift; # position
    my $f=shift; # scalar
    
    foreach (@$p)
    {
	${$_}[$n]*=$f; # * is faster?
    }
}

sub area
{
    # calculate the integral of the curve i.e the area
    my $self=shift;
    my $p=shift; # data array
    my $n=shift; # variable counter to use as the width
    my $area=0.0;

    print "Area calculation: p=$p, n=$n\n"if($self->vb>=2);
    my $evalstring='
    foreach (@$p)
    {
	$area += ${$_}[1] * ${$_}[$n];
	# _vb
    }';
    $evalstring =~s/\# _vb/print \"Add \$\{\$_\}\[1\] * \$\{\$_\}\[\$n\] -> now \$area\\n\"/o if($self->vb>=2); 
   
    eval $evalstring;
    print "Area = $area\n"if($self->vb>=2);
    return $area;
}

sub absarea
{
    # calculate the integral of the absolute curve i.e absolute area
    my $self=shift;
    my $p=shift; # data array
    my $n=shift; # variable counter to use as the width
    my $area=0.0;
    
    foreach (@$p)
    {
	$area += abs(${$_}[1]) * ${$_}[$n];
    }
    return $area;
}


sub data_area
{
    my $self=shift;
    # calculate data area
    if($self->{_widths_calculated}==0)
    {
	print "Data area calls calc_widths\n"if($self->vb);
	$self->calc_widths;
    }
    $self->area($self->data_array,3);
}

sub data_absarea
{
    my $self=shift;
    # calculate data area
    $self->absarea($self->data_array,3);
}

sub binned_data_area
{
    # calculate binned data area
    my $self=shift;
    $self->area($self->binned_data,2);
}

sub binned_data_absarea
{
    # calculate binned data area
    my $self=shift;
    $self->absarea($self->binned_data,2);
}

sub set_binned_data_area
{
    my $self=shift;
    my $newarea=shift;
    # set the binned data area to the given value $newarea
    
    # determine the original area
    my $area=$self->binned_data_area;
    if($area==0.0)
    {
	dumperr("Binned data area = 0 in set_binned_data_area\n");
    }

    printf "Multiply binned area by %g from newarea=$newarea, old area=$area\n",$newarea/$area if($self->vb);
    
    # multiply
    my $f=$newarea/$area;
    $self->arraymult($self->binned_data,1,$f);
    $self->arraymult($self->binned_data,3,$f);
}

sub verbose
{
    my $self=shift;
    my $n=shift;
    if(defined($n))
    {
	$self->{_vb}=$n;
    }
    else
    {
	$self->{_vb}++;    
    }
}

sub quiet
{
    $_[0]->{_vb}=0;
}

sub vb
{
    return $_[0]->{_vb};
}

sub set_xcompression
{
    my $self=shift;
    my $opts=shift;
    $self->compress=1;
    $self->compression_string=$$opts{format} if (defined($$opts{format}));
    print "Set x-data compression on with string \"$self->compression_string\"\n"if($self->vb);
}




sub normalize_data_wrapper
{
    my $self=shift;
    my $p=shift; # array to normalize
    my $max=$self->maxy;
    # normalize data to 1, for each column left in @_
    print "Normalize data : p=$p maxy=$max (multipliers @_)\n" if($self->vb);
    return if ($max==0.0); # can't do anything!

    # divide 
    foreach (@_)
    {
	$self->arraydivide($p,$_,$max);    
    }
}

sub normalize_data
{
    # wrapper to normalize binned y data to 1
    my $self=shift;
    print "Normalize input data...\n" if($self->vb);
    
    # normalize y only
    $self->normalize_data_wrapper($self->data_array,1);
}
sub normalize_binned_data
{
    # wrapper to normalize binned y data to 1
    my $self=shift;
    print "Normalize binned data...\n" if($self->vb);
    
    # find max
    my $bmax;
    foreach my $l (@{$self->binned_data})
    {
	my @dl=@{$l};
	if(defined($bmax))
	{
	    $bmax=MAX($bmax,$dl[1]);
	}
	else
	{
	    $bmax=$dl[1];
	}
    }
    $bmax=1.0/$bmax;

    for(my $i=0;$i<=$#{$self->binned_data};$i++)
    {
	${$self->binned_data}[$i][1]*=$bmax;
	${$self->binned_data}[$i][3]*=$bmax;
    }

    # normalize y, dy only
    #$self->normalize_data_wrapper($self->binned_data,1);
}

sub zero_pad_binned_data
{
    # add zeros at the end of the bins so that a plotted histogram 
    # always has zero-points at the end
    my $self=shift;
    print "Zero pad binned data...\n" if($self->vb);
    
    # low end
    my $dx=${$self->binned_data}[0][2];
    my $lowx=${$self->binned_data}[0][0]-$dx;
    unshift @{$self->binned_data},[$lowx,0.0,$dx,0.0,0];

    # high end
    my $n=$#{$self->binned_data};
    $dx=${$self->binned_data}[$n][2];
    my $highx=${$self->binned_data}[$n][0]+$dx;
    push @{$self->binned_data},[$highx,0.0,$dx,0.0,0];
}


sub version
{
    return $VERSION;
}

1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

Binning - Perl extension for blah blah blah

=head1 SYNOPSIS

  use Binning;

  my $o=Binning->new();
  $o->set_data(1,2,3,4);
  $o->set_input_type('scatter');
  $o->load_data('test.dat');
  $o->bin_data({method => 'fixed',nbins => 4});  
  $o->save_binned_data('/tmp/binned.dat');


=head1 DESCRIPTION

This module should deal with binning of 1 and 2d datasets.

1) Make an object:

    my $o=Binning->new();

2) Loading data

You can do this directly through the set_data method, which takes
either one or two-dimensional arrays:

    $o->set_data[ 1, 2, 3 ... ];
    $o->set_data[ [1,2,3] , [4,5,6] , ...]

or load data from a file:

    $o->load_data(<filename>)

Note that load_data() slurps in the data then analyses it (classic
perl!) but you can use a function while loads in on the fly and saves
a little memory:

$o->load_data_onthefly(<filename>);

I assume the data is of the form

(x) or (x,y) - in the former case (no y) y is assumed to be 1
(i.e. this is for a pure count).

 
You should also set the input type to either 'scatter' or 'distribution',
depending on the source of your data:

    $o->set_input_type(...);

scatter: the area under the input is not preserved, final bin y values are
simple sums. NOTE this only works with Fixed bins!
distribution: the area under the input is preserved, so the final area of the
histogram equals the data area. NOTE When you use Flex binning, this is implied!



3) Bin data

There are two main binning methods: flex and fixed.
Flex allows you to bin as many data points per bin
until you read some threshold e.g.  points.

Flex bin

Note: Using "Flex bin" sets the input type to "distribution"
because it tries to conserve the area in each bin.
 
e.g.

Bin after every 4 data points:

    $o->bin_data({method => 'flex',threshold => 'count 4'});

Bin after every 1/10 of the total number of data points:

    $o->bin_data({method => 'flex',threshold => 'count fraction 0.1'});

Bin every 1/10 of the total integrated y value:

    $o->bin_data({method => 'flex',threshold => 'fraction of total 0.1'});

Bin every 1/10 of the total integrated absolute y value:

    $o->bin_data({method => 'flex',threshold => 'absolute fraction of total 0.1'});

Every 1/4 of the maximum y value:

    $o->bin_data({method => 'flex',threshold => 'fraction of maximum 0.4'});

Every 1/4 of the maximum absolute y value:

    $o->bin_data({method => 'flex',threshold => 'absolute fraction of maximum 0.4'});

Bin when the bin has an absolute value of 3 (abs(y) values are added instead of just y)

    $o->bin_data({method => 'flex',threshold => 'absolute 3'});

Othere options dictate the storage of the bin's x value:
If you make halfx=>1 then the bin x values are chosen half way
through the bin, otherwise then are taken at the midpoint
as averaged over the values *by number count* (i.e. not
weighted by the y values). 

Of course, in reality the actual x value is not so important, 
but you might want to plot at the half-way points to make it look
nicer.


Fixed binning


In this case you either specify:

a) the number of bins between a minimum and maximum x value
(if the max/min are not specified, the data range is used)

e.g.
    $o->bin_data({method => 'fixed',nbins => 4});

b) a fixed bin width 
    $o->bin_data({method => 'fixed',binwidth => 0.1});



4 Data normalization



The reason I wrote this module was that I was fed up with messing
around with my old (rubbish) code and normalization tricks.

You might want to normalize to the maximum point (so everything
is <1) in which case use:

    $o->normalize_binned_data();

You can also determine the area under the curve according to

    $o->binned_data_area();

You can set the area to a given value:

    $o->set_binned_data_area(scalar)



5 Other functions


Save your data with save_binned_data:

    $o->save_binned_data('/tmp/binned.dat');

It might save you a *lot* of memory to compress the x-data down to a 
more sensible resolution. The module can do this for you if you use
the set_xcompression method. You pass a format string for sprintf
which is then used on each x data value e.g.

    $o->set_xcompression({format=>'%1.1f'});

NB you must do this *before* the data is loaded!

WARNING : if you compress too much, then there will be big problems.
Binning.pm tries to normalize the compressed distribution to the 
original one by using the ratio of the absolute maxima : however, this
may not work in some pathological cases and does *not* preserve the 
area under the curves! BE WARNED use compression with care!

If you change your mind, turn this off with

    $o->no_xcompression();

You can directly access the data through these functions:

To get a pointer to the original (2d) data array:

    $o->data_array

To get the binned result (pointer to 2d array): 

    $o->binned_data

To get a human-readable string:

    $o->flat_data_array

To find the max, min etc. try
    $o->minx, $o->miny, $o->maxx, $o->maxy

To get the sum of the x, y data
    $o->totalx, $o->totaly


To see some verbose output set

    $o->verbose();

You can set this multiple times, or via 

    $o->verbose(<n>);

the verbosity levels being:

1 Some guide as to what is going on
2 Information on calculations (could be long)
3 Information on data input (could be very long)


To stop this output set

    $o->quiet();




=head2 EXPORT

None by default.



=head1 SEE ALSO

This replaces rob's bin_data.pl script. I hope.



=head1 AUTHOR

Rob Izzard

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2008 by Rob Izzard

This library is NOT free software; you MAY NOT redistribute it and/or modify
it under the same terms as Perl itself, or anything else. PISS OFF!


=cut
