#!/usr/bin/perl -w

#
# dbstats
# Copyright (C) 1991-2000 by John Heidemann <johnh@isi.edu>
# $Id: dbstats,v 1.39 2001/11/02 22:36:22 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
# usage: print the help message to STDERR and exit with non-zero status.
# (Fixed: documents -f, which the getopt string accepts; repaired the
# fused conf_high/conf_pct columns in the sample output.)
sub usage {
	print STDERR <<END;
usage: $0 [-amS] [-c ConfidencePercent] [-f Format] [-q NumberOfQuartiles] column

Compute statistics over a column of data.
Records containing non-numeric data are considered null and
do not contribute to the stats (optionally they are treated as zeros).

Confidence intervals are a t-test (+/- (t_{a/2})*s/sqrt(n))
and assume the population takes a normal distribution
with a small number of samples (< 100).

All statistics are computed for a population sample (with an ``n-1'' term),
not as representing the whole population (using ``n'').

Stats are probably best looked at after post-processing the output
with dblistize.

Options:
    -c ConfidencePercent    specify confidence intervals
    -a	    compute stats over all records (treat non-numeric records
		as zero rather than just ignoring them)
    -f Format   specify a printf-style format for the output statistics
    -m      compute median value
    -q N    compute quartile (if N=4) or ntile values (the scores
	    that are 1 Nth of the way across the population)
    -S	    assume data is already sorted

Dbstats runs in O(1) memory.  Median or quartile requires sorting the
data and invokes dbsort.  Sorting will run in constant RAM but
O(number of records) disk space.  If median or quartile is required
and the data is already sorted, dbstats will run more efficiently with
the -S option.


Sample input:
#h      absdiff
0
0.046953
0.072074
0.075413
0.094088
0.096602
#  | /home/johnh/BIN/DB/dbrow 
#  | /home/johnh/BIN/DB/dbcol event clock
#  | dbrowdiff clock
#  | /home/johnh/BIN/DB/dbcol absdiff

Sample command:
cat data.jdb | dbstats absdiff

Sample output:
#h mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n
0.064188        0.036194        56.387  0.037989        0.026199        0.10218 0.95     0.38513 0.031271        0       0.096602        6
#  | /home/johnh/BIN/DB/dbrow 
#  | /home/johnh/BIN/DB/dbcol event clock
#  | dbrowdiff clock
#  | /home/johnh/BIN/DB/dbcol absdiff
#  | dbstats absdiff
#               0.95 confidence intervals assume normal distribution and small n.

Related programs:
dbmultistats:  handles multiple experiments in a single file.
dblistize:  pretty-print the output of dbstats
dbcolpercentile:  compute an even more general version of median/quartiles
END
	exit 1;
}

BEGIN {
    # Installation-dependent paths.  $dbbindir is where companion
    # programs (dbsort) live; $dblibdir is appended to @INC so the
    # use/require lines below can locate the jdb support libraries.
    $dbbindir = "/usr/local/bin";
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
use DbGetopt;
use DbTDistr;
require "$dblibdir/dblib.pl";

# Defaults and option state.
$conf_pct = 0.95;		# -c: confidence level for the t-test interval
$format = $default_format;	# -f: sprintf format for output values (from dblib.pl)
my(@orig_argv) = @ARGV;		# remembered for the trailing provenance comment
my($prog) = &progname;
my($sorting_required) = 1;	# cleared by -S (data already sorted)
my($bogus_are_ignored) = 1;	# cleared by -a (treat non-numeric as zero)
my($save_data) = 0;		# set when median/ntiles need a saved copy of the data
my($do_median, $ntile);
my($opts) = DbGetopt->new("ac:f:mq:S?", \@ARGV);
while ($opts->getopt) {
    my($opt) = $opts->opt;
    if ($opt eq 'a') {
	$bogus_are_ignored = 0;
    } elsif ($opt eq 'c') {
	$conf_pct = $opts->optarg;
    } elsif ($opt eq 'f') {
	$format = $opts->optarg;
    } elsif ($opt eq 'm') {
	$do_median = 1;
	$save_data = 1;
    } elsif ($opt eq 'q') {
	$ntile = $opts->optarg;
	$save_data = 1;
    } elsif ($opt eq 'S') {
	$sorting_required = 0;
    } else {
	&usage;
    };
};

# Exactly one column name must remain on the command line.
&usage if (@ARGV != 1);
my($xfcol) = $ARGV[0];

# Read the jdb header and resolve the requested column to an index.
&readprocess_header;
die ("$prog: unknown column ``$xfcol''.\n")
    if (!defined($colnametonum{$xfcol}));
my($xf) = $colnametonum{$xfcol};

# Running accumulators for the O(1)-memory statistics.
my($n) = 0;		# count of numeric records
my($sx) = 0;		# sum of x
my($sxx) = 0;		# sum of x^2
my($minmaxinit) = 0;	# have $min/$max been seeded yet?

my($save_data_filename);
if ($save_data) {
    # Median/ntiles need the sorted data; spool it to a temporary file,
    # piping through dbsort unless the user promised pre-sorted input (-S).
    $save_data_filename = db_tmpfile(TMP);
    close TMP;
    my($sort_command) = ($sorting_required ? "|$dbbindir/dbsort -n data " : "");
    # Fixed: the old message always blamed dbsort, even when -S meant we
    # were opening the temporary file directly.
    open(SAVE_DATA, "$sort_command>$save_data_filename") ||
	die($sorting_required ?
	    "$prog: cannot run dbsort.\n" :
	    "$prog: cannot write $save_data_filename.\n");
    print SAVE_DATA "$col_headertag data\n";
}

#
# Read and process the data.
#
while (<STDIN>) {
    &delayed_pass_comments() && next;
    &split_cols;

    # Non-numeric values come back undef (or 0 under -a) from force_numeric.
    my($val) = &force_numeric($f[$xf], $bogus_are_ignored);
    next if (!defined($val));

    # Fold the value into the running sums (and the saved copy, if any).
    $n++;
    $sx += $val;
    $sxx += $val * $val;
    print SAVE_DATA "$val\n" if ($save_data);

    if ($minmaxinit) {
	$min = $val if ($val < $min);
	$max = $val if ($val > $max);
    } else {
	# The first numeric value seeds both extremes.
	$min = $max = $val;
	$minmaxinit = 1;
    };
};

# No numeric input at all is a fatal error.
# (Removed the unreachable "exit 1" that followed the die; die already
# terminates the program with a non-zero status.)
if ($n == 0) {
    die "$0: no input\n";
};

#
# Compute stats.
#
$mean = $sx / $n;
# $stddev is s (sample standard deviation), not s^2.
# Special cases:
#   $n <= 1            => would divide by zero; define stddev as 0
#   all values equal   => rounding error can make the computed
#                         variance slightly negative
# Beyond the all-equal guard, near-equal data can also produce a tiny
# negative variance through floating-point cancellation, so clamp it
# to zero before taking the square root.
if ($n <= 1 || $max == $min) {
    $stddev = 0;
} else {
    my($variance) = ($sxx - $n * $mean * $mean) / ($n - 1);
    $variance = 0 if ($variance < 0);	# guard against rounding error
    $stddev = sqrt($variance);
};
if ($mean == 0) {
	$pct_rsd = "na";	# relative stddev is undefined for a zero mean
} else {
	$pct_rsd = ($stddev / $mean) * 100;
};
#
# Confidence intervals from "Probability and Statistics for Engineers",
# Second Edition, Scheaffer and McClave, p. 242.
# half-width = t_{a/2,n-1} * s / sqrt(n)
#
if ($n <= 1) {
	$conf_half = 0;		# no interval from a single sample
} else {
	$conf_gamma = 1.0 - ((1.0 - $conf_pct) / 2.0);
	$conf_half = &DbTDistr::t_distr($n - 1, $conf_gamma) * $stddev / sqrt($n);
};
$conf_low = $mean - $conf_half;
$conf_high = $mean + $conf_half;

#
# Compute median/quartile.
#
# round_up: return the ceiling of $v (smallest integer >= $v).
# int() truncates toward zero, so adding 1 only when a positive
# fraction was dropped gives the ceiling for both signs.
sub round_up {
    my($v) = @_;
    my($truncated) = int($v);
    return $truncated + (($v > $truncated) ? 1 : 0);
}

# Compute the median and the requested ntile scores by streaming the
# (now sorted) saved copy of the data back in.
my($median, @q);
# With -m but no -q we still need a divisor; 2 makes the math work.
my($real_ntile) = ($ntile ? $ntile : 2);
if ($save_data && $n == 0) {
    # NOTE(review): this branch appears unreachable here because we die
    # above when $n == 0 — presumably kept defensively; confirm before
    # removing.
    $median = $mean;
    push(@q, ($mean) x $real_ntile);
} elsif ($save_data) {
    # Reopen the spooled data for reading.
    close SAVE_DATA;
    open (SAVE_DATA, "<$save_data_filename") || die "$prog: cannot open $save_data_filename.\n";
    # To handle the ugly case of having more ntiles than
    # data, we detect it and replicate the data until we have more
    # replicated_data than ntiles.
    my($replicate_data) = ($n >= $real_ntile+1) ? 1 : round_up(($real_ntile+1.0)/$n);
    my($replicated_n) = $n * $replicate_data;
    # Also note that the array of quartiles and the number of 
    # data elements read are both 1-based and not 0-based like
    # most perl stuff.  This is to make the math easier.
    $median_i = round_up($replicated_n / 2);	# 1-based index of the median
    $ntile_frac = ($replicated_n + 0.0) / ($real_ntile + 0.0);	# records per ntile
    my($x, $last_x, $next_q_i);
    @q = (0);   # note that q is primed with 0
    my($replicates_left) = 0;	# forces a read on the first iteration
    $x = <SAVE_DATA>;  # consume header
    die "$0: internal error." if ($x !~ /^#h/); #
    my($i);       # note that i counts from 1!
    # Walk the (replicated) sorted stream until all ntiles are filled.
    for ($i = 1; $#q+1 < $real_ntile; $i++) {
	if (--$replicates_left <= 0) {
	    # Time for a fresh value from the file.
	    $x = <SAVE_DATA>;
	    chomp $x;
	    $replicates_left = $replicate_data;
	    # Verify sorted order (in case the user lied to us
	    # about pre-sorting).
	    if (defined($last_x) && $x < $last_x) { 
		my($info) = ($sorting_required ? " (internal error in dbsort)" : " (user specified -S for pre-sorted data but it is unsorted)");
		die "$prog: cannot process data that is out of order between $last_x and $x $info.\n";
	    };
	    $last_x = $x;
	};
	# Capture the median and each ntile boundary as we pass it.
	if ($i == $median_i) { $median = $x; };
	$next_q_i = (round_up($ntile_frac * ($#q + 1.0) )) if (!defined($next_q_i));
#	print "d: q=$#q nq=$next_q_i i=$i\n";
	if ($i == $next_q_i) { push(@q, $x); $next_q_i = undef; };
    };
};

#
# Output the results.
#
# Build the header row: the fixed statistics, plus optional
# median and ntile columns.
my(@headers) = qw(mean stddev pct_rsd conf_range conf_low conf_high
		  conf_pct sum sum_squared min max n);
push(@headers, "median") if ($do_median);
push(@headers, map { "q$_" } (1..($ntile-1))) if ($ntile);
&write_header(@headers);

# Format each statistic with the output format, passing "na"
# (undefined pct_rsd) through untouched; n stays an exact integer.
my(@out) = map { ($_ eq 'na') ? $_ : (sprintf "$format", $_) }
    ($mean, $stddev, $pct_rsd, $conf_half, $conf_low, $conf_high,
     $conf_pct, $sx, $sxx, $min, $max);
push(@out, $n);
push(@out, $median) if ($do_median);
push(@out, @q[1..($ntile-1)]) if ($ntile);
&write_these_cols(@out);

# Trailing provenance comments: the command line we ran with,
# and a note about the confidence-interval assumptions.
&delayed_flush_comments();
print "#  | $prog " . join(" ", @orig_argv) . "\n" . 
	"#\t\t$conf_pct confidence intervals assume normal distribution and small n.\n";

exit 0;

# Suppress "used only once" warnings from perl -w:
# touch variables that are set or read only by the dblib.pl helpers.
{
    my($dummy) = $f[0];
    $dummy = $default_format;
    $dummy = $col_headertag;
    $dummy = <TMP>;
}
