#!/usr/bin/perl

#
# dbcolstats
# Copyright (C) 1997-1998 by John Heidemann <johnh@isi.edu>
# $Id: dbcolstats,v 1.13 1999/11/23 02:04:29 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
	print STDERR <<END;
usage: $0 [-t mean,sd] column

Compute statistics (z-score and optionally t-score) over a column of
numbers.  New columns are "zscore", "tscore".

This program requires two passes over the data.

Options:
    -t mean,sd	    generate t-scores with a given mean and standard deviation

Sample input:
#h name id test1
a 1 80
b 2 70
c 3 65
d 4 90
e 5 70
f 6 90

Sample command:
cat DATA/grades.jdb | dbcolstats -t 50,10 test1 | dbcolneaten

Sample output:
#h name id test1 zscore   tscore 
a       1  80    0.23063  52.306 
b       2  70    -0.69188 43.081 
c       3  65    -1.1531  38.469 
d       4  90    1.1531   61.531 
e       5  70    -0.69188 43.081 
f       6  90    1.1531   61.531 
#  | dbcolstats -t 50,10 test1
#  | dbcolneaten 

Related programs:
dbcolpercentile:   show a histogram of the data
dbstats: stats that compress a column of data
END
    exit 1;
}

BEGIN {
    $dbbindir = "/usr/local/bin";
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
use DbGetopt;
require "$dblibdir/dblib.pl";

my(@orig_argv) = @ARGV;
my($prog) = &progname;
my($do_tscores) = undef;
my($bogus_are_ignored) = 1;
my($t_mean, $t_sd);
my($dbopts) = new DbGetopt("t:?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 't') {
	$do_tscores = 1;
	($t_mean, $t_sd) = split(/,/, $dbopts->optarg);
	die "$0: option ``-t mean,sd'' incorrectly specified.\n"
	    if (!defined($t_sd));
    } else {
	&usage;
    };
};

&usage if ($#ARGV != 0);
my($xfcol) = $ARGV[0];

#
# Shunt the data to a separate file.
#
my($tmp) = db_tmpfile(TMP);
my($header_line) = 1;
while (<STDIN>) {
    if ($header_line) {
	&process_header($_);
	die ("$prog: unknown column ``$xfcol''.\n")
	    if (!defined($colnametonum{$xfcol}));
	my($xf) = $colnametonum{$xfcol};
	$header_line = undef;
    };
    print TMP $_;
};
close TMP;
close STDIN;

#
# Figure out stats on the file.
#
open(FROMSTATS, "$dbbindir/dbstats $xfcol <$tmp |") || die "$prog: cannot run dbstats\n";
my($header) = scalar(<FROMSTATS>);
$header =~ /^$headertag_regexp/ or die "$prog: dbstats returns bogus header.\n";
my(@statsnames) = split(/$header_fsre/, $header);
shift @statsnames;
shift @statsnames while ($statsnames[0] =~ /^\-/);
my($stats) = scalar(<FROMSTATS>);
my(@stats) = split(/$fsre/, $stats);
my(%stats);
foreach (0..$#stats) {
    $stats{$statsnames[$_]} = $stats[$_];
};
close FROMSTATS;

# reopen the file for our work.
open(STDIN, "<$tmp") || die "$prog: cannot reopen $tmp.\n";

&readprocess_header;
$xf = $colnametonum{$xfcol};
die ("$prog: unknown column name ``$xfcol''.\n") if ($xf == undef);

#
# new columns
#
my($zscore_f) = &col_create('zscore');
my($tscore_f) = &col_create('tscore') if ($do_tscores);
&write_header();

#
# Just showing off.
#
$code = <<END;
END

#
# Figure colstats.
#
while (<STDIN>) {
    &pass_comments && next;
    &split_cols;

    $x = &force_numeric($f[$xf], $bogus_are_ignored);

    if (!defined($x)) {
	$f[$zscore_f] = "-";
	$f[$tscore_f] = "-" if ($do_tscores);
    } else {
	$f[$zscore_f] = sprintf("$default_format", ($x - $stats{'mean'}) / $stats{'stddev'});
	$f[$tscore_f] = sprintf("$default_format", $f[$zscore_f] * $t_sd + $t_mean) if ($do_tscores);
    };
    
    &write_cols;
};

print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;

# error supression
{
    my($dummy) = $default_format;
}
