#!/usr/bin/perl

#
# dbcolscorrelate
# Copyright (C) 1998-2001 by John Heidemann <johnh@isi.edu>
# $Id: dbcolscorrelate,v 1.6 2001/05/25 21:05:19 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
	print STDERR <<END;
usage: $0 column1 column2 [column3...]

Compute the coefficient of correlation over two (or more) columns.
With more than two columns, correlations are names as a combination of
each two column names.

This program requires M+1 passes over the data where M is the number
of columns, but only O(M) memory.

Sample input:
#h name id test1 test2
a 1 80 81
b 2 70 71
c 3 65 66
d 4 90 91
e 5 70 71
f 6 90 91

Sample command:
cat DATA/more_grades.jdb | dbcolscorrelate test1 test2

Sample output:
#h correlation
0.83329
#  | dbcolscorrelate test1 test2

Related programs:
END
    exit 1;
}

BEGIN {
    $dbbindir = "/usr/local/bin";
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
require "$dblibdir/dblib.pl";
use DbGetopt;

my(@orig_argv) = @ARGV;
my($prog) = &progname;
my($bogus_are_ignored) = 1;
my($dbopts) = new DbGetopt("a?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'a') {
	$bogus_are_ignored = 0;
    } else {
	&usage;
    };
};

&usage if ($#ARGV < 1);
my(@xfcols) = @ARGV;
my(@xfs);

#
# Shunt the data to a separate file.
#
my($tmp) = db_tmpfile(TMP);
my($header_line) = 1;
while (<STDIN>) {
    if ($header_line) {
	&process_header($_);
	my($xfcol);
	foreach $xfcol (@xfcols) {
	    die ("$prog: unknown column ``$xfcol''.\n")
	        if (!defined($colnametonum{$xfcol}));
	    push(@xfs, $colnametonum{$xfcol});
        };
	$header_line = undef;
    };
    print TMP $_;
};
close TMP;
close STDIN;

#
# Figure out stats on each column.
#
my($stats) = [];
my($i, $j);
foreach $i (0..$#xfcols) {
    my($xfcol) = $xfcols[$i];
    my($xf) = $xfs[$i];
    # NEEDSWORK: should pass $bogus_are_ignored
    open(FROMSTATS, "$dbbindir/dbstats $xfcol <$tmp |") || die "$prog: cannot run dbstats\n";
    my($header) = scalar(<FROMSTATS>);
    $header =~ /^$headertag_regexp/ or die "$prog: dbstats returns bogus header.\n";
    my(@statsnames) = split(/$header_fsre/, $header);
    shift @statsnames;
    shift @statsnames while ($statsnames[0] =~ /^\-/);
    my($stats_line) = scalar(<FROMSTATS>);
    my(@stats) = split(/$fsre/, $stats_line);
    foreach (0..$#stats) {
        $stats->[$i]{$statsnames[$_]} = $stats[$_];
    };
    close FROMSTATS;
};

# reopen the file for our work.
open(STDIN, "<$tmp") || die "$prog: cannot reopen $tmp.\n";

&readprocess_header;

#
# Figure correlation.
#
my($sum_zs) = [];
my($n) = [];
my($names) = [];
foreach $i (0..$#xfs) {
    foreach $j (0..$#xfs) {
        next if ($i >= $j);
	$sum_zs->[$i][$j] = 0;
	$n->[$i][$j] = 0;
	$names->[$i][$j] = ($#xfs == 1) ? "correlation" : "$xfcols[$i]_$xfcols[$j]";
    };
};

while (<STDIN>) {
    &delayed_pass_comments() && next;
    &split_cols;

    # figure z scores
    my(@xs, @zs);
    foreach $i (0..$#xfs) {
	my($x) = &force_numeric($f[$xfs[$i]], $bogus_are_ignored);
	push(@xs, $x);
	my($z) = defined($x) ? ($x - $stats->[$i]{'mean'}) / $stats->[$i]{'stddev'} : undef;
	push(@zs, $z);
    };

    # figure correlation sums
    foreach $i (0..$#xfs) {
	foreach $j (0..$#xfs) {
	    next if ($i >= $j);
	    next if (!defined($zs[$i]) || !defined($zs[$j]));
	    $sum_zs->[$i][$j] += $zs[$i] * $zs[$j];
	    ($n->[$i][$j])++;
	};
    };
};

#
# Output the results.
#
my(@names);
my(@correlations);
foreach $i (0..$#xfs) {
    foreach $j (0..$#xfs) {
        next if ($i >= $j);
	push(@names, $names->[$i][$j]);
	push(@correlations, $n->[$i][$j] == 0 ? "-" :
				sprintf("$default_format",
					$sum_zs->[$i][$j] / $n->[$i][$j]));
    };
};
&write_header(@names);
&write_these_cols(@correlations);

&delayed_flush_comments();
print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;

# error supression
{
    my($dummy) = $default_format;
}
