#!/usr/bin/perl -w

#
# dbcolhisto
# Copyright (C) 1997-1998 by John Heidemann <johnh@isi.edu>
# $Id: dbcolhisto,v 1.20 1999/11/23 02:04:29 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
	print STDERR <<END;
usage: $0 [-ag] [-w BucketWidth] [-s BucketStart] [-e BucketEnd] [-n NumberOfBuckets] column

Compute a histogram over a column of data.
Records containing non-numeric data are considered null
do not contribute to the stats (optionally they are treated as zeros).

Options:
    -w n   specify the width of each bucket
    -s n   specify the start of the first bucket
    -e n   specify the start of the last bucket
    -n n   specify the number of buckets
    -g     graphical version (default is numeric)
    -I     last bucket is not inclusive
    -a	    compute stats over all records (treat non-numeric records
		as zero rather than just ignoring them)

Defaults to 10 buckets over the exact range of data.
Up to three parameters can be specified, the rest default accordingly.

Buckets range from a value (given the the low column) to just below
the next low value and buckets are equal width.
If necessary, "<min" and ">max" buckets are created.
By default, the last bucket includes max (and is thus infintimessally 
larger than the other buckets).  This irregularity can be removed
with the -I option.

Sample input:
#h name id test1
a 1 80
b 2 70
c 3 65
d 4 90
e 5 70
f 6 90

Sample command:
cat DATA/grades.jdb | dbcolhisto -s 0 -e 100 -n 10 test1

Sample output:
#h low histogram
0       0
10      0
20      0
30      0
40      0
50      0
60      1
70      2
80      1
90      2
#  | ./dbcolhisto -s 0 -e 100 -n 10 test1

Related programs:
dbcolpercentile

This program requires constant memory and O(size of data) temporary
disk space.

END
	exit 1;
}

BEGIN {
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
use DbGetopt;
require "$dblibdir/dblib.pl";

my(@orig_argv) = @ARGV;
my($prog) = &progname;
my($bucket_width, $bucket_start, $bucket_end, $bucket_count, $graphical_output);
my($last_inclusive) = 1;
my($bogus_are_ignored) = 1;
my($dbopts) = new DbGetopt("ae:gIn:s:w:?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'e') {
	$bucket_end = $dbopts->optarg;
    } elsif ($ch eq 'n') {
	$bucket_count = $dbopts->optarg;
    } elsif ($ch eq 'a') {
	$bogus_are_ignored = 0;
    } elsif ($ch eq 's') {
	$bucket_start = $dbopts->optarg;
    } elsif ($ch eq 'w') {
	$bucket_width = $dbopts->optarg;
    } elsif ($ch eq 'g') {
	$graphical_output = 1;
    } elsif ($ch eq 'I') {
	$last_inclusive = 0;
    } else {
	&usage;
    };
};

&usage if ($#ARGV != 0);

&readprocess_header;
my($xfcol) = $ARGV[0];
my($xf) = $colnametonum{$xfcol};
die ("$prog: unknown column name ``$xfcol''.\n") if (!defined($xf));

my($min, $max);
my($n) = 0;
my($save_data_filename);
$save_data_filename = db_tmpfile(SAVE_DATA);
print SAVE_DATA "$col_headertag data\n";

while (<STDIN>) {
    &delayed_pass_comments && next;
    &split_cols;

    $x = &force_numeric($f[$xf], $bogus_are_ignored);
    next if (!defined($x));

    print SAVE_DATA "$x\n";   # sigh, back to a string
    $min = $x if (!defined($min) || $x < $min);
    $max = $x if (!defined($max) || $x > $max);
    $n++;
};

if ($n == 0) {
    print STDERR "$prog: histogram impossible with no input\n";
    exit 1;
} elsif ($n == 1) {
    print STDERR "$prog: histogram impossible with singleton input\n";
    exit 1;
};

#
# Figure out bucket parameters.
# Yuck.  Constraint solving in Perl.
#
my($n_defined) = 
    (defined($bucket_start) ? 1 : 0) +
    (defined($bucket_end) ? 1 : 0) +
    (defined($bucket_width) ? 1 : 0) +
    (defined($bucket_count) ? 1 : 0);
if ($n_defined >= 4) {
    die "$prog: parameters over-specified.\n";
} elsif ($n_defined == 3) {
    # fall through, clean up handles it.
} elsif ($n_defined == 2) {
    if (defined($bucket_start) && defined($bucket_end)) {
	$bucket_count = 10;
    } elsif (defined($bucket_start) && defined($bucket_width)) {
	$bucket_count = 10;
    } elsif (defined($bucket_start) && defined($bucket_count)) {
	$bucket_end = $max;
    } elsif (defined($bucket_end) && defined($bucket_width)) {
	$bucket_count = 10;
    } elsif (defined($bucket_end) && defined($bucket_count)) {
	$bucket_start = $min;
    } elsif (defined($bucket_width) && defined($bucket_count)) {
	my($mid) = ($max - $min) / 2 + $min;
	$bucket_start = $mid - $bucket_width * $bucket_count / 2;
    };
    # Figure the rest out below.
} elsif ($n_defined == 1) {
    if (defined($bucket_start)) {
	$bucket_end = $max;
	$bucket_count = 10;
    } elsif (defined($bucket_end)) {
	$bucket_start = $min;
	$bucket_count = 10;
    } elsif (defined($bucket_width) || defined($bucket_count)) {
	$bucket_start = $min;
	$bucket_end = $max;
    };
} elsif ($n_defined < 1) {
    $bucket_start = $min;
    $bucket_end = $max;
    $bucket_count = 10;
};
# clean up
$bucket_start = $bucket_end - $bucket_width * $bucket_count
    if (!defined($bucket_start));
$bucket_end = $bucket_start + $bucket_width * $bucket_count
    if (!defined($bucket_end));
$bucket_width = ($bucket_end - $bucket_start) / $bucket_count
    if (!defined($bucket_width));
$bucket_count = ($bucket_end - $bucket_start) / $bucket_width
    if (!defined($bucket_count));
$bucket_width += 0.0;

#
# Compute the histogram.
#
my(@buckets) = (0) x $bucket_count;
my($low_bucket, $high_bucket) = (0, 0);
close SAVE_DATA;
open (SAVE_DATA, "<$save_data_filename") || die "$prog: cannot open $save_data_filename\n";
my($header) = scalar(<SAVE_DATA>);
while (<SAVE_DATA>) {
    chomp;
    $_ += 0.0;
    my($b) = ($_ - $bucket_start) / ($bucket_width);
    if ($b < 0) {
	$low_bucket++;
    } elsif ($b >= $bucket_count) {
	if (($_ == $high_bucket || $b == $bucket_count) && $last_inclusive) {
	    $buckets[$bucket_count]++;
	} else {
	    $high_bucket++;
	};
    } else {
	$buckets[int($b)]++;
    };
}
close SAVE_DATA;

#
# Display the histogram
#
sub format {
    my($n) = @_;
    return ($graphical_output ? ("*" x ($n)) : $n);
}
&write_header(qw(low histogram));
my($break) = $bucket_start;
print "<$break\t" . &format($low_bucket) . "\n" if ($low_bucket);
for ($i = 0; $i <= $#buckets; $i++) {
    &write_these_cols($break, &format($buckets[$i]));
    $break += $bucket_width;
};
&write_these_cols(">=$break\t", &format($high_bucket)) if ($high_bucket);

&delayed_flush_comments;
print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;

if (0) {
    my $x;
    $x = $col_headertag = $f[0] = $colnametonum{'foo'};
}
