#!/usr/bin/perl

#
# dbcolpercentile
# Copyright (C) 1997-1998 by John Heidemann <johnh@isi.edu>
# $Id: dbcolpercentile,v 1.12 1999/11/23 02:04:29 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
	print STDERR <<END;
usage: $0 [-rRS] column

Sort and compute a percentile of a column of numbers.
The new column will be called percentile, rank, or RANK.
Non-numeric records are handled as in other programs.

This program requires buffering a copy of all data in memory.

Options:
    -S indicates that the data is already sorted
    -r rank mode:  show rank rather than percential (lower rank is best)
    -R RANK mode:  show rank rather than percential (HIGHER rank is best)
    -a	    compute stats over all records (treat non-numeric records
		as zero rather than just ignoring them)

Sample input:
#h name id test1
a 1 80
b 2 70
c 3 65
d 4 90
e 5 70
f 6 90

Sample command:
cat DATA/grades.jdb | dbcolpercentile test1

Sample output:
#h      name    id      test1   percentile
c 3 65  0.166666666666667
b 2 70  0.5
e 5 70  0.5
a 1 80  0.666666666666667
d 4 90  1
f 6 90  1
#  | /home/johnh/BIN/DB/dbsort -n test1
#  | ./dbpercentile test1

(With -r or -R the new column will be called rank or RANK.)

Related programs:
dbcolhisto:   show a histogram of the data
END
    exit 1;
}

BEGIN {
    $dbbindir = "/usr/local/bin";
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
use DbGetopt;
require "$dblibdir/dblib.pl";

my(@orig_argv) = @ARGV;
my($prog) = &progname;
my($sorting_required) = 1;
my($RANK_PERCENTILE, $RANK_LOW_BEST, $RANK_HIGH_BEST) = (0..10);
my(@new_col_name) = qw(percentile rank RANK);
my($rank_mode) = $RANK_PERCENTILE;
my($bogus_are_ignored) = 1;
my($dbopts) = new DbGetopt("arRS?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'S') {
	$sorting_required = 0;
    } elsif ($ch eq 'a') {
	$bogus_are_ignored = 0;
    } elsif ($ch eq 'r') {
	$rank_mode = $RANK_LOW_BEST;
    } elsif ($ch eq 'R') {
	$rank_mode = $RANK_HIGH_BEST;
    } else {
	&usage;
    };
};

&usage if ($#ARGV != 0);

#
# Handle sorting, if necessary.
#
if ($sorting_required) {
    open(SORTER, "$dbbindir/dbsort -n $ARGV[0] |") || die("$prog: cannot run dbsort.\n");
    open(STDIN, "<&SORTER") || die("$prog: cannot dup SORTER.\n");
};

&readprocess_header;
$xf = $colnametonum{$ARGV[0]};
die ("$prog: unknown column name ``$ARGV[0]''.\n") if ($xf == undef);

#
# new columns
#
$percentile_f = &col_create($new_col_name[$rank_mode]);
&write_header();

my(@data) = ();
my(@scores) = ();
my($comments) = "";
my($n) = 0;
my($last) = undef;
my($run) = 0;

#
# Figure rankings.
#
while (<STDIN>) {
    if (&is_comment) {
        $comments .= $_;
	next;
    };
    &split_cols;

    # save it
    chomp;
    push(@data, $_);

    $x = &force_numeric($f[$xf], $bogus_are_ignored);

    # duplicate?
    if (!defined($last)) {   # first time
	$run++;
	$last = $x;
	next;
    } elsif ($x == $last) {  # in run
	$run++;
	next;
    } else {   # end of run
	$n += $run;
	push(@scores, ($n) x $run);
	$run = 1;
	$last = $x;
    };
};
# Handle final run.
if ($run) {
    $n += $run;
    push(@scores, ($n) x $run);
};
die("$prog: internal error.\n") if ($#data != $#scores);


#
# If necessary, go back and make them percentiles.
#
die ("$prog: no input.\n") if ($n == 0);
my($i);
if ($rank_mode == $RANK_PERCENTILE) {
    for ($i = 0; $i <= $#scores; $i++) {
	$scores[$i] /= ($n + 0.0);
    };
} elsif ($rank_mode == $RANK_LOW_BEST) {
    for ($i = 0; $i <= $#scores; $i++) {
	$scores[$i] = 1 + $n - $scores[$i];
    };
};

#
# Output.
#
for ($i = 0; $i <= $#data; $i++) {
    print $data[$i] . $outfs . $scores[$i] . "\n";
};  

print $comments;
print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;

