#!/usr/local/bin/perl -w

#
# db2dcliff
# Copyright (C) 1997-1998 by John Heidemann <johnh@isi.edu>
# $Id: db2dcliff,v 1.13 1999/11/23 02:04:29 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
	print STDERR <<END;
usage: $0 x_column y_column

Find ``cliffs'' in two-dimensional data.

A graph may have a shape like:

400 |   ****
300 |  *
200 |  *
100 |**
    +----------
     1234567

There's a ``cliff'' at x=3.
Mathatmetically, a cliff is the segment between two points where
the second derivative changes sign.
This program automatically finds these cliffs in potentially noisy,
discrete data.

Cliffs are found inspite of data noise by tuning the minimal
thresholds for cliff slope and height and taking the largest cliffs
first.

Options:
    -n N    Output only the top N cliffs, ranked by cliff height.
	    (Default: output all cliffs.)
    -s T    Set the threshold for minimal cliff slope a cliff to T.
	    (Default: 20.)
    -y T    Set the threshold for minimal height to T faction of the y range.
	    (Default: 0.05.)

Sample input:
#h x y
1 100
2 100
3.4 200
3.6 300
4 400
5 400
6 400
7 400


Sample command:
cat data.jdb | db2dcliff x y


Sample output:
#h startx starty endx endy slope prangey
1       100     4       300     66.6666666666667        0.666666666666667
#  | /home/johnh/BIN/DB/db2dcliff x y


Bugs:

Currently we require (and verify) that the data be monotonically
increasing in both axes.  It should be possible to relax this
restriction for the y values.

The algorithm requires O(n) memory, where n is the number of input lines.

END
	exit 1;
}
#'

BEGIN {
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
use DbGetopt;
require "$dblibdir/dblib.pl";

my(@orig_argv) = @ARGV;
my($prog) = &progname;
$top_n = undef;
$slope_threshold = 20;
$p_y_threshold = 0.05;
my($dbopts) = new DbGetopt("n:s:y:?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'n') {
	$top_n = $dbopts->optarg;
    } elsif ($ch eq 's') {
	$slope_threshold = $dbopts->optarg;
    } elsif ($ch eq 'y') {
	$p_y_threshold = $dbopts->optarg;
    } else {
	&usage;
    };
};
die("$prog: -n N must be positive.\n") if (defined($top_n) && $top_n < 0);
die("$prog: -p T nonsensical threshold.\n") if ($p_y_threshold < 0 ||
					    $p_y_threshold > 1);

&usage if ($#ARGV != 1);
my($x_col_name, $y_col_name) = @ARGV;

&readprocess_header;
die ("$prog: unknown column name $x_col_name.\n") if (!defined($colnametonum{$x_col_name}));
my($x_col) = $colnametonum{$x_col_name};
die ("$prog: unknown column name $y_col_name.\n") if (!defined($colnametonum{$y_col_name}));
my($y_col) = $colnametonum{$y_col_name};

my($infty) = 1e+50;

my($n) = 0;
my($STARTX, $STARTY, $ENDX, $ENDY, $SLOPE, $RANGEX, $RANGEY,
    $PSTARTX, $PSTARTY, $PENDX, $PENDY, $PRANGEX, $PRANGEY) = (0..20);
my($lastx, $lasty, $thisx, $thisy);
my(@d);

sub points_to_segment {
    my($lastx, $lasty, $thisx, $thisy) = @_;
    my(@a);
    @a[$STARTX, $STARTY, $ENDX, $ENDY] = ($lastx, $lasty, $thisx, $thisy);
    @a[$RANGEX, $RANGEY] = ($thisx - $lastx, $thisy - $lasty);
    $a[$SLOPE] = ($a[$RANGEX] == 0) ? $infty : ($a[$RANGEY]  / $a[$RANGEX]);
    return \@a;
}

sub scale {
    my($x, $min, $range) = @_;
    return ($x - $min) / $range;
}

#
# get all the data
#
@f = ();
while (<STDIN>) {
    &delayed_pass_comments && next;
    &split_cols;

    ($thisx, $thisy) = @f[$x_col, $y_col];
    if (defined($lastx) && defined($lasty)) {
	$d[$n] = points_to_segment($lastx, $lasty, $thisx, $thisy);
	die("$prog only supports monotonic data currently.\n")
	    if ($lastx > $thisx || $lasty > $thisy);
	$n++;
    };

    ($lastx, $lasty) = ($thisx, $thisy);
};
if ($n == 0) {
    print "$prog: no input\n";
    exit 1;
};

#
# normalize it
#
my($minx, $miny, $maxx, $maxy) = ($d[0][$STARTX], $d[0][$STARTY],
				    $d[$#d][$ENDX], $d[$#d][$ENDY]);
my($rangex, $rangey) = ($maxx - $minx, $maxy - $miny);
my($i);
sub scale_segment {
    my($seg) = @_;
    $seg->[$PSTARTX] = scale($seg->[$STARTX], $minx, $rangex);
    $seg->[$PSTARTY] = scale($seg->[$STARTY], $miny, $rangey);
    $seg->[$PENDX] = scale($seg->[$ENDX], $minx, $rangex);
    $seg->[$PENDY] = scale($seg->[$ENDY], $miny, $rangey);
    $seg->[$PRANGEX] = $seg->[$PENDX] - $seg->[$PSTARTX];
    $seg->[$PRANGEY] = $seg->[$PENDY] - $seg->[$PSTARTY];
}
for ($i = 0; $i < $#d; $i++) {
    scale_segment($d[$i]);
};

#
# merge neighbors
#
$starti = $endi = 0;
my(@e);
for ($i = 0; $i <= $#d; $i++) {
    if ($d[$i][$SLOPE] > $slope_threshold) {
	# in run
	next;
    } else {
	# end of run
	$endi = $i - 1;  $endi = $starti if ($endi < $starti);
	if ($endi > $starti && $d[$i][$PENDY] - $d[$starti][$PSTARTY] > $p_y_threshold) {
	    push(@e, points_to_segment($d[$starti][$STARTX],
					$d[$starti][$STARTY],
					$d[$endi][$ENDX],
					$d[$endi][$STARTY]));
            scale_segment($e[$#e]);
	};
	$starti = $i;
    };
}

#
# pick out the top n, if necessary
#
if (defined($top_n)) {
    my(@ei) = (0..$#e);
    # Sort them based on who covers the most area (descending).
    my(@sorted_ei) = sort { $e[$b][$PRANGEY] <=> $e[$a][$PRANGEY] } @ei;
    # The prior line doesn't work if we replace @ei with (0..$#e).
    # Pick the top n.
    # my(@raw_fi) = @sorted_ei[0..($top_n - 1)];
    # Put them back in order.
    my($last_ei) = $top_n - 1;  $last_ei = $#ei if ($last_ei > $#ei);
    my(@fi) = sort { $a <=> $b } @sorted_ei[0..$last_ei];
    # Extract them
    @f = @e[@fi];   # Perl rules!
} else {
    # Keep them all.
    @f = @e;
};


#
# dump the data
#
&write_header(qw(startx starty endx endy slope prangey));
for ($i = 0; $i <= $#f; $i++) {
#    print "$f[$i][$STARTX]\t$f[$i][$STARTY]\t$f[$i][$PSTARTX]\t$f[$i][$PSTARTY]\t$f[$i][$SLOPE]\t" . ($f[$i][$SLOPE] > 20) . "\n";
#    print "$f[$i][$STARTX]\t$f[$i][$STARTY]\t$f[$i][$SLOPE]\t" . ($f[$i][$SLOPE] > 20) . "\n";
    &write_these_cols($f[$i][$STARTX], $f[$i][$STARTY], $f[$i][$ENDX], $f[$i][$ENDY], $f[$i][$SLOPE], $f[$i][$PRANGEY]);
};

&delayed_flush_comments;
print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;
