#!/usr/local/bin/perl -w

#
# dbjoin
# Copyright (C) 1991-1998 by John Heidemann <johnh@isi.edu>
# $Id: dbjoin,v 1.19 1999/11/23 02:04:29 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
    print <<END;
usage: $0 [-Sid] table1 table2 [-nNrR] column [column...]
    	Does a natural join on table1 and table2 around
	the specified columns.

	By default, data will be sorted lexically.
	If data is already sorted, dbjoin will run more efficiently
	with the -S option.

	Because two tables are required,
	input is typically in files.
	Standard input is accessible by the file "-".

Options:
    -i  include non-matches (each record which doesn't match at
		all will appear once)
    -S  assume data is already sorted
    -d  debug mode
    -e EmptyValue  specify what to use for non-existent fields (with -i)

These options are positional,
they must come after table1 and table2
and can appear multiple times for different columns:

	-r sort in reverse order (high to low)
	-R sort in normal order (low to high)
	-n sort numerically
	-N sort lexicographically

Sample input (dbsort -n cid <DATA/reg):
#h sid cid
1 10
2 11
1 12
2 12

Sample input (DATA/classes):
#h cid cname
10 pascal
11 numanal
12 os

Command:
cat DATA/reg.jdb | dbsort -n cid | dbjoin - DATA/classes -n cid

Sample output:
#h      cid     sid     cname
10      1       pascal
11      2       numanal
12      1       os
12      2       os
# - COMMENTS:
#  | /home/johnh/BIN/DB/dbsort -n cid
# DATA/classes COMMENTS:
# joined comments:
#  | /home/johnh/BIN/DB/dbjoin - DATA/classes cid

END
# '
    exit 1;
}

BEGIN {
    $dbbindir = "/usr/local/bin";
    $dblibdir = "/usr/local/lib/jdb";
    push(@INC, $dblibdir);
}
require "$dblibdir/dblib.pl";
use DbGetopt;

my(@orig_argv) = @ARGV;
my($prog) = &progname;
my($sorting_required) = 1;
my($debug) = 0;
my($non_match_inclusion) = 0;
my($null_value) = undef;

my($dbopts) = new DbGetopt("e:diS?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'S') {
	$sorting_required = 0;
    } elsif ($ch eq 'd') {
	$debug++;
    } elsif ($ch eq 'i') {
	$non_match_inclusion++;
    } elsif ($ch eq 'e') {
	$null_value = $dbopts->optarg;
    } else {
	&usage;
    };
};
&usage if ($#ARGV < 2);

my($aname, $bname) = @ARGV;
shift; shift;
my($join_columns) = join(" ", @ARGV);

if ($sorting_required) {
    open(A, "cat $aname | $dbbindir/dbsort $join_columns |") || die("$prog: cannot run dbsort over ``$aname''.\n");
} else {
    open(A, "<$aname") || die("$prog: cannot open ``$aname''.\n");
};
my($aheader, @arawdata, @acomments, @adata, $a_fs_code, @acoloptions, @acolnames, @asubcolnames, %acolnametonum);
$aheader = <A>;
@arawdata = <A>;
@acomments = grep(/^#/, @arawdata);
@adata = grep(!/^#/, @arawdata);
chomp(@adata);
undef @arawdata;
close (A);
&process_header($aheader);
$a_fs_code = $fs_code;
@acoloptions = @coloptions;
@acolnames = @colnames;
@asubcolnames = @acolnames;
%acolnametonum = %colnametonum;
@colnames = ();
%colnametonum = ();


if ($sorting_required) {
    open(B, "cat $bname | $dbbindir/dbsort $join_columns |") || die("$prog: cannot run dbsort over ``$bname''.\n");
} else {
    open(B, "<$bname") || die("$prog: cannot open ``$bname''.\n");
};
my($bheader, @brawdata, @bcomments, @bdata, $b_fs_code, @bcoloptions, @bcolnames, @bsubcolnames, %bcolnametonum);
$bheader = <B>;
@brawdata = <B>;
@bcomments = grep(/^#/, @brawdata);
@bdata = grep(!/^#/, @brawdata);
chomp(@bdata);
undef @brawdata;
close (B);
&process_header($bheader);
$b_fs_code = $fs_code;
@bcoloptions = @coloptions;
@bcolnames = @colnames;
@bsubcolnames = @bcolnames;
%bcolnametonum = %colnametonum;
# reset
@colnames = ();
%colnametonum = ();

die "$prog: cannot handle input data with different field separators.\n"
    if ($a_fs_code ne $b_fs_code);
@coloptions = @acoloptions;   # NEEDSWORk

#
# figure the joined columns
#
$i = 0;
for $key (@ARGV) {
    next if ($key =~ /^-/);   # we deal with this later
    die("$prog: column ``$key'' is not in table ``$aname''.\n") if (!defined($acolnametonum{$key}));
    die("$prog: column ``$key'' is not in table ``$bname''.\n") if (!defined($bcolnametonum{$key}));
    $colnames[$i] = $key;
    &col_mapping ($key, $i);
    $coltoa[$i] = $acolnametonum{$key};
    $coltob[$i] = $bcolnametonum{$key};
    $colina[$i] = 1;
    $colinb[$i] = 1;
    $i++;
}

#
# and the rest
#
foreach $key (@acolnames) {
    next if (defined($colnametonum{$key}));
    $colnames[$i] = $key;
    &col_mapping ($key, $i);
    $coltoa[$i] = $acolnametonum{$key};
    $colina[$i] = 1;
    $i++;
}
foreach $key (@bcolnames) {
    next if (defined($colnametonum{$key}));
    $colnames[$i] = $key;
    &col_mapping ($key, $i);
    $coltob[$i] = $bcolnametonum{$key};
    $colinb[$i] = 1;
    $i++;
}

#
# Figure the comparison code.
#
sub join_row_col_ab_fn {
    my($row, $colname, $n) = @_;
    return '$' . $row . 'f[' . ($row eq 'a' ? $coltoa[$n] : $coltob[$n]) . ']';   # '
}
my($ab_compare_code) = &generate_compare_code('ab_custom_compare', 'join_row_col_ab_fn', @ARGV);

sub join_row_col_aolda_fn {
    my($row, $colname, $n) = @_;
    return '$' . ($row eq 'a' ? 'a' : 'olda') . 'f[' . $coltoa[$n] . ']';   # '
}
my($aolda_compare_code) = &generate_compare_code('aolda_custom_compare', 'join_row_col_aolda_fn', @ARGV);

sub join_row_col_boldb_fn {
    my($row, $colname, $n) = @_;
    return '$' . ($row eq 'a' ? 'b' : 'oldb') . 'f[' . $coltob[$n] . ']';   # '
}
my($boldb_compare_code) = &generate_compare_code('boldb_custom_compare', 'join_row_col_boldb_fn', @ARGV);


&write_header();

#
# join the data (assumes data already sorted)
#
my($oldai, $oldbi, $ai, $bi, $firstmatchingbi, $lastmatchingbi, $inmatch);
$oldai = $oldbi = -1;   # keep track of the last entry to check for sortedness
$ai = $bi = 0;
$firstmatchingbi = $lastmatchingbi = -1;
    # $firstmatchingbi keeps track of the head of a run of bi's that match
    # $lastmatchingbi is different, it's the highestbi that's ever matched
$inmatch = 0;

sub save_ai {
    my($newval) = @_;
    @oldaf = @af;
    $oldai = $ai;
    $ai = $newval;
    @af = ($ai > $#adata ? () : split(/$fsre/, $adata[$ai]));
}

sub save_bi {
    my($newval) = @_;
    @oldbf = @bf;
    $oldbi = $bi;
    $bi = $newval;
    @bf = ($bi > $#bdata ? () : split(/$fsre/, $bdata[$bi]));
}

# loop through all a
&save_ai(0);
&save_bi(0);
while ($ai <= $#adata) {
    #
    # Get the two rows for comparision.
    #
    if ($bi > $#bdata) {
    	$result = -1;
    } else {
	# init sort checking
	if ($oldai == -1) {
	    @oldaf = @af;
	    $oldai = $ai;
	};
	if ($oldbi == -1) {
	    @oldbf = @bf;
	    $oldbi = $bi;
	};

	# check for sortedness
	print "# $ai($oldai) $bi($oldbi)\n" if ($debug);
	if ($oldai < $ai) {
	    die("$prog: table ``$aname'' is out of order (between data lines $oldai and $ai).\n")
	        if (&aolda_custom_compare() < 0);
	};
	if ($oldbi < $bi) {
	    die("$prog: table ``$bname'' is out of order (between data lines $oldbi and $bi).\n")
	        if (&boldb_custom_compare() < 0);
	};

	$result = &ab_custom_compare();
	# Old way:
	# $result2 = &compare();
	# print "$result\t$result2\n";
    };
    if ($result != 0 && $inmatch) {
    	$inmatch = 0;
	# $ai has matched before, so don't need to check non-match inclusion
	&save_ai($ai+1);
	&save_bi($firstmatchingbi);
	next;
    };
    if ($result < 0) {
	if ($non_match_inclusion) {
	    &generate_a_only_data();
	    print "# a-only\n" if ($debug);
	    &print_joined_data();
	};
	&save_ai($ai+1);
    } elsif ($result > 0) {
	if ($non_match_inclusion && $bi > $lastmatchingbi) {
	    &generate_b_only_data();
	    print "# b-only\n" if ($debug);
	    &print_joined_data();
	};
	&save_bi($bi+1);
    } else {
    	if (!$inmatch) {
	    $inmatch = 1;
	    $firstmatchingbi = $bi;
	    $lastmatchingbi = $bi if ($bi > $lastmatchingbi);
	} else {
	    $lastmatchingbi = $bi if ($bi > $lastmatchingbi);
	};
	&generate_joined_data();
	&print_joined_data();
	&save_bi($bi+1);
    };
};
#
# When we're done, there could be b's left.
#
if ($non_match_inclusion) {
    while ($bi <= $#bdata) {
        if ($bi > $lastmatchingbi) {
            &generate_b_only_data();
	    print "# post-scan b-only\n" if ($debug);
	    &print_joined_data();
        };
        $bi++;
	&save_bi($bi);
    };
};

print "# $aname COMMENTS:\n" .
    join("", @acomments) .
    "# $bname COMMENTS:\n" .
    join("", @bcomments) .
    "# joined comments:\n";
print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;

# for compiler warnings
my($x) = $outfs;
$x = $dbopts->optarg;

sub generate_joined_data {
    my($ci);
    @c = ();
    foreach $ci (0..$#colnames) {
	if ($colina[$ci]) {
	    push (@c, $af[$coltoa[$ci]]);
	} else {
	    push (@c, $bf[$coltob[$ci]]);
	};
    };
}

sub generate_a_only_data {
    my($ci);
    @c = ();
    foreach $ci (0..$#colnames) {
	if ($colina[$ci]) {
	    push (@c, $af[$coltoa[$ci]]);
	} else {
	    die ("$prog: need empty value, specify with option -e Value.\n") if (!defined($null_value) && $non_match_inclusion);
	    push (@c, $null_value);
	};
    };
}

sub generate_b_only_data {
    my($ci);
    @c = ();
    foreach $ci (0..$#colnames) {
	if ($colinb[$ci]) {
	    push (@c, $bf[$coltob[$ci]]);
	} else {
	    die ("$prog: need empty value, specify with -e Value.\n") if (!defined($null_value) && $non_match_inclusion);
	    push (@c, $null_value);
	};
    };
}

sub print_joined_data {
    print join($outfs, @c) . "\n";
}

sub compare {
    my($result);
    my($i) = 0;
    foreach (@ARGV) {
	next if (/^-/);
    	$result = $af[$coltoa[$i]] cmp $bf[$coltob[$i]];
	return $result if ($result != 0);
	$i++;
    };
    return 0;
}

