#!/usr/local/bin/perl

# fake a cheap sgml-aware segmenter

$/='</C>';
$wc=0;
$prev="";
while (<>) {
    if (s/^([^\000]*<[^\000]*)(<[^\000]*)<C /$2<C /) {
	if ($prev) {
	    &pw($prev);
	    $prev="";
	};
	print $1;
    };
    if (s/<ABBR>(<C.*)$/<>/) {
	# <ABBR>N/M<HI REND="SUPER">o/r</HI></ABBR>
	if ($prev) {
	    &pw($prev);
	    $prev="";
	};
	($l1)=(($f1=$1)=~m@<C.*>(.+)</C>@);
	$wc++;
	s@<>@<W ID=W$wc TYPE=supabbr LEMMA=$l1@;
	print;
	$/='</ABBR>';
	$_=<>;
	s@\n<HI.*>(<C.*</C>)\n</HI></ABBR>@<>@;
	($l2)=(($f2=$1)=~m@<C.*>(.+)</C>@);
	s@<>@$l2>$f1$f2</W>@;
	print;
	$/='</C>';
    }
    elsif ($prev=~/>O</ && /ID=[^=>]*>\'</) {
	# O'Hagan
	$wc++;
	$prev=~s/<C/<W ID=W$wc TYPE=name LEMMA="O\'Hagan"><C/;
	print $prev;
	print;
	$_=<>;
	print;
	print "</W>";
	$prev="";
    }
    elsif ($prev=~/ID=[^=>]*>\'</ && /REND=nosep>s</) {
	# 's
	$wc++;
	$prev=~s/<C/<W ID=W$wc TYPE=poss LEMMA="'s"><C/;
	print $prev;
	print;
	print "</W>";
	$prev="";
    }
    elsif (m@ID=[^=>]+>/<@) {
	if ($prev=~/<W/) {
	    # compound/
	    $prev=~s@TYPE=.* LEMMA=.([^>]+).>@TYPE=compound LEMMA='$1/<>'>@;
	    $prev.=$_;
	    $_=<>;
	    ($cp)=/>([^<\n]+)</;
	    $prev=~s/<>/$cp/;
	    $prev.=$_;
	}
	elsif ($prev=~/>([A-Z0-9]+)</) {
	    $cp=$1;
	    $wc++;
	    $prev=~s@<C@<W ID=W$wc TYPE=compound LEMMA='$cp/<>'><C@;
	    $prev.=$_;
	    $_=<>;
	    ($cp)=/>([^<\n]+)</;
	    $prev=~s/<>/$cp/;
	    $prev.=$_;
	}
	else {
	    if ($prev) {
		&pw($prev);
	    };
	    $prev=$_;
	};
    }
    elsif (m@ID=[^=>]+>-<@) {
	if ($prev=~/<W/) {
	    # compound-
	    $prev=~s@TYPE=.* LEMMA=.([^>]+).>@TYPE=compound LEMMA='$1-<>'>@;
	    $prev.=$_;
	    $_=<>;
	    ($cp)=/>([^<\n]+)</;
	    $prev=~s/<>/$cp/;
	    $prev.=$_;
	}
	elsif ($prev) {
	    # a-b
	    ($cp)=($prev=~/>([^\n<]+)</);
	    $wc++;
	    $prev=~s@<C@<W ID=W$wc TYPE=hyphed LEMMA='$cp-<>'><C@;
	    $prev.=$_;
	    $_=<>;
	    ($cp)=/>([^<\n]+)</;
	    $prev=~s/<>/$cp/;
	    $prev.=$_;
	}
	else {
	    $prev=$_;
	};
    }
    elsif (m@ID=[^=>]+>,<@ && $prev=~/>([0-9]+)</) {
	$cp=$1;
	$cur=$_;
	$_=<>;
	if (/REND=nosep>([0-9]+)</) {
	    # n,n
	    $np=$1;
	    $wc++;
	    $prev=~s@<C@<W ID=W$wc TYPE=cnum LEMMA='$cp,$np'><C@;
	    $prev.=$cur . $_;
	}
	else {
	    # oops, clean up
	    &pw($prev);
	    &pw($cur);
	    $prev=$_;
	};
    }
    elsif (/REND=sep>\.</ ||
	   (/>\.</ && ($prev=~/>[\'\"\`]</))) {
	if ($prev=~m@(<C[^>]*>\.</C>)@) {
	    # etc. . .
	    $cp=$1;
	    &pw($prev);
	    $wc++;
	    $prev=$_;
	    $prev=~s/<C/<W ID=W$wc TYPE=ellipsis LEMMA='..'>$cp<C/;
	}
	else {
	    &pw($prev);
	    $wc++;
	    $prev=$_;
	    $prev=~s/<C/<W ID=W$wc TYPE=ellipsis LEMMA='.'><C/;
	};
	while (($_=<>)=~/REND=sep>\.</) {
	    $prev=~s/'>/.'>/;
	    $prev.=$_;
	};
	if ($prev!~/LEMMA='\.\.\.'/) {
	    # oops, take it all back, should be only one <C . . .>
	    $prev=~s/<W[^>]*>//;
	    $wc--;
	    die "Ellipsis catastrophe" if ($prev=~/<C.*<C/);		
	};
	&pw($prev);
	$prev=$_;
    }
    elsif ($prev && m@<C ID=[^=>]*>\.</C>@) {
	# T. H. etc. (require REND=sep NOT)
	if ($prev=~/>([TH]|etc)</) {
	    $wc++;
	    $ab=$1;
	    $prev=~s/<C/<W ID=W$wc TYPE=abbrev LEMMA='$ab.'><C/;
	    $prev.=$_;
	}
	elsif ($prev=~/>i</) {
	    # i.e.
	    $wc++;
	    $prev=~s/<C/<W ID=W$wc TYPE=abbrev LEMMA='i.e.'><C/;
	    print $prev;
	    print;
	    $_=<>;
	    print;
	    $_=<>;
	    print;
	    print "</W>";
	    $prev="";
	}
	else {
	    # just end-of-sentence
	    &pw($prev);
	    $prev=$_;
	};
    }
    else {
	if ($prev) {
	    &pw($prev);
	};
	$prev=$_;
    };
}
&pw($prev);

sub pw {
    local ($w)=@_;
    if ($w=~/<W/) {
	# already wrapped up
	print $w,"</W>";
    }
    else {
	$wc++;
	if ($w=~/>[a-zA-Z\300-\326\330-\366\370-\377]+</) {
	    $type='';
	}
	elsif ($w=~/>[0-9]+</) {
	    $type=' TYPE=num';
	}
	elsif ($w=~/>.</) {
	    $type=' TYPE=punct';
	}
	else {
	    $type=' TYPE=unk';
	};
	$w=~s@(<C.*</C>)@<W ID=W$wc$type>$1</W>@;
	print $w;
    }
}
