#!/usr/bin/perl -w
# Copyright (c) 2000 Udo Erdelhoff, All rights reserved.
# Written for the FreeBSD German Documentation Project
#
# Redistribution and use in source and compiled forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#  1. Redistributions of source code must retain the above
#     copyright notice, this list of conditions and the following
#     disclaimer as the first lines of this file unmodified.
#
#  2. Redistributions in compiled form must reproduce the above
#     copyright notice, this list of conditions and the following
#     disclaimer in the documentation and/or other materials provided
#     with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY UDO ERDELHOFF "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL UDO ERDELHOFF BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS DOCUMENTATION, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# $Id: suppe,v 1.9 2000/07/26 17:29:34 ue Exp $

use strict;
use Getopt::Long;

# Things that can be configured
my ($fli, $indent, $linecount, $maxlinelength, $sli, $debug);

# Activate debug code if != 0
$debug = 0;

# Minimum indent. Should be zero, increase it only if you need to format a
# small section of a document. For a FAQ entry, use $indent = 6;
$indent = 0;

# First level indent. Number of spaces to add when we descend a tag level.
# Taken from FDP
$fli = 2;

# Second level indent. Number of spaces to add when we wrap contents.
# Taken from FDP
$sli = 2;

# Maximum line length when wrapping
$maxlinelength = 70;

# Number of first line, adjust when doing work inside a document
$linecount = 0;

sub usage () {
	print STDERR "usage: suppe [-i initial indent] [-l linestart ]";
	print STDERR "[-m max linelength] [files ...]";
	die;
}

GetOptions ('debug'			=> \$debug,
	    'firstlevelindent=i'	=> \$fli,
	    'indent=i'			=> \$indent,
	    'linecount=i'		=> \$linecount,
	    'maxlinelength=i'		=> \$maxlinelength,
	    'secondlevelindent=i'	=> \$sli) || usage();

if ($indent < 0 || $indent >= $maxlinelength || $maxlinelength < 40) {
	usage();
}
	 
##########################################################################

# Variables for protblock handling
my (@protected, $protcount, @special, $protect, $pattern);

# List of elements where whitespace or linebreaks are significant.
# Expand if neccessary
@special = ("programlisting", "screen", "literallayout", "address", "pre");

# All other global variables
my ($in, $foo, $closer, $bar, $type, $lindent, $line, $counter, $flush);

# Sometimes, it's good to be bad...
$/ = "\0\0";		
$in = <>;
exit unless (defined $in);
chomp $in;
exit if ($in eq "");

if ($debug > 0) {
	$counter = length ($in);
	print "$counter bytes of input read\n";
}
# Skip the FPI and any comments, brutal version
($foo) = ($in =~ /^<!DOCTYPE\s+(.*?)\s/i);
unless (defined $foo) {
	print "Retrying search for DOCTYPE\n" if ($debug > 0);
	($foo) = ($in =~ /^[\000-\377]*?<!DOCTYPE\s+(.*?)\s/i);
}
if (defined $foo) {
	print "Found DOCTYPE $foo\n" if ($debug > 0);
	$foo =~ tr/A-Z/a-z/;
	($bar) = ($in =~ /^([\000-\377]*?)<$foo>/io);
	unless (defined $bar) {
		print STDERR "Could't find the opening <$foo>\n";
		die "Cannot handle this document\n";
	}
	$counter = length ($bar);
	print "Cutting $counter bytes to skip FPI\n" if ($debug > 0);
	substr ($in, 0, $counter) = "";
	print "$bar";
}

# look for areas where whitespace and/or linebreaks are significant
# Store them in an array and replace them with a "protblock" to insure
# that we don't mess with them. 
$protcount = 0;

foreach $protect (@special) {
	$pattern = "<$protect>[\000-\377]*?</$protect>";
	print "Protecting $protect, pattern is $pattern\n" if ($debug > 0);
	while (1) {
		undef $foo;
		($foo) = ($in =~ /($pattern)/i);
		last unless (defined $foo);
		print "Instance found\n" if ($debug > 0);
		$protcount++;
		$protected[$protcount] = $foo;
		$bar = "###PROT$protcount###";
		$in =~ s/$pattern/$bar/i;
	}
}
print "End of main protection loop\n" if ($debug > 0);

# Save multi-line SGML comments
print "Looking for multi-line SGML comments\n" if ($debug > 0);
$pattern = "<!--[\000-\377]*?-->";
while (1) {
	($foo) = ($in =~ /\G($pattern)/gic);
	last unless (defined $foo);
#	$counter = pos($in);
	print "Instance found\n" if ($debug > 0);
	next unless ($foo =~ /\n/);
	print "Protecting instance\n" if ($debug > 0);
	$protcount++;
	$protected[$protcount] = $foo;
	$bar = "###PROT$protcount###";
	$in =~ s/$pattern/$bar/i;
}

print "Nuking whitespace\n" if ($debug > 0);
# Nuke everything even remotely resembling whitespace
# Protect ".  " sequences
$in =~ s/\.  /__FULLSTOP_DOUBLE_SPACE__/g;
# Treat ".\n" as possible ".  "
$in =~ s/\.\n/__FULLSTOP_DOUBLE_SPACE__/g;
# Convert all tabs to spaces
$in =~ tr/\t/ /;
# Convert all newlines to spaces
$in =~ tr/\n/ /;
# Convert all sequences of spaces into a single space 
$in =~ s/\s+/ /g;
# Nuke space at SOL and EOL (Can't be more than one after the last line)
$in =~ s/^ //;
$in =~ s/ $//;
# Nuke space between tags
$in =~ s/> </></g;
# Restore ".  " sequences
$in =~ s/__FULLSTOP_DOUBLE_SPACE__/.  /g;
# But no more than two spaces after a full stop, please
$in =~ s/\.\s\s+/\.  /g;
print "Whitespace nuked\n" if ($debug > 0);
unless ($in =~ /^</) {
	die "Entry doesn't start with tag"
}

$lindent = -1;
$line = "";
$counter = 0;
$flush = 0;

while($in ne "") {
# Must check for SGML comments first!
	if ($in =~ /^<!--/) {
		undef $foo;
		($foo) = ($in =~ /^(<!--.*?-->)/);
		$in =~ s/^<!--.*?-->//;
		if ($line eq "") {
			print " " x $indent;
			print "$foo\n";
			$linecount++;
		} else {
			$line = "$line$foo";
		}
		next;
	}
# Check for character data
	unless ($in =~ /^<.*?>/) {
		undef $foo;
		($foo) = ($in =~ /^(.*?)</);
		$in =~ s/^.*?</</;
# Check for protblock
		undef $bar;
		($bar) = ($foo =~ /^ *###PROT([0-9]+)### *$/);
		if (defined $bar && $line eq "") {		# YES!
#print STDERR "dissolving protblock $bar at $linecount\n";
#			unless ($protected[$bar] =~ /^\n/) {
#				print "\n";
#				$linecount++;
#			}
			if ($lindent == $indent) { print "\n"; }
			print " " x $indent;
			print "$protected[$bar]";
			$lindent = $indent;
			$linecount += ($protected[$bar] =~ tr/\n/\n/);
			unless ($protected[$bar] =~ /\n$/) {
				print "\n";
				$linecount++;
			}
			next;
		}
		if (defined $bar) { 
			print STDERR "Hey, inline protblock at $linecount\n";
		} elsif ($foo =~ /###PROT[0-9]+###/) {
			print STDERR "Hey, hidden protblock at $linecount\n";
		}
		$line = "$line$foo";
		next;
	}

# Ladies and Gentlemen, we have a SGML tag.
	undef $foo;
	($foo) = ($in =~ /^<(.*?)>/);
	$in =~ s/^<.*?>//;
	$closer = ($foo =~ s!^/!!);
	$type = classify_tag ($foo);

	if ($type == 1 || $type == 2) {	#inline/single
		if ($counter == 0 && $line ne "") {
			unless ($line =~ /^<.*>/) {
				print STDERR ">>>Whoa, character data outside inline tag in $linecount!\n";
			}
			print " " x $indent;
			print "$line\n";
			$linecount++;
			$line = "";
		}
		if ($closer == 0) { # opening tag
			$line = "$line<$foo>";
			if ($type == 1) { $counter++; $indent += 2; }
			next;
		}
		if ($type == 2) {
			print "\n\nline: $line\n";	# Output trash for debug
			print "foo: $foo\n";
			print "closer: $closer\n";
			print "in: $in\n";
			die "closing single tag found";
		}
		$counter--;
		$indent -= 2;
		if ($counter < 0) {
			print "\n\nline: $line\n";	# Output trash for debug
			print "foo: $foo\n";
			print "closer: $closer\n";
			print "in: $in\n";
			die "counter < 0";
		}
		$line = "$line</$foo>";
		next unless ($counter == 0 || $counter == $flush);
		if ($counter == 0) {
			$flush = 0;
		}
		print_inline(1);
		$line = "";
		next;
	}
# Ok, this is a normal tag. Stay calm, we can handle it
# Sanity checks first
	if ($counter > 0) {
		if ($line ne "") {
			print_inline(2);
			$line = "";
		}
		print STDERR ">>>Hey, normal tag $foo inside inline element in $linecount!\n";
	}
	if ($counter == 0 && $line ne "") {
		print STDERR ">>>Whoa, character data before normal tag in $linecount!\n";
		print_inline(3);
		$line = "";
	}
	if ($closer == 1) {		# closing tag
		$indent -= 2;
		print " " x $indent;
		print "</$foo>\n";
		$linecount++;
		$lindent = $indent;
		next;
	}
# This memory is too small for me and this opening tag
	if ($indent == $lindent) {
		print "\n";
		$linecount++;
	}
	print " " x $indent;
	print "<$foo>\n";
	$linecount++;
	$lindent = $indent;
	$indent += 2;
}

sub print_inline {
	my ($mode, $rci);
	$mode = $_[0];

	if ($mode == 2) {
		$rci = $indent - 2 * $counter;
	} else {
		$rci = $indent;
	}
	if ($lindent == $rci && $mode != 3) {
		print "\n";
		$linecount++;
	}
	real_print (" " x $rci, $line, $mode);
	$line = "";
	if ($mode != 3) {
		$lindent = $rci;
	}
	if ($mode == 2) {
		$flush = $counter;
	}
}

# Printing routine for inline elements, first part
# Distinguish between protblock markers and normal inputs (tags, char data)
# protblock markers are replaced by the contents of the protected block
# length of last line of the contents is remembered for further wrapping
# normal input is fed to wrapup to line wrapping

sub real_print () {
	my ($base, $content, $toprint, $protid, $protlen, $offset);

	$base = $_[0];
	$content = $_[1];
	$offset = -1;

	while ($content ne "") {
		unless ($content =~ /###PROT[0-9]+###/) {
			$linecount += wrapup ($base, $content, $offset);
			print "\n";
			$linecount++;
			last;
		}

		($protid) = ($content =~ /^###PROT([0-9]+)###/);
		if (defined $protid) {
			$linecount += ($protected[$protid] =~ tr/\n/\n/);
			print "$protected[$protid]";
			$content =~ s/^###PROT[0-9]+### *//;
			($toprint) = ($protected[$protid] =~ /\n(.*?)$/);
			if (defined $toprint) {
				$offset = length ($toprint);
			} else {
				$offset = 0;
			}
			next;
		}
		($toprint) = ($content =~ /^(.*?)###PROT[0-9]+###/);
		unless (defined $toprint) {
			print "Internal logic error\n";
			print "Protblock left, not in front, but nothing in front of it?\n";
			print "Original input: $_[1]\n";
			print "Remaining content: >>>$content<<<\n";
			print "!!!$toprint!!!\n";
			die "Internal Logic Error";
		}
		$linecount += wrapup ($base, $toprint, $offset);
		$content =~ s/^(.*?)###PROT([0-9]+)###/###PROT$2###/;
	}
}

# printing routine for inline elements, second part
# 
sub wrapup {
	my ($b, $base, $pos, $content, $mark, $out, $lpos, $count, $offset, $o);

	$base = $_[0];
	$content = $_[1];
	$offset = $_[2];


	if ($offset == -1) {
		$mark = 0;
	} else {
		$mark = 1;
		$base = "  $base";		# XXX FLI/SLI
	};
	$count = 0;

	while (1) {
		if ($offset == -1) {
			$o = length ($base);
		} else {
			$o = $offset;
		};
		if (length ($content) + $o  < $maxlinelength) {
			if ($offset == -1) {
				print "$base$content";
			} else {
				print "$content";
			}
			last;
		}
		$pos = -1;
		$lpos = -1;
		while (($pos = index ($content, " ", $pos)) > -1) {
			last if ($pos + $o > $maxlinelength);
			$lpos = $pos;
			$pos++;
		}
		if ($lpos == -1) {			# No early space
			if ($pos != -1) {		# is there any space
				$lpos = $pos;		# We use that
			} else {			# if not
				print "$content";	# we're done
				last;
			}
		}
		$out = substr ($content, 0, $lpos);
		$out =~ s/ +$//;

		if ($offset == -1) {
			print "$base$out\n";
		} else {
			print "$out\n";
			$offset = -1;
		}
		$count++;
		if ($mark == 0) {
			$base = "  $base";		# XXX FLI/SLI
			$mark = 1;
		}
		substr ($content, 0, $lpos+1) = "";
		if ($content =~ /^ /) {
			substr ($content, 0, 1) = "";
		}
	}
	return ($count);
}

my ($lastresult, $lastpattern);

sub classify_tag {
	my ($inlinetags, $singletags, $normaltags, $pattern, $result);

	$inlinetags = "<citerefentry><refentrytitle><manvolnum><subtitle><title><ulink><link><filename><command><surname><pubdate><emphasis><replaceable><literal><hostid><email><acronym><entry><keycap><userinput><application><prompt><seg><devicename><option><symbol><quote><username><userinput><varname><function><para><city><street><postcode><country><phone><fax><otheraddr><state><holder><releaseinfo><firstname><year><attribution><envar><wordasword><makevar><citetitle><constant><errorname><firstterm><trademark><type><systemitem><parameter><token><action><database><guimenuitem><guimenu><keysym><keycombo><hardware><term><h1><h2><h3><h4><h5><h6><h7><h8><h9><p><i><a><b><sgmltag>";
	$singletags = "<anchor><xref><area><co>";
	$normaltags = "<sect1><sect2><sect3><sect4><sect5><sect6><sect7><sect8><sect9><qandaset><qandaentry><question><answer><itemizedlist><listitem><book><bookinfo><authorgroup><author><abstract><preface><informaltable><tgroup><thead><row><tbody><tip><note><warning><footnote><orderedlist><chapter><table><segmentedlist><seglistitem><variablelist><varlistentry><procedure><step><example><informalexample><programlisting><literallayout><screen><article><artheader><affiliation><abstract><programlistingco><areaspec><areaset><calloutlist><callout><copyright><blockquote><qandadiv><html><head><body><ol><li>";

	$pattern = $_[0];
	$pattern =~ s/ .*$//;
	$pattern = "<$pattern>";
	if ($inlinetags =~ $pattern) {
		$result = 1;
	} elsif ($singletags =~ $pattern) {
		$result = 2;
	} elsif ($normaltags =~ $pattern) {
		$result = 3;
	} elsif ($pattern eq "<>" and defined $lastresult) {
		$result = $lastresult;
		print STDERR ">>>Hey, $lastpattern closed with </>\n";
		# Call by reference is powerfull.
		# All power corrupts, absolute power is even more fun.
		# I'm tempted. I'm really tempted.
		# $_[0] = $lastpattern;
	} else {
		die "Unknown tag $_[0]";
	}
	$lastresult = $result;
	$lastpattern = $pattern;
	return ($result);
}

