#!/usr/local/bin/perl
# -*- perl -*-

#
# $Id: dvips2ascii,v 1.14 1999/08/14 23:07:55 eserte Exp $
# Author: Slaven Rezic
#
# Copyright  1997,1999 Slaven Rezic. All rights reserved.
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Mail: <URL:mailto:eserte@cs.tu-berlin.de>
# WWW:  <URL:http://www.cs.tu-berlin.de/~eserte/>
#

# convert a postscript file created by dvips to a readable ISO-8859-1 text
# file

require "getopts.pl";

if (!&Getopts('li:s:t:')) {
    &usage;
}

if (@ARGV) {
    $in = shift @ARGV;
    if (@ARGV == 0) {
	if ($in =~ /\.ps$/i) {
	    ($out = $in) =~ s/\.ps$/.txt/i;
	} else {
	    $out = "$in.txt";
	}
    } else {
	$out = shift @ARGV;
	if (@ARGV) {
	    &usage;
	}
    }
	
    if (-e $out) {
	print "<$out> already exists. Should I overwrite this file? (y/N) ";
	$yn = <STDIN>;
	if ($yn !~ /^y/) {
	    die "Aborting...\n";
	}
    }

    open(STDIN, $in) || die "Can't open file $in: $!";
    open(STDOUT, ">$out") || die "Can't write to file $out: $!";
}

$resolution = 300; # default resolution

if ($opt_s == 11) {
    $small_horiz_from = ord('e'); # XXX kann bis "g" gehen, aber dann
    # verschwinden oft die Abstnde zwischen Wrtern...
    $small_horiz_to   = ord('k');
} else {
    $small_horiz_from = ord('c');
    $small_horiz_to   = ord('k');
}

# overread prolog
while (<>) {
    if (/^%%Creator:/) {
	if (!/dvips/) {
	    print STDERR
	      "Warning: This file is not created by dvips. Expect horrible results.\n";
	}
    } elsif (/^%%Feature:.*Resolution\s+(\d+)dpi/) {
	$resolution = $1;
    } elsif (/^%%Page:/) {
	last;
    }
}

$| = 1; # flush output because of the slowness of this program

while($text = <>) {
    $text =~ tr/\n/ /;
    $len = length($text);
    # $i holds the current pointer to $text
    # $tokenbeg shows the beginning of current token
    $tokenbeg = 0;
    for($i=0; $i<$len; $i++) {
	$ch = substr($text, $i, 1);
	if ($ch eq ' ') {
	    &do_token(substr($text, $tokenbeg, $i-$tokenbeg));
	    $tokenbeg = $i;
	} elsif ($ch eq '(' && $lastch[0] ne '\\') {
	    # PostScript string, beginning
	    &do_token(substr($text, $tokenbeg, $i-$tokenbeg));
	    $tokenbeg = $i;
	} elsif ($ch eq ')' && 
		 ($lastch[0] ne '\\' || $lastch[1] eq '\\')) {
	    # PostScript string, end
	    $i++;
	    &do_token(substr($text, $tokenbeg, $i-$tokenbeg));
	    $tokenbeg = $i;
	}
	@lastch = ($ch, $lastch[0]);;
    }
}

close STDIN;
close STDOUT;

# process a token
sub do_token {
    local($token) = $_[0];

    $token =~ s/^\s+//;
    $token =~ s/\s+$//;

    $token ne '' && do {
	if ($token eq '@endspecial') {
	    $special = 0;
	} elsif ($special) { # NOP
	} elsif ($token eq '@beginspecial') {
	    $special = 1;
	} elsif ($token =~ /^[ay]$/) { # moveto
	    if ($lasttoken[0]-$lasty > &transres(26)) {
		&output("\n" x (defined($lasty)
				? (($lasttoken[0]-$lasty)/&transres(62) + 1)
				: 1));
	    }
	    $lasty = $lasttoken[0]; # remember current y position
# XXX manchmal geht's gut, manchmal nicht. Am besten wre es, wenn die
# aktuelle x-Position aufgezeichnet wird, aber das ist schwierig...
#	    if ($lasttoken[1] > 9) { 
#		# horiz. shift
#		&output(" " x ($lasttoken[1]/&transres(30) + 1));
#	    }
	} elsif ($token =~ /^\((.*)\)$/) { # String
	    $token = $1;
	    if ($opt_i && $font eq $opt_i) { # cmmi encoding
		$token =~ s/:/./g; # change dots from contents
	    } else {
		$token =~ s/\\013/ff/g;	# convert ligatures
		$token =~ s/\\014/fi/g;
		$token =~ s/\\015/fl/g;
		$token =~ s/\\016/ffi/g;
		if ($token =~ /^\\017$/ && $lasttoken[0] =~ /^F.$/) {
		    # there was a font switch and only one character in token?
		    # then it is a item bullet, otherwise a ligature
		    $token = '*';
		} else {
		    $token =~ s/\\017/ffl/g;
		}
		$token =~ s/\\031//g;
		$token =~ s/\\045/%/g;
		$token =~ s/\{/--/g;
		$token =~ s/\|/---/g;
		if ($opt_t && $font eq $opt_t) { # cmtt encoding
		    $token =~ s/\\\\/\\/g;
		} else {
		    $token =~ s/\\\\/\"/g;  # closing "
		}
 		$token =~ s/\\([\(\)])/$1/g; # brackets
	    }
	    &output($token);
	} elsif ($token =~ /^[bw]$/ && $lasttoken[0] > 9) {
	    &output(" " x ($lasttoken[0]/&transres(30) + 1)); # horiz. shift
#	} elsif ($token =~ /^[c-k]$/) {	# small horizontal shift
	} elsif (length($token) == 1 &&
		 ord($token) >= $small_horiz_from &&
		 ord($token) <= $small_horiz_to) { # small horizontal shift
	    &output(" ");
	} elsif ($token eq '%%Page:') {	# new page
	    &output("\n") unless $opt_l;
	    undef $lasty;	# reset y position
	} elsif ($token =~ /^F(.)$/) { # switch to another font
	    $font = $1;
	}
	@lasttoken = ($token, @lasttoken[0..1]); # token ring :-)
    }
}

sub output {
    local($string) = @_;

# XXX in helena/texte/schering_bewerbung.ps kommt einmal
# "zwlfw ochiges" vor. Mglicher Fix: diacritic bekommt die ersten
# zwei Buchstaben des Strings. Wenn der erste ein Space ist und ein
# akzentuierter Buchstabe gebaut werden kann, wird dieses Space ignoriert,
# ansonsten bleibt alles beim alten.
    $string = &diacritic(substr($string, 0, 1)) . substr($string, 1);

    if ($string =~ /^(.*)(\\(177|030|023|027))$/) {
	$lastchar = $2;
	$string = $1;
    }

    print $string;

}

# XXX Option fr latin2 etc.
# XXX mehr Zeichen mit Akzenten!
sub diacritic {
    local($char) = @_;
    if ($lastchar eq '\177') {
	if ($char =~ /^[aeiouyAEIOU]/) {
	    $char =~ tr/aeiouyAEIOU//;
	}
	else {
	    $char = "" . $char;
	}
	$lastchar = '';
    } elsif ($lastchar eq '\027') {
	if ($char =~ /^a/i) {
	    $char =~ tr/aA//;
	}
	else {
	    $char = "" . $char;
	}
        $lastchar = '';
    } elsif ($lastchar eq '\030') {
	if ($char =~ /^c/i) {
	    $char =~ tr/cC//;
	}
	else {
	    $char = "" . $char;
	}
	$lastchar = '';
    } elsif ($lastchar eq '\023') {
	if ($char =~ /^[aeiouyAEIOUY]/) {
	    $char =~ tr/aeiouyAEIOUY//;
	}
	else {
	    $char = "" . $char;
	}
	$lastchar = '';
    } 
    $char;
}

sub transres {
    local($y) = @_;
    ($resolution/300)*$y;
}

sub usage {
    die "$0: [-i cmmifont] [-s points] [-t cmttfont] [-l] [infile [outfile]]
-i: dvips font for cmmi/cmmib
-s: font size in TeX points
-t: dvips font for cmtt
-l: ignore pagebreaks
";
}
