#                                                         -*- Perl -*-
# eijiro-fpw - FreePWING script for EIJIRO
# $Id: eijiro-fpw.pl 1.0.0.7 2000/10/08 15:32:32 rei Exp $
#
# !!! NOTICE !!!
# DO NOT CHANGE THE KANJI-CODE OF THIS SCRIPT. IT SHOULD BE SHIFT JIS.
#
# Copyright (C) 2000, Rei <rei@wdic.org>.
# This program is distributed in the hope that it will be useful, but
# without any warranty. See the GNU General Public License for the details.
#
require 5.005;

use Getopt::Long;
use Jcode;
use FreePWING::FPWUtils::FPWParser;
use strict "vars";

my($fpwtext, $fpwheading, $fpwword2, $fpwkeyword, $fpwcopyright);
my $opt_charset;	# sjis, euc (eucjp, ujis), jis or iso_2022_jp.

my $sjis_any = "(?:[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc])";

my %skip_word = (
	'a',	1,
	'an',	1,
	'and',	1,
	'at',	2,
	'by',	2,
	'for',	2,
	'in',	2,
	'on',	2,
	'or',	1,
	'the',	1,
	'to',	2
);


#-----------------------------------------------------------------------#
#	main routine														#
#-----------------------------------------------------------------------#

my $wc = 0;
my($current_word, $current_pos, $pos_index);
my($prev_word, $prev_pos) = ('', '');

#
# initialize this module.
#
&initialize();
initialize_fpwparser('text' => \$fpwtext,
		'heading' => \$fpwheading,
		'word2' => \$fpwword2,
		'keyword' => \$fpwkeyword,
		'copyright' => \$fpwcopyright);

#
# read the input files and write entries to the book.
#
while (<>) {
	s/\r?\n$//;
	next if (/^$/);
	&error("Unexpected line in $ARGV ($_)") if (!s/^//);

	#
	# extract the word part from the line.
	#
	$current_word = $_;
	$_ = '';
	while ($current_word =~ s/^(.+)( : .+)$/\1/) {
		$_ = $2 . $_;
	}
	s/^ : //;
	$current_word =~ s/\s+$//;

	#
	# extract POS from the word if any.
	#
	if ($current_word =~ s/^(.+)\{((?:$sjis_any|[0-9\-])+)\}/\1/) {
		$current_pos = $2;
		if ($current_pos =~ s/\-([0-9]+)$//) {
			if ($current_pos eq $prev_pos) {
				$pos_index++;
			} else {
				$pos_index = 1;
			}
		} else {
			$pos_index = 0;
		}
		$current_word =~ s/\s+$//;
	} else {
		$pos_index = 0;
		$current_pos = '';
	}

	if ($current_word eq $prev_word) {
		#
		# same word as previous, do not write heading or keyword.
		#
		if ('' ne $current_pos && $current_pos eq $prev_pos) {
			&write_pos_index($current_pos, $pos_index);
		} else {
			&write_pos($current_pos, $pos_index);
			$prev_pos = $current_pos;
		}
	} else {
		#
		# a new word is found. change to the new context and write the
		# heading and keyword.
		#
		$wc++;
		&new_entry($wc, $current_word);
		$prev_word = $current_word;
		$prev_pos = $current_pos;
		if (/^y/) {
			&write_word_info($_);
			$fpwtext->add_indent_level(2);
			next;
		} else {
			$fpwtext->add_indent_level(2);
			if ('' ne $current_pos) {
				&write_pos($current_pos, $pos_index);
			}
		}
	}

	#
	# write the meanings of the current word.
	#
	write_meaning($_);
}

print "Total $wc entries were written.\n";
&write_copyright();

#
# clean up.
#
finalize_fpwparser('text' => \$fpwtext,
		'heading' => \$fpwheading,
		'word2' => \$fpwword2,
		'keyword' => \$fpwkeyword,
		'copyright' => \$fpwcopyright);
exit(0);


#-----------------------------------------------------------------------#
#	initialization														#
#-----------------------------------------------------------------------#
#
# void initialize();
# read the command line switches and initialize optional values.
#
sub initialize
{
	my $opt_usage;

	if (!GetOptions('help|h|?' => \$opt_usage,
			'charset|c:s' => \$opt_charset)
			|| scalar(@ARGV) < 1) {
		usage(1);
	}
	usage(0) if ($opt_usage);

	if (!defined($opt_charset) || '' eq $opt_charset) {
		if ($ENV{'LANG'} =~ /ja_JP\.([a-zA-Z]+)$/) {
			$opt_charset = $1;
		}
	}

	$opt_charset =~ tr/A-Z/a-z/;
	if ('ujis' eq $opt_charset || 'eucjp' eq $opt_charset) {
		$opt_charset = 'euc';
	} elsif (!defined($opt_charset) || '' eq $opt_charset) {
		if ($ENV{'OS'} =~ /Windows/i
				|| $ENV{'COMSPEC'} =~ /COMMAND\.COM$/i
				|| $ENV{'COMSPEC'} =~ /CMD\.EXE$/i) {
			$opt_charset = 'sjis';
		} else {
			$opt_charset = 'euc';
		}
	} elsif ('sjis' ne $opt_charset
			&& 'euc' ne $opt_charset
			&& 'jis' ne $opt_charset
			&& 'iso_2022_jp' ne $opt_charset) {
		&error("$opt_charset: Unknown character set");
	}

	print "Detected display character set: $opt_charset\n";

	-f './copyright.sjis' ||
		&error("A necessary file \'./copyright.sjis\' not found.");
}


#-----------------------------------------------------------------------#
#	writing routines													#
#-----------------------------------------------------------------------#
#
# void new_entry(char *word);
# switch to the new entry and write the heading and keyword. this
# version does not support cross-reference, so the tag is not added.
#
sub new_entry
{
	my($index, $word) = @_;
	my($text_pos, $heading_pos);
	my(@key, @key2, $key, $key2);

	$word =~ s/\x81\x7c/\-/g;
	$word =~ s/__/_/g;
	$word =~ s/F/: /g;

	&jprint("$index: $word\n");
	$word = Jcode::convert($word, 'euc', 'sjis');

	$fpwtext->new_entry() ||
		&error("Failed to add a new entry", $fpwtext);
	$fpwheading->new_entry() ||
		&error("Failed to add a new entry", $fpwheading);

	$text_pos = $fpwtext->entry_position();
	$heading_pos = $fpwheading->entry_position();

	#
	# heading.
	#
	$fpwheading->add_text($word) ||
		&error("Failed to add a heading", $fpwheading);

	#
	# search words - let them be found as far as possible...
	#
	@key = split(/; /, $word);

	foreach $key (@key) {
		if ($key =~ / \| /) {
			@key2 = split(/ \| /, $key);
			foreach $key (@key2) {
				$fpwword2->add_entry($key, $heading_pos, $text_pos) ||
					&error("Failed to add a search word ($key)", $fpwword2);
			}

		} else {
			while (128 < length($key)) {
				#
				# too long word, truncate it.
				#
				$key =~ s/\s*[^\s]+$//;
			}
			$fpwword2->add_entry($key, $heading_pos, $text_pos) ||
				&error("Failed to add a search word ($key)", $fpwword2);

			#
			# a little workaround for DDwin and Jamming.
			#
			if ($key =~ s/\s*[\"\$%\+_]\s*//g && $key !~ /^$/) {
				$fpwword2->add_entry($key, $heading_pos, $text_pos) ||
					&error("Failed to add a search word ($key)", $fpwword2);
			}
		}
	}

	#
	# also add each word as a keyword for 'jouken kensaku'. articles,
	# symbols and some other words may be skipped according to the word
	# count.
	#
	@key = ();
	@key2 = sort(split(/ +/, $word));
	$key2 = '';

	while (1) {
		$key = shift(@key2);
		last if (!defined($key) || '' eq $key);
		$key =~ s/^[\(\[\"]+//;
		$key =~ s/[\)\]\",\.:;!\?]+$//;
		$key =~ tr/A-Z/a-z/;

		next if ('' eq $key);
		if ($key ne $key2) {
			next if ((defined($skip_word{$key}) && 1 == $skip_word{$key})
					|| $key =~ /^[^a-zA-Z0-9]+$/);
			push(@key, $key);
			$key2 = $key;
		}
	}

	foreach $key (@key) {
		if (scalar(@key) <= 5 || !defined($skip_word{$key})) {
			$fpwkeyword->add_entry($key, $heading_pos, $text_pos) ||
				&error("Failed to add keyword ($key)", $fpwkeyword);
		}
	}

	#
	# write the title to the body as a keyword.
	#
	@key = split(/; /, $word);
	$key = shift(@key);

	if (!$fpwtext->add_keyword_start()
			|| !$fpwtext->add_text($key)
			|| !$fpwtext->add_keyword_end()) {
		&error("Failed to add a keyword ($key)", $fpwtext);
	}

	foreach $key (@key) {
		if (!$fpwtext->add_text('; ')
				|| !$fpwtext->add_keyword_start()
				|| !$fpwtext->add_text($key)
				|| !$fpwtext->add_keyword_end()) {
			&error("Failed to add a keyword ($key)", $fpwtext);
		}
	}

	$fpwtext->add_newline() || &error("Failed to add a newline", $fpwtext);
}

#
# void write_word_info(char *line);
# write the line that contains such information like the pronounce etc.
#
sub write_word_info
{
	my $katakana = "(?:\x83[\x40-\x96])";	# [@-]
	my($info) = @_;

	$info =~ s/yz(?:$katakana|\x81[\x41\x5b])*//;
	$info =~ s/yxz[0-9]*(?:A)*//;
	$info =~ s/ywz(?:A)*//;
	$info =~ s/Ay/ y/g;
	$info =~ s/A//g;

	if ($info !~ /^$/) {
		if (!$fpwtext->add_text(Jcode::convert($info, 'euc', 'sjis'))
				|| !$fpwtext->add_newline()) {
			&error("Failed to add text ($info)", $fpwtext);
		}
	}
}

#
# void write_pos(char *pos, int index);
#
sub write_pos
{
	my($pos, $index) = @_;

	if ('' ne $pos) {
		$fpwtext->add_text('[' . Jcode::convert($pos, 'euc', 'sjis') . ']') ||
			&error("Failed to write POS [$pos]", $fpwtext);
	}
	if (0 < $index) {
		if (!$fpwtext->add_newline() || !$fpwtext->add_text("$index\. ")) {
			&error("Failed to write [$pos] index $index", $fpwtext);
		}
	} else {
		$fpwtext->add_text(' ') ||
			&error("Failed to write a white space", $fpwtext);
	}
}

#
# void write_pos_index(char *pos, int index);
#
sub write_pos_index
{
	my($pos, $index) = @_;

	$fpwtext->add_text("$index\. ") ||
		&error("Failed to write [$pos] index $index", $fpwtext);
}

#
# void write_meaning(char *line)
# write the meaning(s) of the current word.
#
sub write_meaning
{
	my $chars = "(?:[\x20-\x7d][\x20-\x7d]|$sjis_any)";
	my $hiragana = "(?:\x82[\x9f-\xf1])";	# [-]
	my(@list, $char, $next, $mean, $yorei);

	$mean = shift(@_);
	$mean =~ s/\x81\x6f(?:$hiragana|\x81[\x5e\x69\x6a]| )+\x81\x70//g;

	@list = unpack('C*', $mean);
	$mean = '';

	while (1) {
		$char = shift(@list);
		last if (!defined($char) || !$char);
		if (0x81 == $char) {
			$next = shift(@list);
			if (0x41 == $next || 0x43 == $next) {		# AC
				$mean .= ', ';
			} elsif (0x42 == $next || 0x44 == $next) {	# BD
				$mean .= '. ';
			} elsif (0x46 == $next) {		# F
				$mean .= ': ';
			} elsif (0x47 == $next) {		# G
				$mean .= '; ';
			} elsif (0x48 == $next) {		# H
				$mean .= '? ';
			} elsif (0x49 == $next) {		# I
				$mean .= '! ';
			} elsif (0x51 == $next) {		# Q
				$mean .= '_';
			} elsif (0x5e == $next) {		# ^
				$mean .= '/';
			} elsif (0x69 == $next) {		# i
				$mean .= '(';
			} elsif (0x6a == $next) {		# j
				$mean .= ')';
			} elsif (0x6d == $next) {		# m
				$mean .= '[';
			} elsif (0x6e == $next) {		# n
				$mean .= ']';
			} elsif (0x7b == $next) {		# {
				$mean .= '+';
			} elsif (0x7c == $next) {		# |
				$mean .= '-';
			} elsif (0x81 == $next) {		# 
				$mean .= '=';
			} else {
				$mean .= pack('CC', $char, $next);
			}
		} elsif (0x82 == $char) {
			$next = shift(@list);
			if (0x4f <= $next && $next <= 0x58) {		# O-X
				$mean .= pack('C', $next - 0x1f);
			} elsif (0x60 <= $next && $next <= 0x79) {	# `-y
				$mean .= pack('C', $next - 0x1f);
			} elsif (0x81 <= $next && $next <= 0x9a) {	# -
				$mean .= pack('C', $next - 0x20);
			} else {
				$mean .= pack('CC', $char, $next);
			}
		} elsif ((0x81 < $char && $char <= 0x9f)
				|| (0xe0 <= $char && $char <= 0xfc)) {
			$mean .= pack('CC', $char, shift(@list));
		} else {
			$mean .= pack('C', $char);
		}
	}

	@list = ();
	$mean =~ s/y/ y/g;
	$mean =~ s/ ; /; /g;
	$mean =~ s/  +/ /g;
	$mean =~ s/([,\.!\?]) ([,\.])/\1\2/g;

	#
	# extract examples if any.
	#
	if ($mean =~ s/ \/ yp(?:E$sjis_any+(?:\-[0-9]+)?)?z(.*)$//) {
		$yorei = $1;
	}

	#
	# write the meaning.
	#
	$mean =~ s/\s+$//;
	if (!$fpwtext->add_text(Jcode::convert($mean, 'euc', 'sjis'))
			|| !$fpwtext->add_newline()) {
		&error("Failed to write text ($mean)", $fpwtext);
	}

	#
	# write the examples.
	#
	if ('' ne $yorei) {
		$fpwtext->add_indent_level(3);
		@list = split(/ \/ /, $yorei);
		foreach $yorei (@list) {
			$yorei =~ s/^\s+//;
			$yorei =~ s/\s+$//;
			$yorei = ' ' . $yorei;
			if (!$fpwtext->add_text(Jcode::convert($yorei, 'euc', 'sjis'))
					|| !$fpwtext->add_newline()) {
				&error("Failed to write an example ($yorei)", $fpwtext);
			}
		}
		$fpwtext->add_indent_level(2);
	}
}

#
# void write_copyright(void);
# add a copyright page to the book.
#
sub write_copyright
{
	my($sec, $min, $hour, $day, $mon, $year);
	my $username = '(unknown user)';

	print "Writing copyright information...\n";
	open(IN, '<./copyright.sjis') ||
		&error("Failed to open \'./copyright.sjis\'");

	while (<IN>) {
		s/\r?\n$//;
		if (!/^$/) {
			$fpwcopyright->add_text(Jcode::convert($_, 'euc', 'sjis')) ||
				&error("Failed to write text ($_)", $fpwcopyright);
		}
		$fpwcopyright->add_newline() ||
			&error("Failed to add a newline", $fpwcopyright);
	}

	close(IN);

	if (defined($ENV{'USER'})) {
		$username = $ENV{'USER'};
	} elsif (defined($ENV{'USERNAME'})) {
		$username = $ENV{'USERNAME'};
	}

	($sec, $min, $hour, $day, $mon, $year) = localtime(time);
	$_ = sprintf('This book is generated by %s on %02d/%02d/%02d %02d:%02d:%02d.',
			$username, $year + 1900, $mon, $day, $hour, $min, $sec);

	if (!$fpwcopyright->add_newline()
			|| !$fpwcopyright->add_text('--')
			|| !$fpwcopyright->add_newline()
			|| !$fpwcopyright->add_text($_)
			|| !$fpwcopyright->add_newline()
			|| !$fpwcopyright->add_text($0 . ' $Revision: 1.0.0.7 $')
			|| !$fpwcopyright->add_newline()) {
		&error("Failed to write compilation date and time", $fpwcopyright);
	}
}


#-----------------------------------------------------------------------#
#	message output														#
#-----------------------------------------------------------------------#
#
# jprint(char *str);
#
sub jprint
{
	my($str) = @_;

	if ($str =~ /[^\x00-\x7e]/) {
		$str = Jcode::convert($str, $opt_charset);
	}
	print $str;
}

#
# error(char *errmsg, char *func, obj);
# exit program with a error message.
#
sub error
{
	my($errmsg, $obj) = @_;

	if ($errmsg =~ /[^\x00-\x7e]/) {
		$errmsg = Jcode::convert($errmsg, $opt_charset);
	}
	if (defined($obj)) {
		$errmsg .= (': ' . $obj->error_message());
		$errmsg =~ s/:\s*$//;
	}
	die "Error: $errmsg\.\n";
}

#
# void usage(int exit_code);
#
sub usage
{
	print<<__EOM__;
Usage: $0 [-c charset] file ...
__EOM__
	exit($_[0]);
}
