#!/usr/bin/perl 

# ########################################################################### #
# Html2Wml                                                                    #
# ========                                                                    #
# Author: Sebastien Aperghis-Tramoni <maddingue@free.fr>                      #
#                                                                             #
# This program converts HTML pages to WML pages.                              #
# See the documentation for more informations.                                #
#                                                                             #
# This program is available under the GNU General Public License.             #
#                                                                             #
# You can find the original archive of this program on the author's web site  #
#   http://www.maddingue.org/techie/                                          #
#                                                                             #
# and on the web site of Html2Wml on SourceForge                              #
#   http://htmlwml.sourceforge.net/                                           #
#                                                                             #
# Copyright (c)2000, 2001 Sebastien Aperghis-Tramoni                          #
# ########################################################################### #

use strict;
use CGI;
use File::Basename;
use Getopt::Long;
use HTML::Parser;
use LWP::UserAgent;
use POSIX qw(isatty);
use Text::Template;
use URI;
use URI::URL;

use vars qw($program $version);
$program = 'Html2Wml';
$version = '0.4.6 pre 5';


# --------------------------------------------------------------------------- #
# Static configuration                                                        #
#                                                                             #
#   If you want to hard-code some parameters of Html2Wml, this is the         # 
#   place to edit. Please check the document for more information.            #
#                                                                             #
my %options = (
    help           => 0,    ## show the usage and exit
    version        => 0,    ## show the program name and version and exit
    
    ## conversion options
    ascii          => 0,    ## convert named entities to US-ASCII
    collapse       => 1,    ## collapse white space characters
    compile        => 0,    ## compile WML to binary 
   'ignore-images' => 0,    ## completly ignore image links
   'img-alt-text'  => 1,    ## replace IMG tags with their ALT attribute
    linearize      => 1,    ## suppress the tables tags
    nopre          => 0,    ## don't use PRE tag
   'numeric-non-ascii' => 0,  ## convert non-ASCII characters to numeric entities
    
    ## links reconstruction options
    hreftmpl       => '{FILEPATH}{FILENAME}{$FILETYPE =~ s/s?html?/wml/o; $FILETYPE}', 
    srctmpl        => '{FILEPATH}{FILENAME}{$FILETYPE =~ s/gif|png|jpe?g/wbmp/o; $FILETYPE}', 

    ## card splitting options
   'split-card'           => 1,      ## slice the document by cards
   'split-deck'           => 0,      ## slice the document by decks
   'max-card-size'        => 1_400,  ## maximum size of data per card
   'card-split-threshold' =>    50,  ## card split threshold
   'next-card-label'      => '[&gt;&gt;]',  ## label of the link to go to the next card
   'prev-card-label'      => '[&lt;&lt;]',  ## label of the link to go to the previous card
    
    ## HTTP authentication
   'http-user'     => '',   ## HTTP user
   'http-passwd'   => '',   ## HTTP password
    
    ## debugging options
   'debug'         => 0,    ## activate the debug mode
   'xmlcheck'      => 0,    ## perform a well-formedness check (using XML::Parser)
);

# You should not edit below this line unless you know what you are doing.     #
# --------------------------------------------------------------------------- #

# 
# globals
# 
sub debug;
sub error;
sub fatal;

use vars qw($cgi);
$cgi = 0;
my $agent;   ## LWP user agent
my $result;  ## WML deck in text format
my $binary;  ## WML deck in binary format
my $xmlckres = '';
my $complres = '';

my %optname = (
   'a' => 'ascii', 
   'c' => 'compile', 
   'd' => 'debug', 
  #'h' => 'help',         ## shell only
   'n' => 'numeric-non-ascii', 
  #'o' => 'output',       ## shell only
   'p' => 'nopre', 
   'P' => 'http-passwd',  ## CGI only: passwd
   's' => 'max-card-size', 
   't' => 'card-split-threshold', 
   'U' => 'http-user',    ## CGI only: user
  #'v' => 'version',      ## shell only
  #'V' => 'version',      ## shell only
);
my %optchar = ();

## used by the html parser
use vars qw(%state);
%state = (
    doc_uri  => '',        ## document absolute URI
    self_url => '',        ## the CGI's URL for self-referencing
    output   => '',        ## buffer for storing output
    decks    => {},        ## hash that contains the decks, indexed by their id
    skip     => 0,         ## skip switch (on/off)
    stack    => [],        ## tag stack
    cardsize => 0,         ## size of the current card/deck
    cardid   => 'wdf000',  ## ID of the current card/deck (stands for "WML Document - Fragment 000")
    title    => '',        ## title of the WML deck
    encoding => '',        ## encoding of the document
    wmlvers  =>            ## WML version and identifier
        q|<!DOCTYPE wml PUBLIC "-//WAPFORUM//DTD WML 1.2//EN" "http://www.wapforum.org/DTD/wml12.dtd">|, 
);

my %entities;  ## named entities conversion table

# 
# The following two hashes are based on the WML DTD. They are the hardcoded 
# conversion tables which describe the legal syntax of WML tags. 
# 
my %dtdent = ();
    $dtdent{emph}   = 'em,strong,b,i,u,big,small';
    $dtdent{layout} = 'br';
    $dtdent{text}   = $dtdent{emph};
    $dtdent{flow}   = "$dtdent{text},$dtdent{layout},img,anchor,a,table";
    $dtdent{fields} = "$dtdent{flow},input,select,fieldset";

my %with = (
    html     => { action => 'replace',  new_value => 'wml'  }, 
    wml      => { action => 'keep',     nest => 'head,template,card'  }, 
    
    ## header tags
    head     => { action => 'keep',     nest => 'meta,access' }, 
   # meta     => { action => 'keep',     nest => 'EMPTY',  attributes => 'http-equiv,name,content' }, 
    template => { action => 'keep',     nest => 'do,onevent' }, 
    title    => { action => 'skip' }, 
    style    => { action => 'skip' }, 
    script   => { action => 'skip' }, 
    
    ## structural tags
    body     => { action => 'replace',  new_value => 'card' }, 
    card     => { action => 'keep',     nest => 'do,p,pre' }, 
    h1       => { action => 'replace',  new_value => 'p',  render => 'big,strong',  special => 'nowidow' }, 
    h2       => { action => 'replace',  new_value => 'p',  render => 'big',  special => 'nowidow'}, 
    h3       => { action => 'replace',  new_value => 'p',  render => 'strong',  special => 'nowidow' }, 
    h4       => { action => 'replace',  new_value => 'p',  special => 'nowidow' }, 
    h5       => { action => 'replace',  new_value => 'p',  special => 'nowidow' }, 
    h6       => { action => 'replace',  new_value => 'p',  special => 'nowidow' }, 
    li       => { action => 'replace',  new_value => 'p' }, 
    dt       => { action => 'replace',  new_value => 'p' }, 
    dd       => { action => 'replace',  new_value => 'p' }, 
    div      => { action => 'replace',  new_value => 'p' }, 
    p        => { action => 'keep',     nest => "$dtdent{fields},do",  attributes => 'align' }, 
    br       => { action => 'keep',     nest => 'EMPTY' }, 
    pre      => { action => 'keep',     nest => 'a,br,i,b,em,strong,input,select' }, 
    tt       => { action => 'replace',  new_value => 'pre' }, 
    
    ## tables tags
    table    => { action => 'keep',     nest => 'tr',  attributes => 'title,align' }, 
    caption  => { action => 'skip' }, 
   'tr'      => { action => 'keep',     nest => 'td' }, 
    th       => { action => 'replace',  new_value => 'td' }, 
    td       => { action => 'keep',     nest => "$dtdent{emph},$dtdent{layout},img,a,anchor" }, 
    
    ## link tags
    a        => { action => 'keep',     nest => 'br,img',  attributes => 'id,name,href,title,accesskey', 
                                        attrconv => { name => 'id' } }, 
    anchor   => { action => 'keep',     nest => 'br,go,img',  attributes => 'id,title,accesskey' }, 
    img      => { action => 'keep',     nest => 'EMPTY',  attributes => 'id,src,alt,align' }, 
    frame    => { action => 'special' }, 
    area     => { action => 'special' }, 
    
    ## style tags
    em       => { action => 'keep',     nest => $dtdent{flow} }, 
    strong   => { action => 'keep',     nest => $dtdent{flow} }, 
    b        => { action => 'keep',     nest => $dtdent{flow} }, 
    i        => { action => 'keep',     nest => $dtdent{flow} }, 
    u        => { action => 'keep',     nest => $dtdent{flow} }, 
    big      => { action => 'keep',     nest => $dtdent{flow} }, 
    small    => { action => 'keep',     nest => $dtdent{flow} }
    
    ## form tags -- currently not handled
   #'select'  => { action => 'keep',     nest => 'optgroup,option',  attributes => 'title,name,value,multiple' }, 
   # optgroup => { action => 'keep',     nest => 'optgroup,option',  attributes => 'title' }, 
   # option   => { action => 'keep',     nest => 'onevent',  attributes => 'title,value' }, 
   # input    => { action => 'keep',     nest => 'EMPTY',  attributes => 'name,type,value,title,size,maxlength'}, 
);


# 
# The following hash hardcodes the parent-lookup for each element 
# of the WML syntax, i.e. for each element, it gives the prefered 
# parent element. 
# 
my %reverse = (
    ## head tags
    wml => ' ',         head => 'wml',      meta => 'head',     
    access => 'head',   template => 'wml',  onevent => 'template', 
    
    ## structural tags
    card => 'wml',      p => 'card',        pre => 'card',      br => 'p',
    
    ## tables tags
    table => 'p',      'tr' => 'table',     td => 'tr',
    
    ## link tags
    a => 'p',           anchor => 'p',      img => 'p',
    
    ## style tags
    b => 'p',           i => 'p',           u => 'p', 
    strong => 'p',      em => 'p', 
    big => 'p',         small => 'p', 
    
    ## form tags
   'select' => 'p',     option => 'select', optgroup => 'select', 
   'do' => 'p',         input => 'p',       fieldset => 'p',
);



# 
# main
# 
$| = 1;
my $time = time;

Getopt::Long::config qw(no_auto_abbrev);

fileparse_set_fstype('Unix');  ## this is because I use fileparse() to 
                               ## split the URL fragments

## CGI security options
$CGI::POST_MAX = 1024 * 1;  # max 1K posts
$CGI::DISABLE_UPLOADS = 1;  # no uploads

load_entities();

## create the user agent
$agent = new LWP::UserAgent;
$agent->agent("[$program/$version ".$agent->agent.']');

for my $opt (keys %optname) {
    $optchar{$optname{$opt}} = $opt
}

if(@ARGV or isatty(\*STDOUT)) {
    ## launched from shell
    
    my @opts = (
      ## usage options
      qw(help|h  version|v|V  output|o=s),
      
      ## conversion options
      qw(ascii|a!  collapse!  compile|c  ignore-images  img-alt-text!  
         linearize!  nopre|p  numeric-non-ascii|n), 
      
      ## links reconstructions options
      qw(hreftmpl=s  srctmpl=s), 

      ## card splitting options
      qw(split-card  split-deck
         max-card-size|s=i  card-split-threshold|t=i 
         next-card-label=s  prev-card-label=s), 
      
      ## debugging options
      qw(debug|d:i  xmlcheck!)
    );
    
    ## getting options
    GetOptions(\%options, @opts);
    version() if $options{version};
    usage() if $options{help};
    usage() unless @ARGV;
    apply_options();
    
    ## converting the file
    $result = html2wml(shift);
    
} else {
    ## launched from web
    $cgi = new CGI;
    $agent->($cgi->agent . ' ' . $agent->agent);
    
    ## get the options
    for my $param ($cgi->param) {
        my $option = length($param) <= 2 ? $optname{$param} : $param;
        next unless exists $options{$option};
        $options{$option} = $cgi->param($param)
    }
    
    apply_options();
    
    ## creating static part of the self url
    my $cgi_options = '';
    for my $param ($cgi->param) {
        next if "url,id" =~ /\b$param\b/;
        $cgi_options .= "$param=" . ($cgi->param($param)) . ';' 
    }
    
    $state{self_url} = $cgi->url(-relative => 1) . "?$cgi_options";
    
    ## send debug header if needed
    print $cgi->header if $options{'debug'};
    
    ## execute main part
    $result = html2wml($cgi->param('url') || '/');
}


## special case: splitting by decks
if($cgi and $options{'split-deck'}) {
    $result = $state{decks}{ $cgi->param('id') || (sort keys %{$state{decks}})[0] }
}


## XML check
if($options{xmlcheck}) {
    eval {
      require XML::Parser;
      my $xmlparser = new XML::Parser Style => 'Tree', ErrorContext => 2;
      $xmlparser->parse($result);
    };
    $xmlckres = $@ ? $@ : "Well-formed";
}

## XML compile
if($options{compile}) {
    $binary = '';
    my $buf;
    
    eval {
      require IPC::Open2;
      require FileHandle;
      my $in  = new FileHandle;
      my $out = new FileHandle;
      
      ## TODO: wmlc must be patched: it should accept a '-' parameter for the output
      IPC::Open2::open2($out, $in, 'wmlc', '-', '/proc/self/fd/1');
      
      ## TODO: which method is the best one ?
      #syswrite($in, $result, length $result);
      #while(sysread($out, $buf, 1024) == 1024) { $binary .= $buf }
      print $in $result;
      $binary = join '', <$out>;
    };
    
    $complres = $@
}


if($options{'debug'}) { ## debug output
    $time = time - $time;
    my @times = times;
    $times[0] += $times[2];  ## total user time
    $times[1] += $times[3];  ## total system time
    $times[2] = $times[0] + $times[1];  ## total time
    
    my $i = 1;
    $result .= "\n";
    $result =~ s/^/@{[sprintf '%3d', $i++]}: /gm;  ## add lines number
    $result = simple_wrap($result);
    
    if($cgi) {
        print qq|<html>\n<head>\n<title>$program -- Debug Mode</title>\n|, 
              qq|<style type="text/css">\n  BODY { background-color: #ffffff}\n|, 
              qq|  .tag { color: #8811BB }\n  .attr { color: #553399 }\n </style>\n|, 
              qq|</head>\n<body>\n<h1>$program -- Debug Mode</h1>\n|, 
              qq|<p>This is the result of the conversion of the document |, 
              qq|<a href="$state{doc_uri}">$state{doc_uri}</a> by $program v$version.</p>\n|, 
              qq|<hr />\n|, 
              htmlize($result), 
              qq|<hr />\n<p>Result of XML check:</p>\n|, 
              htmlize($xmlckres); 
        
        print qq|<hr />\n<p>Result of WML compilation:</p>\n|, 
              htmlize(hextype($binary)), "\n"  if $options{compile}; 
        
        printf "<hr />\n<p>Time: $time wallclock secs (%.2f usr + %.2f sys = %.2f cpu)</p>\n", 
               @times[0..2];
        
        print qq|\n</body>\n</html>|
        
    } else {
        my $s = "$program -- Debug Mode\n";
        print $s, '-'x length($s), "\n", 
              $result, "\n", ' -'x5, "\n", 
              $xmlckres, "\n";
        print ' -'x5, "\nCompiled WML\n", ' -'x5, 
              ($complres ? "$complres\n" : hextype($binary)) 
              if $options{compile};
        print ' -'x5, "\n";
        printf "Time: $time wallclock secs (%.2f usr + %.2f sys = %.2f cpu)\n", @times[0..2];
    }
    
} else { ## normal output
    my $out = \*STDOUT;
    
    if(length $options{output}) {
        open(OUT, $options{output}) or fatal "cannot write to '$options{output}': $!\n";
        $out = \*OUT;
    }
    
    if($options{compile}) {
        print $out $cgi->header(
            -type => 'application/vnd.wap.wmlc', 
            -content_length => length $result
        ) if $cgi;
        print $out $binary;
    
    } else {
        print $out $cgi->header(
            -type => "text/vnd.wap.wml; charset=$state{encoding}", 
            -content_length => length $result
        ) if $cgi;
        print $out $result;
    }
}



# 
# apply_options()
# -------------
sub apply_options {
    if($options{linearize}) {
        delete @with{qw(table tr td th)};
        $with{'caption'} = { action => 'replace', new_value => 'p', render => 'b' };
        $with{'tr'} = { action => 'replace', new_value => 'p' };
        delete @reverse{qw(table tr td)};
    }
    
    if($options{'ignore-images'}) {
        delete $with{img};
    }
    
    if($options{'debug'}) {
        $options{xmlcheck} = 1;
    }
    
    if($options{'nopre'}) {
        delete $with{pre};
        $with{'pre'} = { action => 'replace', new_value => 'p' };
    }
    
    if($cgi) {
        $options{'split-card'} = 0;
        $options{'split-deck'} = 1;
    }
}


# 
# html2wml()
# --------
sub html2wml {
    my $url = shift;
    my $file = '';
    my $type = '';
    my $enc  = '';
    my $converter = new HTML::Parser api_version => 3;
    my $date = localtime;
    
    return unless $url;
    
    ## read the file 
    if($url =~ m{https?://}) {  ## absolute uri
        ($file,$type,$enc) = get_url($url)
    
    } elsif(not $cgi and -f $url) {  ## local file
        $file = read_file($url)
    
    } else {  ## absolute url relative to the server
	    ($file,$type,$enc) = get_url( $url = URI::URL->new($url, $cgi->url)->abs )
    }
    
    $enc ||= '';
    $enc =~ s/charset=//i;
    $state{encoding} = $enc;
    $state{doc_uri} = $url;
    
    ## try to get the document charset encoding
    if($file =~ m|meta +http-equiv.+charset=["']?([a-zA-Z0-9_-]+)['"]?|i) {
        $enc = lc $1
    }
    
    $type ||= '';
    $state{encoding} ||= $enc;
    
    ## if it's an image, call send_image()
    if($url =~ /\.(?:gif|jpg|png)$/i or $type =~ /image/) {
        @_ = ($file, $url);
        goto &send_image
    }
    
    ## get the document title
    if($file =~ m|<title>([^<]+)</title>|) {
        $state{title} = convert_entities(clean_spaces($1))
    }
    
    ## WML header
    $state{skip} = 0;
    $state{output} = join '', q|<?xml version="1.0"|, 
        ($state{encoding} ?  qq| encoding="$state{encoding}"| : ''),
        qq|?>\n$state{wmlvers}\n<!-- Converted by $program $version on $date -->\n|;
    
    ## affectation of the HTML::Parser handlers
    $converter->unbroken_text(1);
    $converter->handler(start       => \&start_tag,   'tagname, attr');
    $converter->handler(end         => \&end_tag,     'tagname');
    $converter->handler(text        => \&text_tag,    'text, is_cdata');
    $converter->handler(comment     => \&comment_tag, 'tokens');
    $converter->handler(declaration => \&default_handler, 'text');
    $converter->handler(process     => \&default_handler, 'text');
    $converter->handler(default     => \&default_handler, 'text');
    
    ## begin the conversion
    $converter->parse($file);
    $converter->eof;
    
    ## flush the stack
    while(my $tag = pop @{$state{stack}}) {
        $state{output} .= "</$tag>"
    }
    
    post_conversion_cleanup();
    
    $state{decks}{$state{cardid}} = $state{output};
    
    return $state{output}
}


# 
# post_conversion_cleanup()
# -----------------------
# 
sub post_conversion_cleanup {
    ## convert alone ampersand characters to entities
    $state{output} =~ s/\&\s/\&amp; /go;
    
    ## correct unclosed numeric entities
    $state{output} =~ s/(\&#\d+)[^\d;]/$1;/go;
    
    ## convert the named HTML entities to numeric entities
    $state{output} = convert_entities($state{output});
    
    ## convert non-ASCII characters to numeric entities
    if($options{'numeric-non-ascii'}) {
        $state{output} =~ s/([\x80-\xFF])/'&#'.ord($1).';'/eg;
    }
    
    ## escape $ chars
    $state{output} =~ s/\$/\$\$/go;
    
    collapse($state{output}) if $options{'collapse'};
    
    ## set the title of the card
    if(length $state{title}) {
        my $title = $state{title};
        $title =~ s/"/\&quot;/go;
        $title =~ s/\$/\$\$/go;
        $title =~ s/(\&#\d+)[^\d;]/$1;/go;
        $state{output} =~ s/<card/<card title="$title"/g;
    }
}


# 
# collapse()
# --------
# Collapse empty spaces and paragraphes from the given parameter
# 
sub collapse{
    ## converts CR/LF to native eol
    $_[0] =~ s/\015\012|\012|\015/\n/go;
    $_[0] =~ s|\s+>|>|go;    ## collapse spaces inside tags
    $_[0] =~ s|\s+/>|/>|go;  ## collapse spaces inside empty tags 
    $_[0] =~ s|<(\w+) +|<$1 |g;    ## collapse spaces between tag and attributes
    $_[0] =~ s|<p>\s+|<p>|go;      ## collapse spaces at the begining of a paragraph
    $_[0] =~ s|\s+</p>|</p>|go;    ## collapse spaces at the end of a paragraph
    
    ## collapse empty paragraphs
    $_[0] =~ s|<p[^>]*>\s*</p>||go;
    $_[0] =~ s|<p[^>]*>\s*(?:<br/>)+\s*</p>||go;
    $_[0] =~ s|<p[^>]*>\s*(?:\&nbsp;\s*)+</p>||go;
    $_[0] =~ s|<p[^>]*>\s*(?:\&#32;\s*)+</p>||go;
    $_[0] =~ s|<p[^>]*>\s*(?:\[IMG\]\s*)+</p>||go;
    $_[0] =~ s|<(\w+)>\s*</\1>||go;
    
    ## collapse multiple lines
    $_[0] =~ s/\n+/\n/go;
    $_[0] =~ s/(?: +\n)+/\n/go;
}


# 
# get_url()
# -------
# This function gets and returns the file from the given URI. 
# If called in a array context, returns the file content and the associated 
# MIME type (as given by the server). 
# 
sub get_url {
    my $uri = shift;
    my $quiet = shift || 0;
    my $request = new HTTP::Request GET => $uri;
    my $response = $agent->request($request);
    
    if($response->is_error) {
        if($response->status_line == 401) {
            ## Authorization required
           #print "<hr noshade>\n<pre>", $response->as_string, "</pre>\n<hr noshade>\n"; 
            my($realm) = ($response->header('WWW-Authenticate') =~ /realm=(.+)/);
            my $self = "$state{self_url}url=$state{doc_uri}";
           #print "<pre>self = $self\nrealm = $realm\nuser = $options{'http-user'}\npassword = $options{'http-passwd'}</pre>\n";
           #print "<pre>CGI Parameters\n";
           #for my $param ($cgi->param) { print "  $param = ", $cgi->param($param), "\n" }
           #print "</pre>";
           #print "<pre>Html2Wml Options\n";
           #for my $opt (sort keys %options) { print "  $opt = $options{$opt}\n" }
           #print "</pre>";
            
            if(length $options{'http-user'} and length $options{'http-passwd'}) {
               #$agent->credentials($realm, $uri, $options{'http-user'}, $options{'http-passwd'});
                $request->authorization_basic($options{'http-user'}, $options{'http-passwd'});
                $response = $agent->request($request);
                
            } else {
                print $cgi->header(-type => 'text/vnd.wap.wml'), <<"PASSFORM"; exit
<?xml version="1.0"?>
$state{wmlvers}
<wml><card title="Authentication">
<p>Please enter your user name and password for $realm. </p>
<p>User: <input name="U" type="text" emptyok="false"/><br/>
Password: <input name="P" type="password" emptyok="false"/></p>
<do type="accept"><go href="$self;U=\$(U);P=\$(P)"/></do>
</card></wml>
PASSFORM
            }
            
        } else {
            return $quiet ? '' : cgi_error(<<"ERR");
The following error occured while trying to access the URL <$uri>
Error @{[ $response->status_line ]}
ERR
        }
    }
    
    return wantarray ? ($response->content, $response->content_type,
        $response->content_encoding) : $response->content
}


# 
# read_file()
# ---------
# This function reads and returns the file from the local disk. 
# 
sub read_file {
    my $file = shift;
    my $quiet = shift || 0;
    open(FILE, $file) or ($quiet ? return '' : fatal("Can't read file '$file': $!\n"));
    local $/ = undef;
    $file = <FILE>;
    close(FILE);
    return $file
}


# 
# send_image()
# ----------
# This function allow Html2Wml to send WBMP images to the client. 
# Currently, it send an empty hardcoded image, but support for 
# conversion from common formats (GIF, JPEG, PNG) will be added soon. 
# 
sub send_image {
    my $data = shift;
    my $path = shift;
    
    my $pixel = pack 'C*', 0, 0, 1, 1, 0xFF;  ## this is one white pixel
    
    ## TODO: add the code to allow conversion using an external program
    
    print $cgi->header(-type => 'image/wbmp', -content_length => length $pixel), $pixel;
    exit
}


# 
# convert_entities()
# ----------------
# This function converts the named HTML entities into numeric entities. 
# 
sub convert_entities {
    my $text = shift;
    my $ascii = $options{ascii};
    
    my $code = q|  while($text =~ /&(\w+);/g) {                   |
             . q|      my $ent = $1;                              |
             . q|      if(exists $entities{$ent}) {               |
    .($ascii ? q|          my $chr = $entities{$ent}[1];          |
             : q|          my $chr = '&#'.$entities{$ent}[0].';'; | )
             . q|          $text =~ s/&$ent;/$chr/g               | 
             . q|      }                                          |
             . q|  }                                              |;
    
    eval $code;
    
    return $text
}


# 
# clean_spaces()
# ------------
sub clean_spaces {
    my $str = shift;
    $str =~ s/\t+/ /go;
    $str =~ s/^\s+/ /go;
    $str =~ s/ +/ /go;
    return $str
}


# 
# HTML::Parser start tag handler
# 
sub start_tag {
    my($tag, $attr) = @_;
    return unless exists $with{$tag};
    my $curr_tag = ($with{$tag}{action} eq 'replace' ? $with{$tag}{new_value} : $tag);
    my $prev_tag = scalar @{$state{stack}} ? ${$state{stack}}[-1] : 0;
    
    ## special case: replacing image with its alternative text when necessary
    if($curr_tag eq 'img' and $options{'img-alt-text'}) {
        my $alt = $attr->{alt} || $attr->{title} || $attr->{id} || $attr->{name} || '[IMG]';
        text_tag($alt) and return
    }
    
    ## special case: <frameset> frame tag
    if($tag eq 'frame') {
        if($prev_tag eq 'p') { pop @{$state{stack}}; $state{output} .= '</p>' }
        $state{output} .= qq|<p><small>Frame: <a href="$$attr{src}">$$attr{name}</a></small></p>|;
        return
    }
    
    ## special case: <area> image map tag
    if($tag eq 'area') {
        if($prev_tag eq 'p') { pop @{$state{stack}}; $state{output} .= '</p>' }
        $state{output} .= qq|<p><small>Image map: <a href="$$attr{href}">$$attr{href}</a></small></p>|;
        return
    }
    
    ## special case: when inside a <a> don't allow opening tags
    if($prev_tag eq 'a' and $with{a}{nest} !~ /\b$curr_tag\b/) {
        return
    }
    
    ## special case: <a name=".."> is replaced by <anchor id="..">
    if($curr_tag eq 'a' and not exists $attr->{href}) {
        $curr_tag = $tag = 'anchor';
        $attr->{id} = exists $attr->{id} ? $attr->{id} : $attr->{name};
        delete $attr->{name};
    }
    
    ## reconstruct well-formed attributes list with only the allowed ones
    if(scalar keys %$attr and exists $with{$curr_tag}{attributes}) {
        $attr = join ' ', 
            map { (exists $with{$tag}{attrconv}{$_} ? $with{$tag}{attrconv}{$_} : $_) 
                  . '="' . 
                  (/href|src/ ? xlate_url($attr->{$_}, $_) : convert_entities($attr->{$_})) 
                  . '"' if exists $attr->{$_} 
            } split(',', $with{$curr_tag}{attributes});
        $attr = ' ' . $attr if length $attr;
    } else {
        $attr = ''
    }
    
    ## set the skip mode state
    $state{skip} = 1 if $with{$curr_tag}{action} eq 'skip';
    
    debug [2], "\n(start tag) $tag => action: ", 
               ($with{$tag}{action} ? $with{$tag}{action} : 'clear'), 
               ($curr_tag ne $tag ? " with $curr_tag " : ''), 
               ($attr? " attributes:$attr" : ''), "\n";
    
    
    if($with{$curr_tag}{action} eq 'keep') { 
        # TODO: this part of the syntax repairing engine will have to be 
        #       re-written. Maybe a loop on the stack to check whether the 
        #       tree is correct, and in case not, insert the missing ones 
        
        if(scalar @{$state{stack}}) { 
            if($with{$curr_tag}{nest} eq 'EMPTY' and ${$state{stack}}[-1] ne $reverse{$curr_tag}) {
                push @{$state{stack}}, $reverse{$curr_tag};
                $state{output} .= "<$reverse{$curr_tag}>";
            }
            
            debug [2], "  -> syntax repair: closing tags ";
            ## syntax repair: close the tags that were left opened
            while($prev_tag = pop @{$state{stack}}) {
                if($with{$prev_tag}{nest} =~ /\b$curr_tag\b/ 
                or $with{$prev_tag}{nest} =~ /\b$reverse{$curr_tag}\b/) {
                    push @{$state{stack}}, $prev_tag;
                    last
                }
                debug [2], "</$prev_tag> ";
                $state{output} .= "</$prev_tag>";
            }
            debug [2], "\n";
        }
    
        ## syntax repair: open the tags that should have been opened
        my $outtertag = ${$state{stack}}[0] || $curr_tag;
        my @nesting_tags = ();
        
        debug [2], "  -> syntax repair: opening tags ";
        while($outtertag ne 'wml') {
            last unless $reverse{$outtertag};
            debug [2], "<$outtertag> ";
            $outtertag = $reverse{$outtertag};
            unshift @{$state{stack}}, $outtertag;
            unshift @nesting_tags, $outtertag;
        }
        
        for my $t (@nesting_tags) { $state{output} .= "<$t>" }
        debug [2], "\n";
    }
    
    ## clean up a little
    collapse($state{output}) if $options{'collapse'};
    
    ## split the card if needed
    my $remaining_size = $options{'max-card-size'} - $state{cardsize};
    if($remaining_size < $options{'card-split-threshold'} 
      and exists $with{$tag}{special} and $with{$tag}{special} =~ /nowidow/) {
        split_card()
    }
    
    ## simple tag translation
    if($with{$curr_tag}{action} eq 'keep') {
        if($with{$curr_tag}{nest} eq 'EMPTY') {
            $state{cardsize} += length($curr_tag) + length($attr);
            $state{output} .= "<$curr_tag$attr/>"
        } else {
            $state{cardsize} += length($curr_tag) + length($attr);
            $state{output} .= "<$curr_tag$attr>";
            push @{$state{stack}}, $curr_tag;
        }
    
    } else {
        ## do nothing
    }
    
    ## additional rendering effects
    if(defined $with{$tag}{render}) {  ## note that it's $tag, not $curr_tag
        for my $t (split ',', $with{$tag}{render}) {
            $state{cardsize} += length $t;
            $state{output} .= "<$t>"
        }
    }
}


# 
# HTML::Parser end tag handler
# 
sub end_tag {
    my($tag) = @_;
    return unless exists $with{$tag};
    my $curr_tag = ($with{$tag}{action} eq 'replace' ? $with{$tag}{new_value} : $tag);
    
    ## special case for anchors
    if($tag eq 'a' and ${$state{stack}}[-1] eq 'anchor') { $curr_tag = $tag = 'anchor'}
    
    debug [2], "( end tag ) $curr_tag, stack = (@{$state{stack}})\n\n";
    
    $state{skip} = 0 if $with{$tag}{action} eq 'skip';
    return if exists $with{$tag}{nest} and $with{$tag}{nest} eq 'EMPTY';
    
    ## additional rendering effects
    if(defined $with{$tag}{render}) {  ## note that it's $tag, not $curr_tag
        for my $t (reverse split ',', $with{$tag}{render}) {
            $state{cardsize} += length $t;
            $state{output} .= "</$t>"
        }
    }
    
    ## closing element
    if(${$state{stack}}[-1] eq $curr_tag  and  $with{$curr_tag}{action} eq 'keep') {
        $state{cardsize} += length $curr_tag;
        $state{output} .= "</$curr_tag> ";
        pop @{$state{stack}};
    
    } else {
        ## do nothing
    }
    
    ## clean up a little
    collapse($state{output}) if $options{'collapse'};
    
    ## check current card size
    if($state{cardsize} > $options{'max-card-size'}) {
        split_card()
    }
}


# 
# HTML::Parser text handler
# 
sub text_tag {
    my($text) = @_;
    my $curr_tag = ${$state{stack}}[-1] || '';
    
    debug [3], "(- text --) stack = (@{$state{stack}})\n- - - - -\n@_\n- - - - -\n";
    
    return if $state{skip};
    
    ## add a para tag if we're on the card node
    if($curr_tag eq 'card') {
        $state{cardsize} += 4;
        $state{output} .= "\n<p>";
        push @{$state{stack}}, 'p';
    }
    
    $text = clean_spaces($text) if $options{'collapse'} and $curr_tag ne 'pre';
    # 
    # TODO: add the code that split too long chunks of text
    # 
    collapse($text);
    $state{output} .= $text;
    $state{cardsize} += length $text;
}


# 
# HTML::Parser comment tag handler
# 
sub comment_tag {
    my($comment) = @_;
    local $_;
    
    $comment = join '', @$comment;
    
    debug [3], "( comment ) stack = (@{$state{stack}})\n    $comment\n";
    
    ## SSI engine
    if($comment =~ /^\s*\[(\w+)\s+(.*)\]\s*$/) {
        my $element = $1;
        my %attributes = map { /\G(\w+)=["']([^"']+)["']/g } split /\s+/, $2;
        
        for my $attr (keys %attributes) {
            if($attr eq 'virtual' and $attributes{virtual} !~ /^http:/) {
                $attributes{virtual} = URI::URL->new( $attributes{virtual}, $state{doc_uri} )->abs
            }
        }
        
        for($element) {
            /include/ and do {
                if(defined $attributes{virtual}) { $state{output} .= get_url($attributes{virtual}, 1) }
                elsif(defined $attributes{file}) { $state{output} .= read_file($attributes{file}, 1) }
            };
            
            /fsize/ and do {
                if(defined $attributes{virtual}) { $state{output} .= length get_url($attributes{virtual}, 1) }
                elsif(defined $attributes{file}) { $state{output} .= length read_file($attributes{file}, 1) }
            };
        }
    }
}


# 
# HTML::Parser default handler
# 
sub default_handler {
    my($text) = @_;
    debug [2], "( default ) $text\n\n";
   #$state{output} .= $text;
}


# 
# split_card()
# ----------
# This function closes the current card and creates a new one. 
# 
sub split_card {
    my @stack = @{$state{stack}};
    shift @stack;  ## shift the <wml> tag
    shift @stack;  ## shift the <card> tag
    
    my $id = $state{cardid}++;
    $state{cardsize} = 0;
    
    for my $tag (reverse @stack) { $state{output} .= "</$tag>" }
    
    my $doc_uri;
    
    if($cgi) {
        my($srv) = ($cgi->url =~ m|^(https?://[\w.-]+)/|);
        $doc_uri = $state{doc_uri}; $doc_uri =~ s/^$srv//;
    }
    
    my $link_to_next = $options{'split-deck'} ?
        "$state{self_url}url=$doc_uri;id=$state{cardid}" : "#$state{cardid}";
    
    $state{output} .= join '', qq|\n<p align="right">|, 
        qq|<do type="prev" label="$options{'prev-card-label'}"><prev/></do>|, 
        qq|<do type="accept" label="$options{'next-card-label'}"><go href="$link_to_next"/></do>|, 
        qq|</p>\n</card>\n|;
    
    if($options{'split-deck'}) {
        post_conversion_cleanup();
        $state{output} .= '</wml>';
        $state{decks}{$id} = $state{output};
        $state{output} = join '', q|<?xml version="1.0"|, 
            ($state{encoding} ?  qq| encoding="$state{encoding}"| : ''), 
            qq|?>\n$state{wmlvers}\n<wml>|; 
    }

    $state{output} .= qq|<card id="$state{cardid}">\n|;
    
    for my $tag (@stack) { $state{output} .= "<$tag>" }
}


# 
# xlate_url()
# ---------
# This function translates the given url so that the pointed document will 
# pass through this CGI for conversion when in CGI mode, or construct a url 
# that fits the needs of the webmaster using the given template, if present. 
# 
sub xlate_url {
    my $url  = shift;  ## $url is the url from a href or a src attribute
    my $type = shift;  ## $type is 'src' or 'href'
    
    ## we only treat http URLs
    return $url if $url =~ /^(\w+):/ and lc($1) !~ /https?/;
    
    ## escape some characters
    $url =~ s'[$]'%24'go;
    $url =~ s'&'&amp;'go;
    
    if($cgi) {
        ## CGI mode
        my $cgi_options = '';
        my $link = URI::URL->new($url, $state{doc_uri})->abs;
        
        my($srv) = ($cgi->url =~ m|^(https?://[\w.-]+)(:\d+)/|);
        $link =~ s/^$srv//;
        
        return "$state{self_url}url=$link"
        
    } else {
        ## shell mode
        
        ## This is where the link reconstruction engine lives...  (waah... :)
        
        if($options{"${type}tmpl"} and $url !~ m|^https?://|) { 
            ## we don't touch absolute urls
            
            my $tmpl = $options{"${type}tmpl"};
            my $uri = new URI $url, 'http';
            
            if($uri->path) {
                my($filename,$filepath,$filetype) = fileparse($uri->path, '((?:\.\w+)+)');
                
                my $init_vars = qq|{
                    sub FILEPATH { q<$filepath> }
                    sub FILENAME { q<$filename> }
                    sub FILETYPE { q<$filetype> }
                    sub URL { q<$url> }
                }|;
                
                my $new_url = new Text::Template TYPE => 'STRING', SOURCE => $init_vars.$tmpl
                    or fatal("Can't construct template: $Text::Template::ERROR\n"); 
                
                return $new_url->fill_in(HASH => {
                    'FILEPATH' => $filepath,  
                    'FILENAME' => $filename, 
                    'FILETYPE' => $filetype, 
                    'URL' => $url
                }) or fatal("$Text::Template::ERROR\n")
                
            } else {
                return $url
            }
            
        } else {
            return $url
        }
    }
}


# 
# htmlize()
# -------
# This function translate the given text into HTML
# 
sub htmlize {
    my $str = shift;
    my @res = ();
    
    ## convert special chars to entities
    $str =~ s/&/\&amp;/go;
    $str =~ s/</\&lt;/go;
    $str =~ s/>/\&gt;/go;
    
    ## add a small syntax highlighting
    $str =~ s{(\&lt;[!?/]?)(\w+)(.*?)([!?/]?\&gt;)}
             {<b>$1<span class="tag">$2</span></b><span class="attr">$3</span><b>$4</b>}gs;
    $str =~ s{\&lt;!--(.*?)--\&gt;}{\&lt;!--<i>$1</i>--\&gt;}gs;
    $str =~ s{href="([^\"]+)"}{href="<a href="$1">$1</a>"}gs;
    
    return "<pre>$str</pre>"
}


# 
# hextype()
# -------
# This function generates a human readable representation of binary data
# 
sub hextype {
    my $data = shift;            ## data to print
    my $colwidth = shift || 16;  ## width of ASCII column
    
    my $half = $colwidth/2;
    my $line = 1;
    my $out = '';
    
    while(length $data) {
        my @hex = unpack 'C'x$colwidth, substr($data, 0, $colwidth);
        substr($data, 0, $colwidth) = '';
        $out .= sprintf '%3d:  '. ((('%02x 'x$half).' ')x2) .'   ', $line++, @hex;
        $out .= sprintf ''.('%s'x$half)x2 . "\n", map { $_ > 32 ? chr : '.' } @hex; 
    }
    
    return $out
}


# 
# simple_wrap()
# -----------
# This function wraps the text given in parameter. 
# 
sub simple_wrap {
    my $orig = ref $_[0] ? $_[0] : \$_[0];
    my $text = '';
    my $curlen = 0;
    my $beg = ' 'x5;
    my $cols = 75;
    
    while($$orig =~ m/(\s*\S+\s+)/gm) {
        if($curlen + length($1) > $cols) {
            $text .= "\n$beg$1";
            $curlen = 1 + length($beg) + length($1)
        } else {
            $text .= $1;
            $curlen += length $1;
        }
        $curlen = 0 if index($1, "\n") >= 0;
    }
    
    return $text
}


# 
# load_entities()
# -------------
# 
sub load_entities {
    %entities = (
        ## Spacing characters
        nbsp     => [ 32, ' '],    ## non-breaking space (real value #160)
        ensp     => [ 32, ' '],    ## en space (real value: #8194, U+2002)
        emsp     => [ 32, ' '],    ## em space (real value: #8195, U+2003)
        thinsp   => [ 32, ' '],    ## thin space (real value: #8201, U+2009)
        zwnj     => [  0, '' ],    ## zero width non-joiner (real value: #8204, U+200C)
        zwj      => [  0, '' ],    ## zero width joiner (real value: #8205, U+200D)
        
        ## Latin Extended-A entities + Mathematical symbols
        sbquo    => [130, ','],    ## single low-9 quotation mark
        fnof     => [131, 'f'],    ## latin small f with hook = florin
        bdquo    => [130, ',,'],   ## double low-9 quotation mark
        hellip   => [133, '...'],  ## horizontal ellipsis
        dagger   => [134, ' '],    ## dagger
        Dagger   => [135, ' '],    ## double dagger
        circ     => [136, '^'],    ## modifier letter circumflex accent
        permil   => [137, 'o/oo'], ## per mille sign
        Scaron   => [138, 'S'],    ## latin capital letter S with caron
        lsaquo   => [139, '<'],    ## single left-pointing angle quotation mark
        OElig    => [140, 'OE'],   ## latin capital ligature OE
        lsquo    => [145, "'"],    ## left single quotation mark
        rsquo    => [146, "'"],    ## right single quotation mark
        ldquo    => [147, '"'],    ## left double quotation mark
        rdquo    => [148, '"'],    ## right double quotation mark
        bull     => [149, 'o'],    ## bullet
        ndash    => [150, '-'],    ## en dash
        mdash    => [151, '--'],   ## em dash
        tilde    => [152, '~'],    ## small tilde
        trade    => [153, '(tm)'], ## trademark sign
        scaron   => [154, 's'],    ## latin small letter s with caron
        rsaquo   => [155, '>'],    ## single right-pointing angle quotation mark
        oelig    => [156, 'oe'],   ## latin small ligature oe
        Yuml     => [159, 'Y'],    ## latin capital letter Y with diaeresis
        
        ## ISO-Latin-1 entities
        iexcl    => [161, '!'], 
        cent     => [162, '-c-'], 
        pound    => [163, '-L-'], 
        curren   => [164, 'CUR'], 
        yen      => [165, 'YEN'], 
        brvbar   => [166, '|'], 
        sect     => [167, 'S:'], 
        uml      => [168, '"'], 
        copy     => [169, '(c)'], 
        ordf     => [170, '-a'], 
        laquo    => [171, '<<'], 
       'not'     => [172, 'NOT'], 
        shy      => [173, '-'], 
        reg      => [174, '(R)'], 
        macr     => [175, '-'], 
        deg      => [176, 'DEG'], 
        plusmn   => [177, '+/-'], 
        sup2     => [178, '^2'], 
        sup3     => [179, '^3'], 
        acute    => [180, '\''], 
        micro    => [181, 'u'], 
        para     => [182, 'P:'], 
        middot   => [183, '.'], 
        cedil    => [184, ','], 
        sup1     => [185, '^1'], 
        ordm     => [186, '-o'], 
        raquo    => [187, '>>'], 
        frac14   => [188, ' 1/4'], 
        frac12   => [189, ' 1/2'], 
        frac34   => [190, ' 3/4'], 
        iquest   => [191, '?'], 
        Agrave   => [192, 'A'], 
        Aacute   => [193, 'A'], 
        Acirc    => [194, 'A'], 
        Atilde   => [195, 'A'], 
        Auml     => [196, 'Ae'], 
        Aring    => [197, 'A'], 
        AElig    => [198, 'AE'], 
        Ccedil   => [199, 'C'], 
        Egrave   => [200, 'E'], 
        Eacute   => [201, 'E'], 
        Ecirc    => [202, 'E'], 
        Euml     => [203, 'E'], 
        Igrave   => [204, 'I'], 
        Iacute   => [205, 'I'], 
        Icirc    => [206, 'I'], 
        Iuml     => [207, 'I'], 
        ETH      => [208, 'DH'], 
        Ntilde   => [209, 'N'], 
        Ograve   => [210, 'O'], 
        Oacute   => [211, 'O'], 
        Ocirc    => [212, 'O'], 
        Otilde   => [213, 'O'], 
        Ouml     => [214, 'Oe'], 
       'times'   => [215, '*'], 
        Oslash   => [216, 'O'], 
        Ugrave   => [217, 'U'], 
        Uacute   => [218, 'U'], 
        Ucirc    => [219, 'U'], 
        Uuml     => [220, 'Ue'], 
        Yacute   => [221, 'Y'], 
        THORN    => [222, 'P'], 
        szlig    => [223, 'ss'], 
        agrave   => [224, 'a'], 
        aacute   => [225, 'a'], 
        acirc    => [226, 'a'], 
        atilde   => [227, 'a'], 
        auml     => [228, 'ae'], 
        aring    => [229, 'a'], 
        aelig    => [230, 'ae'], 
        ccedil   => [231, 'c'], 
        egrave   => [232, 'e'], 
        eacute   => [233, 'e'], 
        ecirc    => [234, 'e'], 
        euml     => [235, 'e'], 
        igrave   => [236, 'i'], 
        iacute   => [237, 'i'], 
        icirc    => [238, 'i'], 
        iuml     => [239, 'i'], 
        eth      => [240, 'e'], 
        ntilde   => [241, 'n'], 
        ograve   => [242, 'o'], 
        oacute   => [243, 'o'], 
        ocirc    => [244, 'o'], 
        otilde   => [245, 'o'], 
        ouml     => [246, 'o'], 
        divide   => [247, '/'], 
        oslash   => [248, 'o'], 
        ugrave   => [249, 'u'], 
        uacute   => [250, 'u'], 
        ucirc    => [251, 'u'], 
        uuml     => [252, 'u'], 
        yacute   => [253, 'y'], 
        thorn    => [254, 'p'], 
        yuml     => [255, 'y'], 
    );
}


# 
# error()
# -----
sub error {
    print STDERR @_
}


# 
# fatal()
# -----
sub fatal {
    print STDERR @_;
    exit -1;
}


# 
# debug()
# -----
sub debug {
    if($options{'debug'}) {
        my $level = ref $_[0] ? shift->[0] : 1;
        print STDERR @_ if $level <= $options{'debug'}
    }
}


# 
# version()
# -------
sub version {
    print "$program/$version\n"; exit
}


# 
# usage()
# -----
sub usage {
    print STDERR <<"USAGE"; exit
usage: $0 [options] file

options: 
      --ascii               use 7 bits ASCII emulation to convert named entities
      --nocollapse          don't collapse spaces and empty paragraphs
      --hreftmpl=template   set the template for the links reconstruction engine
      --ignore-images       completly ignore image links
      --noimg-alt-text      don't replace the images by their alternative text
      --nolinearize         don't linearize the tables
      --numeric-non-ascii   convert non-ASCII characters to numeric entities
      --nopre               don't use the <pre> tag
  
      --split-card                  slice the document by cards
      --split-deck                  slice the document by decks
  -s, --max-card-size=size          set the card size upper limit
  -t, --card-split-threshold=size   set the card splitting threshold 
      --next-card-label=label       set the label of the link to the next card
      --prev-card-label=label       set the label of the link to the previous card
   
  -d, --debug       activate the debug mode
      --xmlcheck    activate the XML check: output is passed through XML::Parser
  
  -h, --help        show this help screen and exit
  -v, --version     show the program name and version and exit

Read the documentation for more information. 
USAGE
}


# 
# cgi_error()
# ---------
sub cgi_error {
    if($options{'debug'}) {
        print $cgi->header, <<"OUTPUT"; exit
<html>
<head>
<title>Html2Wml - Error</title>
</head>
<body>
<h1>Html2Wml - Error</h1>
<p>This CGI was called with incorrect parameters or an error occured 
when processing the request. Please check your request and try again </p>
<hr>
<p>@_</p>
</body>
</html>
OUTPUT
    } else {
        print $cgi->header(-type => 'text/vnd.wap.wml'), <<"OUTPUT"; exit
<?xml version="1.0"?>
$state{wmlvers}
<wml><card title="Html2Wml - Error">
<p>This CGI was called with incorrect parameters or an error occured 
when processing the request. Please check your request and try again </p>
<p>@_</p>
</card></wml>
OUTPUT
    }
}


1;

