
package HTML::WebMake::MSHTMLtoHTML;

# -------------------------------------------------------------------------

sub convert {
  my ($self, $contobj, $txt) = @_;

  # use John Walker's demoroniser first off
  $txt = demoronise($txt);

  # first, trim off <head> and tails.  Set a metadatum from the title
  # if possible.
  if ($txt =~ s/^(.*?)(<body\b[^>]*>)//is) {
    my $head = $1;
    if ($head =~ /<title>(.*?)<\/title>/i) {
      $txt = "<wmmeta name='Title'>". $1 . "</wmmeta>" . $txt;
    }
  }
  $txt =~ s{</body>.*?$}{}gis;

  # <![if ...]>...<![endif]>
  $txt =~ s{<!\[if\s.+?\]>}{}gs;
  $txt =~ s{<!\[endif\]>}{}gs;

  # remove Mso classes on HTML tags
  $txt =~ s{<(\S+)\s+class=\"Mso\S+\"}{<$1}gis;

  # Office-namespace tags
  $txt =~ s{</?o:\S+(?:\s[^>]+|)>}{}gis;

  # Word-namespace tags
  $txt =~ s{</?w:\S+(?:\s[^>]+|)>}{}gis;

  $txt =~ s{<span\s+style="([^"]+)">(.*?)</span>}{
    _mshtml_fix_styles('span', $1, $2);
  }gies;
  $txt =~ s{<span\s+style='([^']+)'>(.*?)</span>}{
    _mshtml_fix_styles('span', $1, $2);
  }gies;

  # nbsp's at the end of a paragraph
  $txt =~ s{(?:\&nbsp;)+(<\/p>)}{$1}gis;

  # empty tags
  foreach my $tag (qw(b i u em font small big strong code div ul ol
      blockquote h1 h2 h3 h4 h5 h6 pre table))
  {
    $txt =~ s{
      <${tag}(?:\s+[^>]*|\s*)>
      (?:\s+|<\s*br\s*/?>|\&nbsp;)*
      <\/${tag}>
    }{}gisx;
  }

  $txt;
}

sub _mshtml_fix_styles {
  my ($tag, $style, $text) = @_;
  $style =~ s/\s+/ /gs;

  my @styles = split (/\s*;\s*/s, $style);
  my @newstyles = ();
  foreach my $st (@styles) {
    if ($st !~ /^mso-/) { push (@newstyles, $st); }
  }

  if (scalar @newstyles == 0) {
    return $text;
  } else {
    return "<$tag style=\"". join (';', @newstyles). "\">$text</$tag>";
  }
}

# from John Walker's demoroniser:
#
#           De-moron-ise Text from Microsoft Applications
# 
#                   by John Walker -- January 1998
#                      http://www.fourmilab.ch/
#
#               This program is in the public domain.
#
sub demoronise {
    local($s) = @_;
    local($i, $c);

    #   Eliminate idiot MS-DOS carriage returns from line terminator

    $s =~ s/\s+$//;
    $s .= "\n";

    #   Map strategically incompatible non-ISO characters in the
    #   range 0x82 -- 0x9F into plausible substitutes where
    #   possible.

    $s =~ s/\x82/,/g;
    $s =~ s-\x83-<em>f</em>-g;
    $s =~ s/\x84/,,/g;
    $s =~ s/\x85/.../g;

    $s =~ s/\x88/^/g;
    $s =~ s-\x89- <B0>/<B0><B0>-g;

    $s =~ s/\x8B/</g;
    $s =~ s/\x8C/Oe/g;

    $s =~ s/\x91/`/g;
    $s =~ s/\x92/'/g;
    $s =~ s/\x93/"/g;
    $s =~ s/\x94/"/g;
    $s =~ s/\x95/*/g;
    $s =~ s/\x96/-/g;
    $s =~ s/\x97/--/g;
    $s =~ s-\x98-<sup>~</sup>-g;
    $s =~ s-\x99-<sup>TM</sup>-g;

    $s =~ s/\x9B/>/g;
    $s =~ s/\x9C/oe/g;

    #   Now check for any remaining untranslated characters.

    if ($s =~ m/[\x00-\x08\x10-\x1F\x80-\x9F]/) {
        for ($i = 0; $i < length($s); $i++) {
            $c = substr($s, $i, 1);
            if ($c =~ m/[\x00-\x09\x10-\x1F\x80-\x9F]/) {
		my $err = sprintf "untranslated character 0x%02X in MS-HTML input\n", unpack('C', $c);
		warn $err;
            }
        }
    }
    #   Supply missing semicolon at end of numeric entity if
    #   Billy's bozos left it out.

    $s =~ s/(&#[0-2]\d\d)\s/$1; /g;

    #   Fix dimbulb obscure numeric rendering of &lt; &gt; &amp;

    $s =~ s/&#038;/&amp;/g;
    $s =~ s/&#060;/&lt;/g;
    $s =~ s/&#062;/&gt;/g;

    #   Fix unquoted non-alphanumeric characters in table tags

    $s =~ s/(<TABLE\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;
    $s =~ s/(<TD\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;
    $s =~ s/(<TH\s.*)(WIDTH=)(\d+%)(\D)/$1$2"$3"$4/gi;

    #   Correct PowerPoint mis-nesting of tags

    $s =~ s-(<Font .*>\s*<STRONG>.*)(</FONT>\s*</STRONG>)-$1</STRONG></Font>-gi;

    #   Translate bonehead PowerPoint misuse of <UL> to achieve
    #   paragraph breaks.

    $s =~ s-<P>\s*<UL>-<p>-gi;
    $s =~ s-</UL><UL>-<p>-gi;
    $s =~ s-</UL>\s*</P>--gi;

    #   Repair PowerPoint depredations in "text-only slides"

    $s =~ s-<P></P>--gi;
    $s =~ s- <TD HEIGHT=100- <tr><TD HEIGHT=100-ig;
    $s =~ s-<LI><H2>-<H2>-ig;

    $s;
}

# -------------------------------------------------------------------------

1;
