#===========================================================================

package Sitescooper::HTMLLobotomize;

=head1 NAME

Sitescooper::HTMLLobotomize - Apply extreme simplification to a HTML page

=head1 SYNOPSIS

  use Sitescooper::HTMLLobotomize;
  [TODO]

=head1 DESCRIPTION

C<Sitescooper::HTMLLobotomize> will apply a selection of techniques
to reduce a HTML page to an extreme simplification thereof, suitable
for display on a limited browser such as a Palm handheld HTML viewer
or NCSA Mosaic circa 1995.

=head1 SEE ALSO

L<sitescooper(1)>

=head1 COPYRIGHT

Copyright (c) 2000 Justin Mason, All rights reserved.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 AUTHOR

Justin Mason <jm-cpan@jmason.org>

=cut

require Exporter;
use Carp;
use URI::URL;

use strict;

use vars       qw(
		@ISA @EXPORT $VERSION

		$BUILT_TAG_REGEXPS
		@TAGS_TO_CLOSE %OPEN_TAG_REGEXPS
		%CLOSE_TAG_REGEXPS
		);

@TAGS_TO_CLOSE = qw(blockquote b h1 h2 h3 h4 h5 h6 div em
		      a i u code small big strong pre ul ol
		      q abbr acronym address cite del ins
		      s strike sub sup samp kbd var plaintext
		      listing xmp font td tr table);

%OPEN_TAG_REGEXPS = ();
%CLOSE_TAG_REGEXPS = ();

@ISA = qw(Exporter);
@EXPORT= qw();
$VERSION = "0.1";
sub Version { $VERSION; }

sub new {
  my $class = shift; $class = ref($class) || $class;

  my $self = {
    	'page'	=> undef,
	'url_regexp_cache' => { },
    	'url'	=> 'file:///'
    };

  bless ($self, $class);
  $self;
}

# ---------------------------------------------------------------------------

=item

Set the URL for this page. Any other URLs found will be considered
relative to this.

=cut

sub set_url {
  my ($self, $url) = @_;
  $self->{url} = $url;
}

# ---------------------------------------------------------------------------

=item

Set the text for the page.  This is mandatory.

=cut

sub set_text {
  my ($self, $page) = @_;
  $self->{page} = $page;
}

# ---------------------------------------------------------------------------

=item

Get the page's text.

=cut

sub get_text {
  my ($self) = @_;
  $self->{page};
}

# ---------------------------------------------------------------------------

=item

Set a callback to receive debug messages.  If undef, no debug output
will be delivered.

=cut

sub set_debug_callback {
  my ($self, $func) = @_;
  $self->{dbgcb} = $func;
}

# ---------------------------------------------------------------------------

=item

Set a regular expression for which IMG tags with ALT attributes will be
replaced with that ALT text.  If the image SRC url matches this pattern,
the ALT text is used.

=cut

sub set_alt_tags_allowed_regexp {
  my ($self, $re) = @_;
  $self->{alt_tags_allowed_regexp} = $re;
}

# ---------------------------------------------------------------------------

=item

Set a regular expression for which IMG tags are maintained as-is, ie.
not stripped or converted to their ALT text.

=cut

sub set_keep_img_regexp {
  my ($self, $re) = @_;
  $self->{keep_img_regexp} = $re;
}

# ---------------------------------------------------------------------------

=item

Set a function used to load images, or track which images need to be loaded
later.  If you set a regular expression with set_keep_img_regexp(), this
callback is what gets called when an IMG SRC url matches that regexp.

The function must accept three arguments, the referrer URL, the URL of the
image and the entire contents of the IMG tag, and must return a URL to use
instead of the SRC url in the converted IMG tag.

=cut

sub set_image_loader {
  my ($self, $sub) = @_;
  $self->{img_loader} = $sub;
}

# ---------------------------------------------------------------------------

=item

Convert an RSS page to rudimentary HTML.  You only need to call this if
the page is actually RSS data.

=cut

sub rss_to_html {
  my ($self) = @_;
  my $page = $self->{page};

  # Convert the RSS formatting into a nice HTML display
  $page =~ s{<channel>(.*?)<title>(.*?)<\/title>(.*?)<\/channel>}{
    <h2>$2<\/h2> $1 $3
  }gis;

  my $link;
  $page =~ s/<link>(.*?)<\/link>/
    $link = $1; $link =~ s,^.*<url>(.*)<\/url>.*$,$1,g;
    $link = $self->absolute_url ($link);
    '(<a href='.$link.'>'.$link.'<\/a>)';
  /gies;	#/

  $page =~ s,<title>(.*?)<\/title>,<p><em>$1<\/em><\/p> ,gis;
  $page =~ s,<channel>,<ul>,gis; $page =~ s,<\/channel>,<\/ul>,gis;
  $page =~ s,<item>,<li>,gis; $page =~ s,<\/item>,<\/li>,gis;
  $page =~ s,<description>,<p>,gis; $page =~ s,<\/description>,<\/p>,gis;
  $page =~ s,<language>.*?<\/language>, ,gis;

  # the description is converted for RSS 0.91 sites -- the "fat" format
  $page =~ s,<description>(.*?)<\/description>,$1 ,gis;

  $self->{page} = $page;
}

# ---------------------------------------------------------------------------

=item

Strip advanced tags, stylesheets, Javascript, applets, comments etc.
This is a good first step in the conversion process.

=cut

sub strip_advanced_tags {
  my ($self) = @_;
  my $page = $self->{page};

  # strip tags we know we do not want
  # modified by Albert K T Hui <avatar /at/ deva.net>: allow text in
  # <head> tag, netscape will display it so so should we.
  #
  $page =~ s/<head(?:\s+[^>]+|)>(.*?)<\/head>/$1<br>/gis;
  $page =~ s/<(?:html|body)(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/(?:html|body)>/ /gis;
  $page =~ s/<iframe(?:\s+[^>]+|)>.*?<\/iframe>/ /gis;
  $page =~ s/<ilayer(?:\s+[^>]+|)>.*?<\/ilayer>/ /gis;
  $page =~ s/<layer(?:\s+[^>]+|)>.*?<\/layer>/ /gis;
  $page =~ s/<\/?frame(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<\/?frameset(?:\s+[^>]+|)>/ /gis;
  $page =~ s/<script(?:\s+[^>]+|)>.*?<\/script>/ /gis;
  $page =~ s/<style(?:\s+[^>]+|)>.*?<\/style>/ /gis;	# not yet
  $page =~ s/<!--.*?-->/ /gis;			# MSIE-style comments
  $page =~ s/<!--[^>]+>/ /gis;			# Netscape-style comments
  $page =~ s/<form(?:\s+[^>]+|)>.*?<\/form>/ /gis;
  $page =~ s/<image(?:\s+[^>]+|)>.*?<\/image>/ /gis;	# RDF tag
  $page =~ s/<channel(?:\s+[^>]+|)>.*?<\/channel>/ /gis;	# RDF tag
  $page =~ s/<map(?:\s+[^>]+|)>.*?<\/map>/ /gis;
  $page =~ s/<applet(?:\s+[^>]+|)>.*?<\/applet>/ /gis;
  $page =~ s/<item(?:\s+[^>]+|)>.*?<\/item>/ /gis;	# some RDF items
  $page =~ s/<link(?:\s+[^>]+|)>.*?<\/link>/ /gis;	# some RDF items
  $page =~ s/<title(?:\s+[^>]+|)>.*?<\/title>/ /gis;	# some RDF items
  $page =~ s/<!doctype\s+[^>]+>/ /gis;
  $page =~ s/<meta\s+[^>]+>/ /gis;
  $page =~ s/<link\s+[^>]+>/ /gis;	# reported by Olivier Lamer
  $page =~ s/<base\s+[^>]+>/ /gis;
  $self->{page} = $page;
}

# ---------------------------------------------------------------------------

=item

Clean inline images from the page.
A good step after strip_advanced_tags().

=cut

sub clean_inline_images {
  my ($self) = @_;

  $self->{page} =~ s{<a\s+([^>]*)>}{
    	$self->clean_inline_images_in_a_hrefs ($self->{url}, $1)
  }gies;
  $self->{page} =~ s{<img\s+([^>]*)>}{
    	$self->clean_inline_images_work ($self->{url}, $1)
  }gies;
}

# ---------------------------------------------------------------------------

=item

Try to add closing tags, since we may have stripped off the original ones, or
the page itself could be broken.  In sitescooper, this allows us to ensure
that page formatting is returned to the baseline between pages.

=cut

sub balance_tags {
  my ($self) = @_;
  my $page = $self->{page};

  # we cache these regexps for speed.
  if (!$BUILT_TAG_REGEXPS) {
    foreach my $tag (@TAGS_TO_CLOSE) {
      $OPEN_TAG_REGEXPS{$tag} = qr/<${tag}(?:\s[^>]*|)\s*>/i;
      $CLOSE_TAG_REGEXPS{$tag} = qr/<\/${tag}\s*>/i;
    }
    $BUILT_TAG_REGEXPS = 1;
  }

  my @pre = ();
  my @post = ();

  study $page;
  foreach my $tag (@TAGS_TO_CLOSE) {
    my $openers = 0;
    my $closers = 0;

    while ($page =~ m/$OPEN_TAG_REGEXPS{$tag}/g) { $openers++; }
    while ($page =~ m/$CLOSE_TAG_REGEXPS{$tag}/g) { $closers++; }

    if ($openers < $closers) {
      my $txt = "<".$tag.">";
      for (; $openers != $closers; $openers++) {
	$self->dbg ("re-adding stripped opening tag: $txt");
	unshift (@pre, $txt);
      }

    } elsif ($openers > $closers) {
      my $txt = "</".$tag.">";
      for (; $openers != $closers; $closers++) {
	$self->dbg ("re-adding stripped closing tag: $txt");
	push (@post, $txt);
      }
    }
  }

  $self->{page} = join('', @pre).$page.join('', @post);
}

# ---------------------------------------------------------------------------

=item

Convert PRE text to plain HTML as much as possible.  This is probably only
useful for display on small screens, where 80-column text will not fit,
and therefore for which normal PRE text is not acceptable.

=cut

sub convert_pre_to_html {
  my ($self) = @_;

  # convert <pre> text to proper HTML, it displays better.
  $self->{page} =~ s/<pre>(.*?)<\/pre>/$self->clean_pre_work($1);/gies;
  $self->{page} =~ s/<code>(.*?)<\/code>/$self->clean_pre_work($1);/gies;
}

# ---------------------------------------------------------------------------

=item

Do some general HTML cleanup, including tidying whitespace and line endings.
This is not useful if you will be converting to text, but it's good for HTML
viewers.

=cut

sub clean_whitespace_and_colors {
  my ($self) = @_;
  my $page = $self->{page};

  # strip all existing line breaks, they will just confuse matters
  # when we convert to text or HTML. It's also easier to do proper diffs
  # when we control the placement of newlines.
  $page =~ s/[\r\n]+/ /gs;

  # This works around a bug (I think) in iSilo that makes Wired News
  # indexes look rotten. Shouldn't be harmful in general anyway.
  $page =~ s/<br>\s*\&nbsp;\s*<br>/<br><br>/gis;

  # clean up useless tags and whitespace at the start and end of the text.
  1 while $page =~ s,^\s*<(?:br|hr|/th|/td|/table|/p|/tr|/h\d|/div)\s*[^>]*>,,gis;
  1 while $page =~ s,<(?:br|hr|th|td|table|p|tr|h\d|div)\s*[^>]*>\s*$,,gis;

  # remove now-empty table items, text markup, paragraphs etc.  the
  # ordering of the tags in the foreach loop is important; strip the
  # "smallest" ones first. (actually, do not do td's, they can
  # affect the formatting quite a lot.)
  #
  # TODO - this is currently offline - some HTML will cause an infinite
  # loop in perl 5.004's regular expression implementation.
  #
  if (defined $self->{strip_empty_tag_sets}) {
    my $tag;
    foreach $tag (qw(b i u em font small big strong code div ul ol
	blockquote h1 h2 h3 h4 h5 h6 pre table))
    {
      $page =~ s{(<\s*${tag}(?:\s+[^>]*|\s*)>(?:\s+|<\s*br\s*>|\&nbsp;)*<\s*\/\s*${tag}\s*>)}{
	$self->dbg ("stripping now-empty tag set: $1");
      }gies;
    }
  }

  # since we're rendering to HTML, line breaks are OK. Put them back in!
  $page =~
    s,(<(?:br|p|hr|table|td|/td|th|/th|/table|/p|/tr|/h\d|/div)\s*[^>]*>),$1\n,gis;

  # strip colors.
  $page =~ s/(<\S+\s*[^>]*\s)
  	(?:bg|fg|border|)color\s*=\s*[\"\']?[-_\#0-9a-z]+[\"\']?
	/$1/gisx;

  $self->{page} = $page;
}

# ---------------------------------------------------------------------------

=item

Rearrange tables according to the argument.  If it is 'keep', the tables will
be left alone; 'flatten' will strip out table formatting info entirely; and
'list' will use the Exten::Table module (if available) to reformat them into
lists.

=cut

sub rearrange_tables {
  my ($self, $proc) = @_;

  if (defined ($proc) && $proc eq 'keep') {
    return 1;			# do nothing

  } elsif (!defined ($proc) || $proc eq 'flatten') {
    # strip out all table parameters
    $self->{page} =~ s/<\s*(?:td|tr|th|table)(?:\s[^>]*|)>/<p>/gis;
    $self->{page} =~ s/<\/\s*(?:td|tr|th|table)(?:\s[^>]*|)>/<p>/gis;
    # collapse <p><p><p>... trails
    $self->{page} =~ s/<p>\s*(?:<p>\s*|<br>\s*)+/<p>/gis;
    return 1;

  } elsif ($proc eq 'list') {
    my $page = $self->{page};
    eval '
      require Exten::Table;
      my $parser = Exten::Table->new();
      $parser->parse($page);
      $page = $parser->get_result();
    1; ' or croak "Failed to rearrange_tables: $@";
    $self->{page} = $page;
    return 1;

  } else {
    warn ("TableProcess \"$proc\" invalid");
    return 0;
  }
}

# ---------------------------------------------------------------------------

=item

Convert to plain text.

=cut

sub convert_to_text {
  my ($self) = @_;
  my $page = $self->{page};

  # We're converting to DOC or text format, so we need to do a lot
  # more work here.
  # a sidebar enclosed by a table? separate it from the rest of the text.
  $page =~ s/<\/tr>/\n\n/gis;
  $page =~ s/<\/table>/\n\n/gis;	# end of <table>
  $page =~ s/<\/pre>/\n\n/gis;	# end of <pre> text
  $page =~ s/<(?:\/h\d|h\d)(?:\s+[^>]+|)>/\n\n/gis;	# headings
  $page =~ s/<\/?blockquote(?:\s+[^>]+|)>/\n\n/gis;	# quotes
  $page =~ s/<hr(?:\s+[^>]+?|)>/\n\n/gis;	# horiz lines
  $page =~ s/<br(?:\s+[^>]+?|)>/\n/gis;	# end-of-line markers
  $page =~ s/<li(?:\s+[^>]+?|)>/\n/gis;	# list items

  $page =~ s/<\/?p(?:\s+[^>]+?|)>/\n\n/gis;
  # don't worry, multiple blank lines are sorted later

  $page =~ s/<\/td>/\n/gis;		# end-of-table-item

  1 while ($page =~ s/<[^>]+?>//gs);	# trim all other tags
  HTML::Entities::decode_entities($page);

  $self->{page} = $page;
}

# ---------------------------------------------------------------------------

=item

Fix some HTML entities that aren't official entities, but do appear now and
again. Also canonicalise whitespace.

=cut

# Convert some HTML entities that the viewers can't handle.
sub fix_entities {
  my ($self) = @_;
  my $page = $self->{page};

  $page =~ s/\&apos;/\'/gi;	# confuses iSilo
  $page =~ s/\&\#150;/-/gi;	# bad Industry Standard - no cookie!

  $page =~ s/[ \t]+/ /g;	# canonicalise down to one space
  $page =~ s/\n /\n/gs;		# leading w/s on each line
  $page =~ s/\n{3,}/\n\n/gs;	# too many blank lines
  $page =~ s/^\s+//gs;		# blank space at start of story
  $page =~ s/\s+$//gs;		# blank space at end of story

  $self->{page} = $page;
}

# ---------------------------------------------------------------------------

sub match_url {
  my ($self, $url, $pat) = @_;
  my $re;
  $re = $self->{url_regexp_cache}->{$pat};
  if (!defined $re) {
    $re = $self->{url_regexp_cache}->{$pat} = qr{^${pat}(?:\#|$)};
  }

  $url =~ $re;
}

sub absolute_url {
  my $self = $_[0];
  local ($_) = $_[1];

  s/^\s*\"//; s/\"\s*$//;       # trim quotes if necessary
  s/^\s*\'//; s/\'\s*$//;       # dodgy quotes
  s/^%22//; s/%22.*?$//;        # trim escaped quotes (!!)

  # HTML escapes are not supposed to be in URLs, but they do get there
  s/&amp;/&/g;      

  if (/^[^\/]+:/) {
    # if the URL begins with protocol:, assume it's already absolute.
    return $_;
  }

  require URI::URL;
  my $url = new URI::URL ($_, $self->{url});
  $url->abs->as_string;
}

# try to preserve images used as capital letters starting a story. NYTimes
# does this. Also track down images we want to keep.
sub clean_inline_images_work {
  my $self = shift;
  my $url = shift;
  my $tag = shift;

  my $usealt = $self->{alt_tags_allowed_regexp};
  if (defined ($usealt) &&
	$tag =~ /(?:^|\s)src\s*=\s*[\"\']?([^\"\'> ]+?)[\"\']?(?:$|\s)/is)
  {
    my $src = $1;

    if ($tag =~ /(?:^|\s)alt\s*=\s*\"([^\"]+)\"(?:$|\s)/is
      || $tag =~ /(?:^|\s)alt\s*=\s*\'([^\']+)\'(?:$|\s)/is
      || $tag =~ /(?:^|\s)alt\s*=\s*([^>\s]+?)(?:$|\s)/is)
    {
      my $alt = $1;

      $src = $self->absolute_url ($src);
      if ($self->match_url ($src, $usealt)) {
	$self->dbg ("using alt tag \"$alt\" for img: $src");
	return $alt;
      } else {
	# $self->dbg ("not using alt tag \"$alt\" for img: $src");
      }
    }
  }

  if ($tag =~ /(?:^|\s)alt\s*=\s*[\"\']?([A-Za-z0-9])[\"\']?(?:$|\s)/is) {
    $self->dbg ("converting one-letter img to letter: $1");
    return $1;
  }

  my $imgurl = $self->{keep_img_regexp};
  if (defined ($imgurl)) {
    if ($tag =~ /(?:^|\s)src=[\"\']?([^\"\'> ]+)[\"\']?(?:$|\s)/is) {
      my $src = $1;

      $src = $self->absolute_url ($src);
      if ($self->match_url ($src, $imgurl)) {
	$self->dbg ("keeping img: $src");
	$tag =~ s/(?:^|\s)src=[\"\']?[^\"\'> ]+[\"\']?(?:$|\s)/ /gis;

	my $newimgtag;
	if (defined $self->{img_loader}) {
	  $newimgtag = &{$self->{img_loader}} ($self->{url}, $src, $tag);
	} else {
	  carp "image_loader not set!";
	}

	if (defined $newimgtag) {
	  return $newimgtag;
	} else {
	  return "";
	}
      }
    }
  }

  " ";
}

sub clean_inline_images_in_a_hrefs {
  my $self = shift;
  my $url = shift;
  my $tag = shift;

  my $imgurl = $self->{keep_img_regexp};
  if (!defined ($imgurl)
      || $tag !~ /(?:^|\s)href=[\"\']?([^\"\'> ]+)[\"\']?(?:$|\s)/is)
  {
    return "<a $tag>";
  }

  my $src = $1;

  # for this code, ensure the URL has an image type in the filename.
  # don't $-anchor the pattern, as that would block some mapping-type
  # image URLs, such as http://foo/foo.gif?x=24&y=34 .
  #
  $src = $self->absolute_url ($src);
  if (!$self->match_url ($src, $imgurl)
    	|| $src !~ /\.(?:gif|png|jpeg|jpg|jpe)/i)
  {
    return "<a $tag>";
  }

  $self->dbg ("keeping img in A HREF: $src");
  $tag =~ s/(?:^|\s)href=[\"\']?[^\"\'> ]+[\"\']?(?:$|\s)/ /gis;

  my $relative;
  if (defined $self->{img_loader}) {
    $relative = &{$self->{img_loader}} ($self->{url}, $src, $tag);
    # that method will turn it into an <img src=...> tag. Just
    # extract the image file's URL.
    $relative =~ s/^.*src=\"(\S+)\".*$/$1/g;
  } else {
    carp "image_loader not set!";
  }

  if (defined $relative) {
    return "<a href=\"".$relative."\" $tag>";
  } else {
    return "<a href=\"".$src."\" $tag>";
  }
}

sub clean_pre_work {
  my ($self, $txt) = @_;
  $txt =~ s/[ \t]+\n/\n/g;
  $txt =~ s/<(|\/)(pre|code)>//g;	# strip extra <pre> tags!

  # convert blank lines to a paragraph separator.
  $txt =~ s/\n{1,}\n/<p>\n\n/g;

  # The idea with this one is to add a <br> at the end of lines shorter
  # than 50 columns, and conversely to allow lines longer than 50 cols to
  # run into the next line as if they were part of a paragraph.  I'm not
  # sure about it, but a lot of <pre> sites are just copies of emails, so
  # it can make them look a lot better, since the Palm's screen is a
  # lot narrower than 80 columns (which is what most <pre> pages aim for).
  # REVISIT - Make this a .site file parameter?
  $txt =~ s/\n\s*(.+[<>].+)\s*\n/<br>\n$1<br>\n/g;
  $txt =~ s/\n\s*([^\n]{1,50})\s*\n/\n$1<br>\n/g;

  $txt =~ s/[ \t]+/ /g;
  $txt;
}

sub dbg {
  my $self = shift;
  if (defined $self->{dbgcb}) {
    &{$self->{dbgcb}} (@_);
  }
}

1;
