# XML::Driver::HTML
#
# Copyright (c) 2000 Michael Koehne
#
# XML::Driver::HTML is free software. You can use and redistribute
# this copy under terms of the GNU General Public License.
use 5.006;
no warnings 'utf8' ;
package XML::Driver::HTML;
use HTML::TreeBuilder;
use strict;
use vars qw($VERSION $METHODS);
$VERSION = '0.06';
$METHODS = {
start_document => 1,
end_document => 1,
start_element => 1,
end_element => 1,
characters => 1,
comment => 1
};
$HTML::TreeBuilder::Debug = 0; # default debug level
sub new {
my $proto = shift;
my $self = ($#_ == 0) ? { %{ (shift) } } : { @_ };
my $class = ref($proto) || $proto;
bless($self, $class);
}
sub parse {
my $self = shift;
my $args = ($#_ == 0) ? { %{ (shift) } } : { @_ };
my $file;
my $result = undef;
$self->{'Source'} = $args->{'Source'} if $args->{'Source'};
$self->{'Handler'} = $args->{'Handler'} if $args->{'Handler'};
die "no Source defined" unless $self->{'Source'};
die "no Handler defined" unless $self->{'Handler'};
my $h = HTML::TreeBuilder->new;
$h->ignore_unknown(1);
$h->warn(0);
$h->{'_store_comments'}=1;
$h->{'_store_declarations'}=1;
if ($self->{'Source'}{'ByteStream'}) {
$h->parse_file($self->{'Source'}{'ByteStream'});
} elsif ($self->{'Source'}{'String'}) {
$h->parse($self->{'Source'}{'String'});
$h->eof();
} elsif ($self->{'Source'}{'SystemId'}) {
$h->parse_file($self->{'Source'}{'SystemId'});
} else {
die "no Source defined";
}
$self->{'Methods'} = {};
foreach (keys %$METHODS) {
$self->{'Methods'}{$_} = 1 if $self->{'Handler'}->can($_);
}
delete $self->{'Recode'};
$self->{'Recode'} = 1 if lc($self->{'Source'}{'Encoding'}) eq "iso-8859-1";
$self->{'Recode'} = 1 if lc($self->{'Source'}{'Encoding'}) eq "iso88591";
$self->{'Recode'} = 1 if lc($self->{'Source'}{'Encoding'}) eq "latin1";
$self->{'Handler'}->start_document()
if $self->{'Methods'}{'start_document'};
$self->dumptree($h);
$result = $self->{'Handler'}->end_document()
if $self->{'Methods'}{'end_document'};
$h = $h->delete(); # nuke it!
return $result;
}
sub dumptree {
my ($self,$element) = @_;
my $tag = $element->{'_tag'};
my $cont = $element->{'_content'};
my $attr = {};
my $value;
return if $tag eq "style";
if ($tag) {
if ($self->{'Recode'}) {
foreach (keys %$element) {
if ($_ !~ '^_') {
$value = $element->{$_};
$value =~
s/([\x80-\xFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg;
$attr->{$_} = $value;
}
}
} else {
foreach (keys %$element) {
$attr->{$_} = $element->{$_}
if $_ !~ '^_';
}
}
if ($tag !~ /^\~/) {
$self->{'Handler'}->start_element( {
'Name' => $tag, 'Attributes' => $attr
} )
if $self->{'Methods'}{'start_element'};
foreach (@$cont) {
if (ref $_ eq 'HTML::Element') {
$self->dumptree($_)
} else {
s/([\x80-\xFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg
if $self->{'Recode'};
$self->{'Handler'}->characters( { 'Data' => $_ } )
if ($self->{'Methods'}{'characters'} && (lc($_) !~ "^[ \t]*{'Handler'}->end_element( { 'Name' => $tag } )
if $self->{'Methods'}{'end_element'};
}
if ($tag eq "~comment") {
$self->{'Handler'}->comment( { 'Data' => $attr->{'text'} } )
if $self->{'Methods'}{'comment'};
}
}
}
1;
__END__
=head1 NAME
XML::Driver::HTML - SAX Driver for non wellformed HTML.
=head1 SYNOPSIS
use XML::Driver::HTML;
$driver = new XML::Driver::HTML(
'Handler' => $some_sax_filter_or_handler,
'Source' => $some_PerlSAX_like_hash
);
$driver->parse();
or
use XML::Driver::HTML;
$driver = new XML::Driver::HTML();
$driver->parse(
'Handler' => $some_sax_filter_or_handler,
'Source' => $some_PerlSAX_like_hash
);
$driver->parse(
'Handler' => $some_other_sax_filter_or_handler,
'Source' => $some_other_source
);
=head1 DESCRIPTION
XML::Driver::HTML is a SAX Driver for HTML. There is no need for the HTML
input to be weel formed, as XML::Driver::HTML is generating its SAX events
by walking a HTML::TreeBuilder object. The simplest kind of use, is a
filter from HTML to XHTML using XML::Handler::YAWriter as a SAX Handler.
my $ya = new XML::Handler::YAWriter(
'Output' => new IO::File ( ">-" ),
'Pretty' => {
'NoWhiteSpace'=>1,
'NoComments'=>1,
'AddHiddenNewline'=>1,
'AddHiddenAttrTab'=>1,
}
);
my $html = new XML::Driver::HTML(
'Handler' => $ya,
'Source' => { 'ByteStream' => new IO::File ( "<-" ) }
);
$html->parse();
=head2 METHODS
=over
=item new
Creates a new XML::Driver::HTML object. Default options
for parsing, described below, are passed as key-value
pairs or as a single hash. Options may be changed
directly in the object.
=item parse
Parses a document. Options, described below, are
passed as key-value pairs or as a single hash.
Options passed to B override the default options
in the parser object for the duration of the parse.
=back
=head2 OPTIONS
The following options are supported by XML::Driver::HTML :
=over
=item Handler
Default SAX Handler to receive events
=item Source
Hash containing the input source for parsing.
The `Source' hash may contain the following parameters:
=over
=item ByteStream
The raw byte stream (file handle) containing the document.
=item String
A string containing the document.
=item SystemId
The system identifier (URL) of the document.
=item Encoding
A string describing the character encoding.
=back
If more than one of `ByteStream', `String', or `SystemId',
then preference is given first to `ByteStream', then
`String', then `SystemId'.
=back
=head1 NOTES
XML::Driver::HTML requires Perl 5.6 to convert from ISO-8859-1 to UTF-8.
=head1 BUGS
not yet implemented:
Interpretation of SystemId as being an URI
XHTML document type
other bugs:
HTML::Parser and HTML::TreeBuilder bugs concerning DOCTYPE and CSS.
Perl handling of UFT8 is compatible between different versions. So
you need exactly Perl 5.6.0, not lower not higher.
=head1 AUTHOR
Michael Koehne, Kraehe@Copyleft.De
(c) 2001 GNU General Public License
=head1 SEE ALSO
L and L
=cut