#
# htmlscan.rb
#
#   Copyright (C) Ueno Katsuhiro 2000,2001
#
# $Id: htmlscan.rb,v 1.5 2001/01/04 12:36:04 katsu Exp $
#

require 'xmlscan'


class XMLScanner

  module HTML

    private

    # Returns the replacement text registered for the entity name +ref+
    # in the PredefinedEntity table (the table itself is defined outside
    # this file).  The result is whatever the table yields for that name;
    # callers in this module treat a false/nil result as "unknown entity".
    def entityref_literal(ref)
      literal = PredefinedEntity[ref]
      literal
    end


    # Expands entity and character references inside an attribute value,
    # modifying +val+ in place.
    #
    # key:: attribute name; only used to label unexpanded references.
    # val:: attribute value String, or nil for an attribute without a
    #       value (nil is left untouched).
    #
    # A reference is `&name' or `&#...' with an optional trailing `;'.
    # Names are resolved through entityref_literal, references starting
    # with `#' through parse_charref (assumed to return the replacement
    # text, or nil/false when it cannot resolve the reference -- confirm
    # against its definition).  When no replacement is available the
    # reference is removed from the value and recorded in
    # @unexpanded_entityrefs as [key, offset, name], where offset is the
    # position in the expanded value at which the reference used to be.
    #
    # Always returns true.
    def on_attribute_value(key, val)
      if val then
        # Running difference in length between the text expanded so far
        # and the original text; converts match offsets (which refer to
        # the original) into offsets within the expanded value.
        shift = 0
        # The optional `#' is matched before the lazy part on purpose:
        # with a plain /[^;\s]+?\b/ the match stops at the word boundary
        # between `#' and the first digit, so `&#65;' would capture only
        # `#' and the digits would be left behind as literal text.
        val.gsub!(/&(#?[^;\s]+?\b);?/) { |matched|
          ref = $1
          start = $~.begin(0)
          if ref[0] == ?\# then
            rep = parse_charref(ref)
          else
            rep = entityref_literal(ref)
          end
          unless rep then
            @unexpanded_entityrefs ||= []
            @unexpanded_entityrefs.push [ key.dup, start + shift, ref ]
            rep = ''
          end
          shift += rep.size - matched.size
          rep
        }
      end
      true
    end


    # Reports a run of character data, splitting out entity and
    # character references.
    #
    # s:: the first chunk of text.  Further chunks are fetched with
    #     @src.pop until @src.tag_start? is true or the source runs dry.
    #
    # For every chunk the text around `&' is passed to on_chardata;
    # a reference `&name' / `&#...' (trailing `;' optional) goes to
    # on_entityref or, when it starts with `#', to parse_charref.  An `&'
    # that does not start a reference is reported as literal text.
    #
    # Between two chunks a literal `>' is reported, except when the next
    # chunk is itself exactly `>' (it is then reported as ordinary text
    # on the next pass).  Presumably @src splits its input at `>' --
    # confirm against the source class.
    def scan_content(s)
      while true
        unless /&/ =~ s then
          on_chardata s
        else
          before, after = $`, $'
          on_chardata before unless before.empty?
          # "".split returns [], which would silently drop a chunk-final
          # `&'; keep one empty piece so that it is reported as text.
          pieces = after.empty? ? [ '' ] : after.split(/&/, -1)
          pieces.each { |piece|
            # `#' is matched before the lazy part: otherwise the word
            # boundary between `#' and the first digit ends the match
            # and parse_charref would only ever receive `#'.
            if /\A(#?[^;\s]+?\b);?/ =~ piece then
              ref, piece = $1, $'
              if ref[0] == ?\# then
                parse_charref ref
              else
                on_entityref ref
              end
            else
              # not a reference: give the `&' back to the text
              piece = '&' + piece
            end
            on_chardata piece unless piece.empty?
          }
        end
        break if @src.tag_start?
        s = @src.pop
        break unless s
        on_chardata '>' unless s == '>'
      end
    end


    # Collects the text of a processing instruction and reports it with
    # on_pi, using an empty target name.
    #
    # s:: first chunk of the PI; its first two characters are removed in
    #     place and the remainder is extended in place with the chunks
    #     that follow.
    #
    # Chunks are read from @src until @src.tag_end? is true.  A `>' is
    # re-inserted in front of every chunk that does not begin with `<'.
    # Running out of input is reported through parse_error, after which
    # whatever was collected so far is still handed to on_pi.
    def scan_pi(s)
      s[0,2] = ''
      content = s
      loop do
        break if @src.tag_end?
        chunk = @src.pop
        if chunk then
          content << '>' unless chunk[0] == ?<
          content << chunk
        else
          parse_error "unterminated PI meets EOF"
          break
        end
      end
      on_pi '', content
    end


    # Parses a start tag and reports it with on_stag(name, attr).
    #
    # s:: the tag text beginning with `<' (the first character is
    #     stripped to obtain the name).
    #
    # attr is a Hash mapping attribute names to their values; an
    # attribute written without a value is stored as true.  Each value is
    # first passed to on_attribute_value, and the attribute is recorded
    # only when that call returns a true value.
    #
    # Malformed input is reported through parse_error and recovered from
    # where possible; text that cannot be read as a tag at all is handed
    # to on_chardata instead, and its result is returned.
    def scan_stag(s)
      attr = {}
      # Look for the first position followed by `/' or whitespace; when
      # there is none the tag consists of a name only.
      unless /(?=[\/\s])/ =~ s then
        # NOTE: name and s are the same String object here, so removing
        # the leading character below also changes s.
        name = s
        name[0,1] = ''
        if name.empty? then   # << or <>
          if @src.tag_end? then
            # reported, but on_stag is still called with an empty name
            parse_error "found an empty start tag `<>'"
          else
            parse_error "parse error at `<'"
            return on_chardata('<' + s)
          end
        end
      else
        # name: text before the first `/' or whitespace; s: the rest
        name, s = $`, $'
        name[0,1] = ''
        if name.empty? then   # < tag
          parse_error "parse error at `<'"
          # Accept a name after leading whitespace as long as that
          # whitespace does not contain a blank line; otherwise report
          # the whole thing as character data.
          unless /\A(?:(?!\n\s*\n)\s)*([^\/\s]+)/ =~ s then
            return on_chardata('<' << s)
          end
          name, s = $1, $'
        end
        # Scan the attributes.  The scan is repeated with a new s each
        # time a quoted value turns out to be unterminated in the
        # current text.
        begin
          complete = true
          # Alternatives of the pattern:
          #   whitespace + key, optionally followed by `=' and a value
          #   that is single-quoted, double-quoted (closing quote
          #   optional in both cases) or unquoted;
          #   whitespace at the very end of the text;
          #   anything else, captured as err.
          s.scan(/\s+(?:([^=\s]+)(?:\s*=\s*('[^']*'?|"[^"]*"?|[^="'\s]+))?|\z)|\s*(.[^='"\s]*)/m
                 ) { |key,val,err|
            if key then
              if val then
                if val[0] == ?" or val[0] == ?' then  #'"
                  qmark = val.slice!(0,1)
                  if val[-1] == qmark[0] then
                    # properly terminated: drop the closing quote
                    val.chop!
                  else
                    # Closing quote missing.  read_until is given the
                    # quote pattern and val; its result becomes the text
                    # for the next scan pass.  Presumably it appends the
                    # missing part to val -- confirm against its
                    # definition.
                    s = read_until(/#{qmark}/, val, 'attribute value')
                    complete = false
                    # always break here.
                    # (an unterminated value extends to the end of the
                    # scanned text, so scan has no further matches)
                  end
                end
              end
              if on_attribute_value(key, val) then
                parse_error "doubled attribute `#{key}'" if attr.key? key
                attr[key] = val || true
              end
            elsif err then
              parse_error "parse error at `#{err.split(/\b|\s/,2)[0]}'"
            end
          }
        end until complete
      end
      unclosed_tag 'start tag' unless @src.tag_end?
      on_stag name, attr
    end


    # Pattern applied to the text that follows `<!DOCTYPE' and its
    # trailing whitespace.  It matches case-insensitively and captures
    #   1 -- the root element name,
    #   2 -- the quoted public identifier, present only when PUBLIC is
    #        followed by two literals,
    #   3 -- the quoted literal that follows (its closing quote may be
    #        missing).
    # The SYSTEM/PUBLIC part is mandatory, so a bare root name does not
    # match.  The two character-class strings list the characters
    # accepted inside a public identifier; the second one omits the
    # apostrophe for use between single quotes.
    DOCTYPEPattern = lambda {
      pubid_char = '[-\'()+,./:=?;!*#@$_% \\r\\na-zA-Z0-9]'
      pubid_char_no_apos = pubid_char.delete("'")
      /\A([^\s\["']+)(?:\s+(?:SYSTEM|PUBLIC(?:\s+("#{pubid_char}*"|'#{pubid_char_no_apos}*'))?)\s+("[^"]*"?|'[^']*'?))\s*/i
    }.call

    # Parses the body of a DOCTYPE declaration and reports it with
    # on_doctype(root, pubid, sysid).
    #
    # s:: the text following `<!DOCTYPE' and its trailing whitespace.
    #
    # When s does not match DOCTYPEPattern a parse error is reported and
    # nil is returned without calling on_doctype.  Otherwise the quotes
    # are stripped from both literals, runs of whitespace inside the
    # public identifier are collapsed to a single space, and an
    # unterminated second literal is completed with read_until.  Text
    # left over after the literals and a missing tag end are reported as
    # errors, but on_doctype is still called.
    #
    # NOTE(review): when only one literal is present it is reported as
    # the public identifier, even if the keyword was SYSTEM -- confirm
    # that this is intended.
    def scan_doctype(s)
      unless DOCTYPEPattern =~ s then
        parse_error "parse error in DOCTYPE"
        return
      end
      md = $~
      root  = md[1]
      pubid = md[2]
      sysid = md[3]
      s     = md.post_match

      # strip the surrounding quotes, then normalize whitespace
      pubid = pubid[1..-2].gsub(/\s+/, ' ') unless pubid.nil?

      unless sysid.nil?
        quote = sysid.slice!(0,1)
        if sysid[-1] == quote[0] then
          sysid.chop!
        else
          # closing quote missing: let read_until finish the literal;
          # its result replaces the remaining text
          s = read_until(/#{quote}\s*/, sysid, 'DOCTYPE')
        end
      end

      unless s.empty?
        parse_error "parse error at `#{s.split(/\b|\s/,2)[0]}'"
      end
      unclosed_tag 'DOCTYPE' unless @src.tag_end?

      if pubid.nil? && sysid then
        pubid = sysid
        sysid = nil
      end
      on_doctype root, pubid, sysid
    end


    # Handles the part of the document that precedes the first markup:
    # skips whitespace-only chunks and processes a leading DOCTYPE
    # declaration.  @prolog is set to false before returning in every
    # case.
    #
    # Returns
    # * the result of scan_doctype when a chunk starting with
    #   `<!DOCTYPE' (any letter case) is found,
    # * the chunk itself when a chunk containing something other than
    #   whitespace comes first,
    # * nil when the input ends before either of those.
    def scan_prolog
      while s = @src.pop and not /\A<!DOCTYPE\s+/i =~ s
        if s.strip.size != 0 then
          @prolog = false
          # NOTE(review): s is never nil here, so this expression always
          # evaluates to s and scan_text is never invoked.
          # `s and scan_text(s)' may have been intended -- confirm
          # against the caller before changing it.
          return (s or scan_text(s))
        end
      end
      unless s then
        # End of input: there is no DOCTYPE to parse.  Falling through
        # would call scan_doctype with the nil left in $' and report a
        # bogus "parse error in DOCTYPE".
        @prolog = false
        return nil
      end
      # $' is the text after `<!DOCTYPE' and the whitespace following it.
      ret = scan_doctype($')
      @prolog = false
      ret
    end

  end

end



# XMLScanner with the XMLScanner::HTML overrides mixed in, so that the
# scanning methods defined in that module take precedence over the ones
# inherited from XMLScanner.
class HTMLScanner < XMLScanner
  include HTML
end

# HTMLScanner combined with the Loose module (Loose is defined outside
# this file; see its definition for what it changes).
class LooseHTMLScanner < HTMLScanner
  include Loose
end

# HTMLScanner combined with the Recoverable module (Recoverable is
# defined outside this file; see its definition for what it changes).
class RecoverableHTMLScanner < HTMLScanner
  include Recoverable
end




# Command-line test driver; runs only when this file is executed
# directly.  The input is read through ARGF (the named files, or
# standard input when none are given).
#
#   htmlscan.rb [FILE...]     parse the input; parse errors and the user
#                             CPU time spent go to STDERR
#   htmlscan.rb - [FILE...]   additionally print every scanned token,
#                             formatted by the classes from `xmltoken'
#   htmlscan.rb -- FILE       as `-', but standard output is piped into
#                             `diff -u FILE -', which shows where the
#                             printed tokens differ from the input file
if __FILE__ == $0 then
  class TestScanner < HTMLScanner
    def on_error(path, lineno, msg)
      STDERR.printf "parse error:%s:%d: %s\n", path, lineno, msg
    end
  end
  STDOUT.sync = STDERR.sync = true

  if /\A--?\z/ === ARGV[0] then
    # ARGV.shift removes the option in both cases; diff mode needs
    # exactly one file name to remain.
    if ARGV.shift == '--' and ARGV.size == 1 then
      # NOTE(review): the file name is interpolated into a shell command
      # unquoted.  Acceptable for a developer tool, but names containing
      # spaces or shell metacharacters will not work.
      diff_pipe = IO.popen("diff -u #{ARGV[0]} -", 'w')
      STDOUT.reopen diff_pipe
      # Make every Hash remember the order in which pairs are stored and
      # iterate in that order (presumably so that attributes are printed
      # in source order).  This replaces Hash#[]= process-wide and is
      # only meant for this test mode.
      class Hash
        def []=(k,v)
          (@a ||= []).push [ k, v ]
        end
        def each(&b)
          @a.each(&b) if defined? @a
        end
      end
    end
    class TestScanner
      require 'xmltoken'
      # Defines an on_<name> handler for each given token class name
      # that prints the corresponding Tokenize object.
      def self.def_handler(*name)
        name.each { |i|
          eval "def on_#{i.downcase}(*a);print Tokenize::#{i}.new(*a).to_s;end"
        }
      end
      def_handler 'CharData', 'Comment', 'Doctype', 'PI', 'ETag', 'STag'
    end
  end

  src = ARGF.read
  scan = TestScanner.new
  # Time.times was removed in Ruby 1.9; Process.times is its
  # replacement.  Fall back to Time only on interpreters that do not
  # provide Process.times.
  cpu_clock = Process.respond_to?(:times) ? Process : Time
  t1 = cpu_clock.times.utime
  scan.parse(src)
  t2 = cpu_clock.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
