#
# xmltoken.rb
#
#   Copyright (C) Ueno Katsuhiro 2000
#
# $Id: xmltoken.rb,v 1.2 2000/12/19 11:36:13 katsu Exp $
#

require 'xmlscan'


class XMLScanner

  module Tokenizer

    # Common base class of the token objects defined in this module.
    # Supplies an in-place XML escaping helper for subclasses, a compact
    # #inspect, and nil defaults for #content and #to_s.
    class Node
      # Replaces the characters &, <, > and " in +str+ with the entity
      # references &amp;, &lt;, &gt; and &quot;.  +str+ itself is modified
      # and returned.
      def escape!(str)
        # '&' is handled first so that the ampersands introduced by the
        # later substitutions are not escaped a second time.
        [[/&/, '&amp;'], [/</, '&lt;'], [/>/, '&gt;'], [/"/, '&quot;']].each do |pattern, entity|
          str.gsub!(pattern, entity)
        end
        str
      end
      private :escape!

      # Returns the first space-delimited word of the inherited inspect
      # string followed by the inspected value of #to_s.
      def inspect
        head = super.split(' ', 2).first
        "#{head} #{to_s.inspect}"
      end

      # Content of the token; the base class has none and returns nil.
      def content
        nil
      end

      # Source form of the token; the base class returns #content.
      def to_s
        content
      end
    end

    # A run of character data.
    class CharData < Node
      # +str+ is kept as the raw (unescaped) text of the token.
      def initialize(str)
        @content = str
      end

      # Appends +s+ to the stored text in place.
      def concat(s)
        @content << s
      end

      # The raw, unescaped text.
      attr_reader :content

      # Returns the text with &, <, > and " replaced by entity references.
      # A copy is escaped so that the stored text stays raw: escaping
      # @content itself would escape it again on every later #to_s or
      # #inspect call and make #content return escaped text.
      def to_s
        escape!(@content.dup)
      end
    end

    # A comment token.
    class Comment < Node
      # +src+ is the comment text, either as a single object or as an
      # Array of pieces that is joined on first use.
      def initialize(src)
        @src = src
      end

      # Returns the comment text.  An Array source is joined once and the
      # joined result replaces it.
      def content
        case @src
        when Array then @src = @src.join
        else @src
        end
      end

      # Returns the comment wrapped in its <!-- --> delimiters.
      def to_s
        text = content
        "<!--#{text}-->"
      end
    end

    # A processing instruction token.
    class PI < Node
      # +target+ is the PI target and +pi+ the remaining instruction text,
      # which is exposed as #content.
      def initialize(target, pi)
        @target, @content = target, pi
      end
      attr_reader :target, :content

      # Returns the instruction in source form, "<?target content?>".
      # The target is read from @target; the previously used @name is
      # never assigned in this class, so the target was dropped.
      def to_s
        "<?#{@target} #{@content}?>"
      end
    end

    # An XML declaration token.
    class XMLDecl < Node
      # Stores the three values of the declaration as given.
      def initialize(version, encoding, standalone)
        @version = version
        @encoding = encoding
        @standalone = standalone
      end
      attr_reader :version, :encoding, :standalone

      # Returns the declaration in source form.  The encoding part is
      # emitted only when an encoding is set.  The standalone part is
      # "yes" for a true value, "no" for exactly false, and omitted
      # otherwise (nil).
      def to_s
        pieces = [%'<?xml version="#{@version}"']
        pieces << %' encoding="#{encoding}"' if @encoding
        if standalone
          pieces << %' standalone="yes"'
        elsif standalone == false
          pieces << %' standalone="no"'
        end
        pieces << ' ?>'
        pieces.join
      end
    end

    # A document type declaration token.
    class Doctype < Node
      # +root+ is the name written after <!DOCTYPE; +pubid+ and +sysid+
      # are the public and system identifiers, or nil when absent.
      def initialize(root, pubid, sysid)
        @root, @pubid, @sysid = root, pubid, sysid
      end
      attr_reader :root, :pubid, :sysid

      # True when a public identifier is present.
      def public?
        not pubid.nil?
      end

      # True when only a system identifier is present.
      def system?
        pubid.nil? and not sysid.nil?
      end

      # Returns the declaration in source form.  The system identifier is
      # quoted by #quoted_sysid in both the PUBLIC and the SYSTEM form, so
      # an identifier containing a double quote no longer produces a
      # broken literal in the PUBLIC form.
      def to_s
        s = "<!DOCTYPE #{@root}"
        if @pubid then
          s << %' PUBLIC "#{@pubid}"'
          s << ' ' << quoted_sysid if sysid
        elsif @sysid then
          s << ' SYSTEM ' << quoted_sysid
        end
        s << '>'
      end

      # Returns the system identifier as a quoted literal: single quotes
      # when it contains a double quote, double quotes otherwise.
      def quoted_sysid
        if /"/ =~ @sysid then
          "'#{@sysid}'"
        else
          %'"#{@sysid}"'
        end
      end
      private :quoted_sysid
    end

    # Base class of the tag tokens; exposes the name stored in @name by
    # the subclasses.
    class Tag < Node
      # Returns the tag name.
      def name
        @name
      end
    end

    # An end tag token.
    class ETag < Tag
      # +name+ is the element name of the tag.
      def initialize(name)
        @name = name
      end

      # Returns the tag in source form, "</name>".
      def to_s
        '</' + @name.to_s + '>'
      end
    end

    # A start tag token.
    class STag < Tag
      # +name+ is the element name; +attr+ is a collection that yields
      # name/value pairs when iterated.
      def initialize(name, attr)
        @name, @attr = name, attr
      end

      # Returns the attributes as ' name="value"' pieces, each with a
      # leading space, or an empty string when there are none.  A copy of
      # each value is escaped: escaping the stored value itself would
      # alter the strings held in @attr and escape them again on every
      # later #to_s or #inspect call.
      def attr_to_s
        @attr.collect { |k, v| %' #{k}="#{escape!(v.dup)}"' }.join
      end
      private :attr_to_s

      # Returns the tag in source form, "<name attr="value" ...>".
      def to_s
        "<#{@name}#{attr_to_s}>"
      end
    end

    # An empty-element tag token; a start tag that is written with a
    # closing "/>".
    class EmptyElem < STag
      # Returns the tag in source form, "<name attr="value" .../>".
      def to_s
        attributes = attr_to_s
        "<#{@name}#{attributes}/>"
      end
    end

    # Base class of the reference tokens; exposes the value stored in
    # @content by the subclasses.
    class Reference < Node
      # Returns the stored content.
      def content
        @content
      end
    end

    # An entity reference token.
    class EntityRef < Reference
      # +name+ is the entity name; +s+ is stored as the content of the
      # reference.
      def initialize(name,s)
        @name = name
        @content = s
      end

      # Returns the reference in source form, "&name;".
      def to_s
        '&' + @name.to_s + ';'
      end
    end

    # A character reference token.
    class CharRef < Reference
      # +code+ is stored as the content of the reference.
      def initialize(code)
        @content = code
      end

      # Returns the reference in source form, "&#code;".
      def to_s
        '&#' + @content.to_s + ';'
      end
    end


    # Passes all arguments on to the inherited constructor, then creates
    # the empty queue that holds tokens which have been built but not yet
    # handed out.
    def initialize(*args)
      super
      @__token_parsed__ = Array.new
    end


    private

    # Returns the PredefinedEntity entry for +ref+, or an empty string
    # when that lookup yields nil or false.
    def entityref_literal(ref)
      literal = PredefinedEntity[ref]
      literal || ''
    end

    # Runs the inherited scan_prolog and returns the next token in order.
    # When no tokens are queued, the inherited result is returned as is.
    # Otherwise a non-nil inherited result is appended to the queue and
    # the oldest queued token is returned.
    def scan_prolog
      token = super
      return token if @__token_parsed__.empty?
      @__token_parsed__ << token if token
      @__token_parsed__.shift
    end


    # Calls the inherited handler, then records the character data.  If
    # the newest queued token is a CharData, +str+ is appended to it;
    # otherwise a new CharData is queued.  Always returns nil.
    def on_chardata(str)
      super
      newest = @__token_parsed__.last
      if newest.is_a?(CharData)
        newest.concat(str)
      else
        @__token_parsed__ << CharData.new(str)
      end
      nil
    end

    # Calls the inherited handler, then queues an EntityRef built from
    # +ref+ and the result of entityref_literal(ref).  Always returns nil.
    def on_entityref(ref)
      super
      literal = entityref_literal(ref)
      @__token_parsed__ << EntityRef.new(ref, literal)
      nil
    end

    # Calls the inherited handler, then queues a CharRef built from
    # +ref+.  Always returns nil.
    def on_charref(ref)
      super
      token = CharRef.new(ref)
      @__token_parsed__ << token
      nil
    end

    # Calls the inherited handler and builds a Comment from +strs+.
    # While @prolog is set the comment is queued and nil is returned;
    # otherwise the comment itself is returned.
    def on_comment(strs)
      super
      comment = Comment.new(strs)
      return comment unless @prolog
      @__token_parsed__ << comment
      nil
    end

    # Calls the inherited handler and builds a PI from +target+ and +pi+.
    # While @prolog is set the instruction is queued and nil is returned;
    # otherwise the instruction itself is returned.
    def on_pi(target, pi)
      super
      instruction = PI.new(target, pi)
      return instruction unless @prolog
      @__token_parsed__ << instruction
      nil
    end

    # Calls the inherited handler, then queues an XMLDecl built from the
    # three arguments.  Always returns nil.
    def on_xmldecl(version, encoding, standalone)
      super
      declaration = XMLDecl.new(version, encoding, standalone)
      @__token_parsed__ << declaration
      nil
    end

    # Calls the inherited handler, then queues a Doctype built from the
    # three arguments.  Always returns nil.
    def on_doctype(root, pubid, sysid)
      super
      doctype = Doctype.new(root, pubid, sysid)
      @__token_parsed__ << doctype
      nil
    end

    # Calls the inherited handler and returns an ETag for +name+.
    # Nothing is queued.
    def on_etag(name)
      super
      token = ETag.new(name)
      token
    end

    # Calls the inherited handler and returns an STag for +name+ and
    # +attr+.  Nothing is queued.
    def on_stag(name, attr)
      super
      token = STag.new(name, attr)
      token
    end

    # Calls the inherited handler and returns an EmptyElem for +name+ and
    # +attr+.  Nothing is queued.
    def on_emptyelem(name, attr)
      super
      token = EmptyElem.new(name, attr)
      token
    end


    public

    # Returns the next token.  Queued tokens are handed out first.  With
    # an empty queue, step is called; its result is returned when it is
    # not nil or false, otherwise the oldest token queued during that
    # step is returned (nil when there is none).
    def get_token
      return @__token_parsed__.shift unless @__token_parsed__.empty?
      step || @__token_parsed__.shift
    end


    include Enumerable

    # Yields tokens to the block and returns self.  When +src+ is given it
    # is first passed to @src.feed.  While @prolog is set, the result of
    # scan_prolog is yielded once.  After that, each round yields all
    # queued tokens, takes the next piece from @src.pop, and yields the
    # result of scan_text for that piece when it is not nil or false.
    # The loop ends when @src.pop returns nil or false.
    def each(src = nil)
      @src.feed(src) if src
      yield scan_prolog if @prolog
      while true
        until @__token_parsed__.empty?
          yield @__token_parsed__.shift
        end
        piece = @src.pop
        break unless piece
        token = scan_text(piece)
        yield token if token
      end
      self
    end

  end

end



# XMLScanner subclass with the Tokenizer mixin included.  The step and
# parse methods are made private on this class.
class XMLTokenizer < XMLScanner
  include Tokenizer

  private :step
  private :parse
end




# Self-test driver, run only when this file is executed directly.
#
# Input is read through ARGF.  When the first argument is "-" or "--", it
# is removed from ARGV and the tokens are printed with Tokenizer#each;
# otherwise the input is only tokenized with get_token and nothing is
# printed.  With "--" followed by exactly one file name, standard output is
# additionally piped into "diff -u FILE -".  The user CPU time spent
# tokenizing is reported on standard error.
if $0 == __FILE__ then
  # Scanner that reports errors on standard error.
  class TestScanner < XMLTokenizer
    def on_error(path, lineno, msg)
      STDERR.printf "%s:%d: %s\n", path, lineno, msg
    end
  end
  if /\A--?\z/ === ARGV[0] then
    if (opt = ARGV.shift) == '--' and ARGV.size == 1 then
      # The argv-array form runs diff without a shell, so a file name
      # containing spaces or shell metacharacters is passed through intact.
      pipe = IO.popen(['diff', '-u', ARGV[0], '-'], 'w')
      STDOUT.reopen pipe
      # Replaces Hash#[]= and Hash#each so that pairs are recorded in, and
      # iterated in, the order they were stored.
      # NOTE(review): this changes Hash for the whole process and stored
      # pairs are no longer reachable through Hash#[] -- confirm it is
      # still wanted on Ruby versions whose Hash already keeps order.
      class Hash
        def []=(k,v)
          (@a ||= []).push [ k, v ]
        end
        def each(&b)
          @a.each(&b) if defined? @a
        end
      end
    end
  end

  src = ARGF.read
  scan = TestScanner.new(src)
  # Process.times replaces Time.times, which current Ruby versions no
  # longer provide.
  t1 = Process.times.utime
  if opt then
    scan.each { |i| print i.to_s }
  else
    while scan.get_token
    end
  end
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end
