module REXML
	# Factory for Source objects.  Callers are expected to go through this
	# class instead of instantiating a Source themselves.
	class SourceFactory
		# Builds the Source implementation that matches the class of arg.
		# @param arg a String (wrapped in a Source) or an IO (wrapped in an
		# IOSource)
		# @return the new Source, or nil when arg is neither a String nor an IO
		def self.create_from(arg)
			case arg
			when String then Source.new(arg)
			when IO then IOSource.new(arg)
			end
		end
	end

	# A Source wraps a String buffer.  On construction it looks for a UTF-16
	# byte-order mark and, when one is found, converts the buffer to UTF-8.
	# Callers can then search the buffer for patterns and optionally consume
	# the text that was matched.
	class Source
		# The current buffer (what we're going to read next)
		attr_reader :buffer
		# Set to 0 by the constructor; nothing in this class updates it.
		attr_reader :line
		# One of the encoding constants below, or nil after encoding= was
		# given a name that encoding_val does not recognise.
		attr_reader :encoding

		# Encoding identifiers.  UTF16 is big-endian UTF-16 and UNILE is
		# little-endian UTF-16 (see check_encoding).
		UTF16=0
		UTF8=1
		ISO_8859_1=2
		UNILE=3
		# Maps each encoding identifier to its display name.
		ENCODINGS = {
			UTF16=>"UTF-16",
			UTF8=>"UTF-8",
			ISO_8859_1=>"ISO-8859-1",
			UNILE=>"UNILE"
		}

		# Constructor
		# @param arg must be a String, and should be a valid XML document
		def initialize arg
			@orig = @buffer = arg
			@encoding = check_encoding( @buffer )
			# check_encoding reports UTF8 unless a byte-order mark was found, so
			# the buffer is only re-encoded for BOM-prefixed input.
			@buffer = utf8_enc(@buffer) if [UTF16,UNILE,ISO_8859_1].include?(@encoding)
			@line = 0
		end

		# Translates an encoding name into its identifier.
		# @param enc an encoding name; matching is case-insensitive
		# @return the matching constant, or nil for an unrecognised name
		def Source::encoding_val( enc )
			case enc
			when /^iso-8859-1$/i then ISO_8859_1
			when /^utf-8$/i then UTF8
			when /^unile$/i then UNILE
			when /^utf-16/i then UTF16
			end
		end

		# Translates an encoding identifier into its display name.
		# @param enc one of the encoding constants
		# @return the name from ENCODINGS, or nil for an unknown identifier
		def Source::val_encoding( enc )
			ENCODINGS[enc]
		end

		# Declares the encoding of the buffer by name.  For anything other
		# than UTF-8 the current buffer is passed through utf8_enc and @to_utf
		# is set (IOSource reads that flag when it pulls in more text).
		# @param enc an encoding name as understood by Source::encoding_val
		def encoding=(enc)
			@encoding = Source::encoding_val(enc)
			if @encoding != UTF8
				@to_utf = true
				@buffer = utf8_enc(@buffer)
			end
		end

		# Runs String#scan over the buffer.  Note that this is not your usual
		# scan() method: the source can be consumed as a side effect.
		# The original author's guidance is that pattern should be a Regexp of
		# the form /^\s*(#{your pattern, with no groups})(.*)/.
		# @param pattern the pattern handed to String#scan
		# @param consume if true and at least one match was found, the buffer
		# is replaced by the text that follows the last match
		# @return whatever String#scan returned, or nil if the buffer is nil
		def scan pattern, consume=false
			return nil if @buffer.nil?
			rv = @buffer.scan(pattern)
			# String#scan leaves the last successful match in $~, so $' is the
			# text after that match.
			@buffer = $' if consume and rv.size>0
			rv
		end

		# Matches pattern against the buffer.
		# @param pattern a Regexp
		# @param consume if true and the pattern matched, the buffer is
		# replaced by the text that follows the match
		# @return the MatchData, or nil if there was no match
		def match pattern, consume=false
			md = pattern.match @buffer
			@buffer = md.post_match if consume and md
			md
		end

		# @return true if the Source is exhausted, that is, the buffer is nil
		# or holds nothing but whitespace
		def empty?
			# String#strip never returns nil, so test the stripped text for
			# emptiness instead.
			@buffer.nil? or @buffer.strip.empty?
		end

		# Tries to locate the current buffer within the original text.
		# The original text is split on whitespace (not on line breaks), the
		# resulting tokens are searched for one equal to the first 31
		# characters of the buffer, and the index of the first token equal to
		# the last hit is returned.
		# NOTE(review): the name suggests a line number was intended; the
		# historical behaviour is kept unchanged here.
		# @return a token index, or nil when no token matches
		def current_line
			lines = @orig.split
			res = lines.grep( @buffer[0..30] ).last
			lines.index( res ) if res
		end

		# Taken from code contributed by Ernest Ellingson <erne@powernav.com>
		# Detects a UTF-16 byte-order mark at the very start of str.
		# @param str the raw text; anything that cannot be unpacked (such as
		# nil) is reported as UTF8
		# @return UTF16 for FE FF, UNILE for FF FE, UTF8 otherwise
		def check_encoding(str)
			return UTF8 unless str.respond_to?(:unpack)
			# Compare raw bytes rather than using a Regexp: a pattern with
			# high-byte escapes is not portable across Ruby versions, and ^ would
			# also match after a newline in the middle of the data.
			case str.unpack('C2')
			when [0xFE, 0xFF] then UTF16  # unicode big endian
			when [0xFF, 0xFE] then UNILE  # unicode little endian
			else UTF8
			end
		end

		# Taken from code contributed by Ernest Ellingson <erne@powernav.com>
		# Converts str to UTF-8 according to the current @encoding.
		# @param str text in the encoding named by @encoding
		# @return the converted text; str itself when @encoding is UTF8 or
		# unrecognised
		def utf8_enc(str)
			case @encoding
			when ISO_8859_1 then asc(str)
			when UTF16 then utf16(str)
			when UNILE then unile(str)
			else str
			end
		end

		# Re-packs every byte of str as the UTF-8 character with that code
		# point.
		def asc(str)
			str.unpack('C*').pack('U*')
		end

		# Converts big-endian UTF-16 to UTF-8.  The first two bytes are always
		# skipped (they are assumed to be the byte-order mark), and every
		# 16-bit unit is emitted as a code point of its own; surrogate pairs
		# are not combined.
		def utf16(str)
			array_enc=str.unpack('C*')
			array_utf8 = []
			2.step(array_enc.size-1, 2){|i|
				array_utf8 << (array_enc.at(i+1) + array_enc.at(i)*0x100)
			}
			array_utf8.pack('U*')
		end

		# Converts little-endian UTF-16 to UTF-8.  The first two bytes are
		# always skipped (they are assumed to be the byte-order mark), and
		# every 16-bit unit is emitted as a code point of its own; surrogate
		# pairs are not combined.
		def unile(str)
			array_enc=str.unpack('C*')
			array_utf8 = []
			2.step(array_enc.size-1, 2){|i|
				array_utf8 << (array_enc.at(i) + array_enc.at(i+1)*0x100)
			}
			array_utf8.pack('U*')
		end

		# Identity conversion, used when the text is already UTF-8.
		def utf8(str)
			str
		end
	end

	# A Source that wraps an IO and pulls further text from it on demand, one
	# block at a time.  See the Source class for method documentation.
	class IOSource < Source
		# The length passed to every read call on the wrapped IO
		attr_reader :block_size

		# Reads the first block from the IO and hands it to Source#initialize.
		# @param arg the IO to read from
		# @param block_size the length requested from the IO on every read
		def initialize arg, block_size=500
			# Both variables refer to the same object: @source is set to nil once
			# the input is exhausted, @er_source is kept for current_line.
			@er_source = @source = arg
			@block_size = block_size
			super @source.read(@block_size)
			# Later blocks need converting only if the first one did.
			@to_utf = [UTF16, UNILE, ISO_8859_1].include?(@encoding)
		end

		# Converts a freshly read block to UTF-8 using the encoding that is in
		# effect at the time of the call, so that a later encoding= also
		# applies to the blocks read after it.
		# NOTE(review): the UTF-16 converters in Source skip the first two
		# bytes of whatever they are given, so every block after the first
		# loses its leading 16-bit unit -- confirm before relying on UTF-16
		# input that is longer than one block.
		# @param str a block read from the IO
		# @return the block as UTF-8
		def encode(str)
			utf8_enc(str)
		end

		# Like Source#scan, but when nothing is found more blocks are read
		# from the IO until the pattern matches the buffer or the input is
		# exhausted, and the scan is then repeated once.
		# @return the result of the (repeated) scan; nil if the IO never
		# yielded any text
		def scan pattern, consume=false
			rv = super
			# rv is nil when the buffer is nil, which happens when the very first
			# read already hit the end of the input.
			if rv.nil? or rv.size == 0
				until @buffer =~ pattern or @source.nil?
					begin
						append_next_block
					rescue
						# Any error while reading is treated as the end of the input.
						@source = nil
					end
				end
				rv = super
			end
			rv
		end

		# Like Source#match, but as long as there is no match another block is
		# read from the IO and the match is retried, until the input is
		# exhausted.
		# @return the MatchData, or nil if the pattern never matched
		def match pattern, consume=false
			rv = super
			while !rv and @source
				begin
					rv = super if append_next_block
				rescue
					# Any error while reading or matching is treated as the end of
					# the input.
					@source = nil
				end
			end
			rv
		end

		# @return true if the buffer is exhausted and the IO has nothing more
		# to offer
		def empty?
			super and ( @source.nil? || @source.eof? )
		end

		# Rewinds the IO and, for every line of the buffer in turn, advances
		# through the IO to the next line that matches it.  The result is the
		# IO's lineno after the last search minus the number of buffer lines
		# handled before it.
		# NOTE: @er_source is the same object as @source, so the rewind and the
		# searches also move the position that scan and match read from.
		# @return the computed line number; 0 when the buffer has no lines
		def current_line
			@er_source.rewind
			line = 0
			count = 0
			@buffer.split("\n").each {|buffered|
				escaped = Regexp.escape( buffered )
				@er_source.find {|src_line| src_line =~ /#{escaped}/u}
				line = @er_source.lineno - count
				count += 1
			}
			line
		end

		private

		# Reads one block from the IO, converts it when required and appends
		# it to the buffer.  Marks the source as exhausted (@source = nil) when
		# the read returns nil.
		# @return true if text was appended, false at the end of the input
		def append_next_block
			str = @source.read(@block_size)
			if str.nil?
				@source = nil
				return false
			end
			str = encode(str) if @to_utf
			if @buffer
				@buffer << str
			else
				@buffer = str
			end
			true
		end
	end
end
