#!/usr/local/bin/python
# PyTREX: A clean-room implementation of TREX in Python
# by James Tauber
#
# http://pytrex.sourceforge.net/
#
# Copyright (c) 2001, James Tauber
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in
#   the documentation and/or other materials provided with the
#   distribution.
# * The name "James Tauber" may not be used to endorse or promote
#   products derived from this software without specific prior written
#   permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


########################################################################
#
# Supports nearly everything discussed in sections 1-6 of the TREX
# tutorial
#
# TO USE FROM THE COMMAND LINE:
# - python pytrex.py <trex-file> <instance-file>
#
# TO USE IN OTHER PYTHON SCRIPTS:
# - import the pytrex.py file
# - parse the TREX file:
#     trex = parse_TREX("foo.trex")
# - parse the instance file:
#     instance = parse_Instance("bar.xml")
# - validate
#     match = validate(trex, instance)
#   match will be an Error object if invalid - test with isError()
#
# You can see an internal representation of the TREX grammar and the
# instance with trex.display() and instance.display() respectively
#
# You can also see a representation of the match object returned by
# validate with match.display()
#
# NOT IMPLEMENTED YET
# (besides patterns not listed below)
# - support for QNames
# - trex namespace
#
# PATTERNS SUPPORTED
# - element
# - attribute
# - anyString
# - string
# - empty
# - oneOrMore
# - group
# - choice
# - interleave
# - grammar (at top level)
# - start
# - ref (without parent attribute)
# - zeroOrMore
# - optional
#
# NAME-CLASSES SUPPORTED
# - name
#
# questions:
#
# zeroOrMore = empty | oneOrMore
# but empty doesn't allow whitespace and zeroOrMore does

# Set to a true value to make the matching code print a trace of what it
# is comparing (checked by the M / M_consume / M_interleave methods).
debug_output = 0

########################################################################
### COMMON

class HandlerBase:
    """Common base for the expat-driven handler objects.

    A handler remembers the parser it works on and the handler that
    created it.  Constructing a handler immediately calls
    set_handlers(); end() calls the parent's set_handlers() again.
    """

    def __init__(self, parser, parent):
        self.parser, self.parent = parser, parent
        self.set_handlers()

    def set_handlers(self):
        """Hook for subclasses to register their callbacks on self.parser."""

    def char(self, data):
        """Character data callback; ignored by default."""

    def child(self, name, atts):
        """Start-of-child-element callback; ignored by default."""

    def end(self, name):
        """End-of-element callback: hand control back to the parent."""
        if self.parent is None:
            # must be root: there is nobody to hand back to
            return
        self.parent.set_handlers()



########################################################################
### TREX PARSING

def parse_TREX(location, baseURI=None):
    """Parse the TREX grammar found at *location*.

    location -- URL or file name handed to urllib's urlopen
    baseURI  -- base URI set on the expat parser; defaults to location

    expat errors and TREXErrors raised while parsing are reported on
    standard output instead of being propagated.  Returns the pattern
    collected by the root handler (its ``product`` attribute).
    """
    if baseURI is None:
        baseURI = location

    import pyexpat
    parser = pyexpat.ParserCreate(namespace_separator="^")
    parser.SetBase(baseURI)
    parser.returns_unicode = 1

    r = T_RootHandler(parser, None)

    from urllib import urlopen
    f = urlopen(location)
    try:
        try:
            parser.ParseFile(f)
        except pyexpat.error:
            print(u"Error parsing file at line '%s' and column '%s'\n" % (parser.ErrorLineNumber, parser.ErrorColumnNumber))
        except TREXError:
            print("Error parsing TREX file")
    finally:
        # release the stream even when a handler raises something that is
        # not caught above
        f.close()

    return r.product


class TREXError(Exception):
    """Raised by the TREX handlers when the grammar being read is invalid.

    value -- description of the problem; str() of the error is repr(value)
    """

    def __init__(self, value):
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return repr(self.value)


class T_RootHandler(HandlerBase):
    """Handler installed at document level.

    The pattern most recently passed to add_pattern() is kept in
    self.product.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        recognised = handlePattern(self.parser, self, name, atts)
        if not recognised:
            print("%s not supported as root" % name)

    def add_pattern(self, pattern):
        self.product = pattern


def handlePattern(parser, handler, name, atts):
    """Create the handler object for the pattern element called *name*.

    The new handler is constructed with *parser* and *handler* (its
    parent) and, for the patterns that take attributes, *atts*.

    Returns 1 when *name* is a recognised pattern, 0 otherwise.
    """
    # patterns whose handlers need the attributes of the start tag
    if name == "element":
        T_ElementHandler(parser, handler, atts)
        return 1
    if name == "string":
        T_StringHandler(parser, handler, atts)
        return 1
    if name == "attribute":
        T_AttributeHandler(parser, handler, atts)
        return 1
    if name == "ref":
        T_RefHandler(parser, handler, atts)
        return 1

    # patterns whose handlers ignore attributes
    if name == "empty":
        T_EmptyHandler(parser, handler)
        return 1
    if name == "zeroOrMore":
        T_ZeroOrMoreHandler(parser, handler)
        return 1
    if name == "oneOrMore":
        T_OneOrMoreHandler(parser, handler)
        return 1
    if name == "anyString":
        T_AnyStringHandler(parser, handler)
        return 1
    if name == "optional":
        T_OptionalHandler(parser, handler)
        return 1
    if name == "choice":
        T_ChoiceHandler(parser, handler)
        return 1
    if name == "interleave":
        T_InterleaveHandler(parser, handler)
        return 1
    if name == "mixed":
        T_MixedHandler(parser, handler)
        return 1
    if name == "group":
        T_GroupHandler(parser, handler)
        return 1
    if name == "grammar":
        T_GrammarHandler(parser, handler)
        return 1

    return 0


class T_ElementHandler(HandlerBase):
    """Builds a T_Element from an <element> pattern.

    The name class comes from a name attribute or from a <name> child;
    several child patterns are folded into nested T_Group objects.
    """

    def __init__(self, parser, parent, atts):
        HandlerBase.__init__(self, parser, parent)
        self.product = T_Element()
        if u"name" in atts:
            # TODO: currently following only supports NCName, not QName
            self.add_nameclass(ExpandedName("", atts[u"name"]))

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        if name == "name":
            T_NameHandler(self.parser, self)
            return
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside element yet" % name)

    def end(self, name):
        element = self.product
        if element.name_class is None:
            raise TREXError("element must have a name")
        self.parent.add_pattern(element)
        HandlerBase.end(self, name)

    def add_nameclass(self, name_class):
        self.product.name_class = name_class

    def add_pattern(self, pattern):
        element = self.product
        if element.pattern is not None:
            # second and later children extend an implicit group
            pattern = T_Group(element.pattern, pattern)
        element.pattern = pattern


class T_AttributeHandler(HandlerBase):
    """Builds a T_Attribute from an <attribute> pattern.

    The name class comes from a name attribute or from a <name> child.
    When no child pattern was given the value pattern is T_AnyString.
    """

    def __init__(self, parser, parent, atts):
        HandlerBase.__init__(self, parser, parent)
        self.product = T_Attribute()
        if u"name" in atts:
            # TODO: currently following only supports NCName, not QName
            self.add_nameclass(ExpandedName("", atts[u"name"]))

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        if name == "name":
            T_NameHandler(self.parser, self)
            return
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside attribute yet" % name)

    def end(self, name):
        attribute = self.product
        if attribute.name_class is None:
            raise TREXError("attribute must have a name")
        if attribute.pattern is None:
            # no explicit value pattern: any string is acceptable
            attribute.pattern = T_AnyString()
        self.parent.add_pattern(attribute)
        HandlerBase.end(self, name)

    def add_nameclass(self, name_class):
        self.product.name_class = name_class

    def add_pattern(self, pattern):
        self.product.pattern = pattern


class T_NameHandler(HandlerBase):
    """Collects the text of a <name> element into an ExpandedName.

    The finished name (with an empty namespace URI) is passed to the
    parent's add_nameclass().
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.product = ExpandedName()
        self.chardata = ""

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def char(self, data):
        # expat may deliver the text in several pieces
        self.chardata += data

    def child(self, name, atts):
        print("WARNING: name should not have child '%s'" % name)

    def end(self, name):
        expanded = self.product
        expanded.namespaceURI = ""
        expanded.NCName = self.chardata
        self.parent.add_nameclass(expanded)
        HandlerBase.end(self, name)
        

class T_EmptyHandler(HandlerBase):
    """Handler for <empty>: hands a T_Empty pattern to its parent.

    Any text or child element inside <empty> only produces a warning.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.product = T_Empty()

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def _complain(self):
        # shared by the text and the child-element callbacks
        print("WARNING: empty should not have content")

    def char(self, data):
        self._complain()

    def child(self, name, atts):
        self._complain()

    def end(self, name):
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)


class T_AnyStringHandler(HandlerBase):
    """Handler for <anyString>: hands a T_AnyString pattern to its parent.

    Any text or child element inside <anyString> only produces a warning.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.product = T_AnyString()

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def _complain(self):
        # shared by the text and the child-element callbacks
        print("WARNING: anyString should not have content")

    def char(self, data):
        self._complain()

    def child(self, name, atts):
        self._complain()

    def end(self, name):
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)


class T_StringHandler(HandlerBase):
    """Handler for <string>: collects the text and builds a T_String.

    The whiteSpace attribute selects normalisation ("normalize", the
    default) or exact comparison ("preserve"); any other value produces
    a warning and leaves the default in place.
    """

    def __init__(self, parser, parent, atts):
        HandlerBase.__init__(self, parser, parent)
        self.chardata = ""
        self.whitespace_normalize = 1
        if u"whiteSpace" in atts:
            mode = atts["whiteSpace"]
            if mode == "preserve":
                self.whitespace_normalize = 0
            elif mode != "normalize":
                print("WARNING: whiteSpace attribute on string must be normalize or preserve, not %s" % mode)

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def char(self, data):
        # expat may deliver the text in several pieces
        self.chardata += data

    def child(self, name, atts):
        print("WARNING: string should not have element content")

    def end(self, name):
        pattern = T_String(self.chardata, self.whitespace_normalize)
        self.parent.add_pattern(pattern)
        HandlerBase.end(self, name)


class T_ZeroOrMoreHandler(HandlerBase):
    """Handler for <zeroOrMore>.

    Produces choice(empty, oneOrMore(p)) where p is the child pattern.
    Several child patterns are combined into nested T_Group objects (the
    same implicit grouping the element and define handlers apply)
    instead of each child silently replacing the previous one.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.content = None   # child pattern(s) collected so far

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside zeroOrMore yet" % name)

    def end(self, name):
        if self.content is None:
            # without a child there is no product to hand over
            raise TREXError("zeroOrMore must contain a pattern")
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.content is None:
            self.content = pattern
        else:
            self.content = T_Group(self.content, pattern)
        self.product = T_Choice(T_Empty(), T_OneOrMore(self.content))


class T_MixedHandler(HandlerBase):
    """Handler for <mixed>.

    Produces interleave(anyString, p) where p is the child pattern.
    Several child patterns are combined into nested T_Group objects (the
    same implicit grouping the element and define handlers apply)
    instead of each child silently replacing the previous one.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.content = None   # child pattern(s) collected so far

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside mixed yet" % name)

    def end(self, name):
        if self.content is None:
            # without a child there is no product to hand over
            raise TREXError("mixed must contain a pattern")
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.content is None:
            self.content = pattern
        else:
            self.content = T_Group(self.content, pattern)
        self.product = T_Interleave(T_AnyString(), self.content)


class T_OneOrMoreHandler(HandlerBase):
    """Handler for <oneOrMore>.

    Produces oneOrMore(p) where p is the child pattern.  Several child
    patterns are combined into nested T_Group objects (the same implicit
    grouping the element and define handlers apply) instead of each
    child silently replacing the previous one.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.content = None   # child pattern(s) collected so far

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside oneOrMore yet" % name)

    def end(self, name):
        if self.content is None:
            # without a child there is no product to hand over
            raise TREXError("oneOrMore must contain a pattern")
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.content is None:
            self.content = pattern
        else:
            self.content = T_Group(self.content, pattern)
        self.product = T_OneOrMore(self.content)


class T_OptionalHandler(HandlerBase):
    """Handler for <optional>.

    Produces choice(empty, p) where p is the child pattern.  Several
    child patterns are combined into nested T_Group objects (the same
    implicit grouping the element and define handlers apply) instead of
    each child silently replacing the previous one.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.content = None   # child pattern(s) collected so far

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside optional yet" % name)

    def end(self, name):
        if self.content is None:
            # without a child there is no product to hand over
            raise TREXError("optional must contain a pattern")
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.content is None:
            self.content = pattern
        else:
            self.content = T_Group(self.content, pattern)
        self.product = T_Choice(T_Empty(), self.content)


class T_ChoiceHandler(HandlerBase):
    """Handler for <choice>.

    One child stands for itself; two or more are folded left into
    nested T_Choice objects.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.pattern_1 = None
        self.pattern_2 = None

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        recognised = handlePattern(self.parser, self, name, atts)
        if not recognised:
            print("WARNING: %s not supported inside choice yet" % name)

    def end(self, name):
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.pattern_1 is None:
            # first alternative: nothing to choose between yet
            self.pattern_1 = pattern
            self.product = pattern
            return
        if self.pattern_2 is None:
            self.pattern_2 = pattern
            self.product = T_Choice(self.pattern_1, pattern)
            return
        # third and later alternatives wrap what has been built so far
        self.product = T_Choice(self.product, pattern)


class T_InterleaveHandler(HandlerBase):
    """Handler for <interleave>.

    One child stands for itself; two or more are folded left into
    nested T_Interleave objects.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.pattern_1 = None
        self.pattern_2 = None

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        recognised = handlePattern(self.parser, self, name, atts)
        if not recognised:
            print("WARNING: %s not supported inside interleave yet" % name)

    def end(self, name):
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.pattern_1 is None:
            # first member: nothing to interleave with yet
            self.pattern_1 = pattern
            self.product = pattern
            return
        if self.pattern_2 is None:
            self.pattern_2 = pattern
            self.product = T_Interleave(self.pattern_1, pattern)
            return
        # third and later members wrap what has been built so far
        self.product = T_Interleave(self.product, pattern)


class T_GroupHandler(HandlerBase):
    """Handler for <group>.

    One child stands for itself; two or more are folded left into
    nested T_Group objects, mirroring the choice and interleave
    handlers.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.pattern_1 = None

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        if not handlePattern(self.parser, self, name, atts):
            print("WARNING: %s not supported inside group yet" % name)

    def end(self, name):
        if self.pattern_1 is None:
            # without a child there is no product to hand over
            raise TREXError("group must contain a pattern")
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.pattern_1 is None:
            # a single child must still yield a product
            self.pattern_1 = pattern
            self.product = pattern
        else:
            # extend what has been built so far so that a third and later
            # child does not discard the ones in between
            self.product = T_Group(self.product, pattern)


class T_GrammarHandler(HandlerBase):
    """Handler for <grammar>: fills a T_Grammar from <start> and <define>.

    A grammar that ends without a start pattern raises TREXError.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.product = T_Grammar()

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        if name == "start":
            T_StartHandler(self.parser, self, atts)
            return
        if name == "define":
            T_DefineHandler(self.parser, self, atts)
            return
        print("WARNING: %s not supported inside grammar" % name)

    def end(self, name):
        grammar = self.product
        if grammar.start is None:
            raise TREXError("grammar must have a start")
        self.parent.add_pattern(grammar)
        HandlerBase.end(self, name)

    def set_start(self, pattern):
        self.product.start = pattern

    def add_definition(self, name, pattern):
        self.product.add_definition(name, pattern)


class T_StartHandler(HandlerBase):
    """Handler for <start> inside a grammar.

    The child pattern becomes the grammar's start pattern; when the
    start element carries a name attribute the pattern is also
    registered as a definition under that name.
    """

    def __init__(self, parser, parent, atts):
        HandlerBase.__init__(self, parser, parent)
        self.product = None
        # None when the start element is anonymous
        self.name = atts.get(u"name")

    def set_handlers(self):
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def child(self, name, atts):
        recognised = handlePattern(self.parser, self, name, atts)
        if not recognised:
            print("WARNING: %s not supported inside start" % name)

    def end(self, name):
        pattern = self.product
        if pattern is None:
            raise TREXError("start must contain a pattern")
        grammar = self.parent
        grammar.set_start(pattern)
        if self.name is not None:
            grammar.add_definition(self.name, pattern)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        self.product = pattern


class T_RefHandler(HandlerBase):
    """Handler for <ref>: hands a T_Ref for the named definition to its parent.

    Raises TREXError when the name attribute is missing, as the define
    handler does; previously the omission only surfaced as an
    AttributeError when the element ended.
    """

    def __init__(self, parser, parent, atts):
        HandlerBase.__init__(self, parser, parent)
        if u"name" not in atts:
            raise TREXError("ref must have a name")
        self.product = T_Ref(atts[u"name"])

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def char(self, data):
        print("WARNING: ref should not have content")

    def child(self, name, atts):
        print("WARNING: ref should not have content")

    def end(self, name):
        self.parent.add_pattern(self.product)
        HandlerBase.end(self, name)


class T_DefineHandler(HandlerBase):
    """Handler for <define> inside a grammar.

    Registers the child pattern with the parent under the name given by
    the required name attribute.  Several child patterns are folded into
    nested T_Group objects.
    """

    def __init__(self, parser, parent, atts):
        HandlerBase.__init__(self, parser, parent)
        self.pattern = None
        if u"name" not in atts:
            raise TREXError("define must have a name")
        self.name = atts[u"name"]

    def set_handlers(self):
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        if not handlePattern(self.parser, self, name, atts):
            # message used to say "inside start" (copied from the start handler)
            print("WARNING: %s not supported inside define" % name)

    def end(self, name):
        self.parent.add_definition(self.name, self.pattern)
        HandlerBase.end(self, name)

    def add_pattern(self, pattern):
        if self.pattern is None:
            self.pattern = pattern
        else:
            # second and later children extend an implicit group
            self.pattern = T_Group(self.pattern, pattern)



########################################################################
### TREX REPRESENTATION / VALIDATION

def validate(trex, instance):
    """Match the children of *instance* against the pattern *trex*.

    Calls trex.M() with no attributes, instance.children and an empty
    environment, and returns whatever M() returns.
    """
    attributes = {}
    environment = {}
    return trex.M(attributes, instance.children, environment)


class Pattern:
    """Base class of the T_* pattern objects; it defines no behaviour itself."""
    # The pattern classes provide (some of) the following methods:
    #
    # display()
    #   prints a string representation of the pattern
    #   (recursing over components)
    #
    # M(a,c,e)
    #   returns a Match object indicating whether the match succeeded or,
    #   if not, why not (an Error object)
    #
    # M_consume(a,c,e)
    #   similar to M above but allows for the match to consume only part of
    #   a and c. Because of non-determinism, multiple consumptions are possible
    #   and so the Match object returned will contain a list of possible
    #   remainders unless no matches are possible
    #
    # M_interleave(a,c,e)
    #   similar to M_consume but implements interleaving by allowing
    #   consumption from any part of the given c
    #
    # In these signatures a is the attribute collection, c the list of child
    # nodes and e the environment.
    pass


class Match:
    # returned by M and M_consume
    def __init__(self, remainder=None):
        if remainder==None:
            self.remainders = []
        else:
            self.remainders = [remainder]

    def add(self, match):
        self.remainders.extend(match.remainders)

    def isError(self):
        return 0

    def display(self):
        print "(MATCH [",
        for remainder in self.remainders:
            remainder.display()
        print "] )",


class Error(Match):
    """Failed match result.

    message  -- description of why the match failed
    children -- the Error objects of the sub-matches that caused it

    remainders is always empty; it is initialised so that the inherited
    add() also works when handed an Error.
    """

    def __init__(self, message, *children):
        Match.__init__(self)
        self.message = message
        self.children = children

    def isError(self):
        return 1

    def display(self):
        print "(ERROR",
        print self.message,
        for error in self.children:
            error.display()
        print ")",

class Remainder:
    def __init__(self, a, c):
        self.a = a
        self.c = c

    def display(self):
        print "(",self.a, "[",
        for node in self.c:
            node.display()
        print "] )",

class Environment:
    """A dictionary of bindings plus an optional parent environment.

    e      -- dictionary to use; a fresh one is created when omitted, so
              that instances never share the default
    parent -- enclosing Environment, or None
    """

    def __init__(self, e=None, parent=None):
        if e is None:
            e = {}
        self.e = e
        self.parent = parent


def normalize(s):
    """Return *s* with whitespace normalised.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace becomes one space.  Only tab, line feed, carriage return
    and space count as whitespace.
    """
    whitespace = (chr(9), chr(10), chr(13), chr(32))
    words = []
    current = []   # characters of the word being read
    for c in s:
        if c in whitespace:
            if current:
                words.append("".join(current))
                current = []
        else:
            current.append(c)
    if current:
        words.append("".join(current))
    return " ".join(words)


class T_Element(Pattern):
    def __init__(self, name_class=None, pattern=None):
        self.name_class = name_class
        self.pattern = pattern

    def display(self):
        print "(ELEMENT",
        self.name_class.display()
        self.pattern.display()
        print ")",

    def M(self, a, c, e):
        if debug_output:
            print
            print "element M where",
            print "a is", a
            print "c is [",
            for node in c:
                node.display()
            print "]"
            print "element name is",
            self.name_class.display()
            print
            print "element pattern is",
            self.pattern.display()
            print
            
        if len(a) > 0:
            return Error("has attributes")
        c_state=0
        for node in c:
            if node.is_whitespace():
                continue
            if node.is_element():
                if c_state==1:
                    return Error("second element")
                n = node.expanded_name
                a_1 = node.attributes
                c_1 = node.children
                c_state=1
        if c_state==0:
            return Error("no element")
        match = self.name_class.C(n)
        if match.isError():
            return Error("name doesn't match", match)
        match = self.pattern.M(a_1,c_1,e)
        if match.isError():
            return Error("pattern doesn't match", match)
        return Match()

    def M_consume(self, a, c, e):
        c_state=0
        for pos in range(0,len(c)):
            if c[pos].is_whitespace():
                continue
            if c[pos].is_element():
                if c_state==1:
                    return Match(Remainder(a, c[pos:]))
                n = c[pos].expanded_name
                a_1 = c[pos].attributes
                c_1 = c[pos].children
                c_state=1
                match = self.name_class.C(n)
                if match.isError():
                    return Error("name doesn't match", match)
                match = self.pattern.M(a_1, c_1, e)
                if match.isError():
                    return Error("pattern doesn't match", match)
        if c_state==0:
            return Error("no element")
        match = self.name_class.C(n)
        if match.isError():
            return Error("name doesn't match", match)
        match = self.pattern.M(a_1, c_1, e)
        if match.isError():
            return Error("pattern doesn't match", match)
        return Match(Remainder(a, []))

    def M_interleave(self, a, c, e):
        if debug_output:
            print
            print "element M_interleave where",
            print "a is", a
            print "c is [",
            for node in c:
                node.display()
            print "]"
            print "element name is",
            self.name_class.display()
            print
            print "element pattern is",
            self.pattern.display()
            print

        c_2 = []
        taken = 0
        for pos in range(0,len(c)):
            if debug_output:
                print "checking",
                c[pos].display()
            if c[pos].is_element():
                n = c[pos].expanded_name
                a_1 = c[pos].attributes
                c_1 = c[pos].children
                match = self.name_class.C(n)
                if match.isError():
                    c_2.append(c[pos])
                    continue
                match = self.pattern.M(a_1, c_1, e)
                if match.isError():
                    c_2.append(c[pos])
                    continue
                taken = 1
            else:
                c_2.append(c[pos])
        if debug_output:
            print "c_2 is [",
            for node in c_2:
                node.display()
                print "]"
        if taken:
            return Match(Remainder(a, c_2))
        else:
            return Error("element in interleave did not match")


class T_Attribute(Pattern):
    def __init__(self, name_class=None, pattern=None):
        self.name_class = name_class
        self.pattern = pattern

    def display(self):
        print "(ATTRIBUTE",
        self.name_class.display()
        self.pattern.display()
        print ")",

    def M(self, a, c, e):
        if len(c)>0:
            return Error("has children when should be empty")
        for attr in a:
            n = attr.expanded_name
            v = attr.value
            match_1 = self.name_class.C(n)
            match_2 = self.pattern.M({}, v, e)
            if (not match_1.isError()) and (not match_2.isError()):
                return Match()
        return Error("attribute did not match")

    def M_consume(self, a, c, e):
        for attr in a:
            n = attr.expanded_name
            v = attr.value
            match_1 = self.name_class.C(n)
            match_2 = self.pattern.M({}, v, e)
            if (not match_1.isError()) and (not match_2.isError()):
                a_2 = []
                for attr2 in a:
                    if attr2 != attr:
                        a_2.append(attr2)
                return Match(Remainder(a_2,c))
        return Error("attribute didn't match") # or should this be Match(Remainder(a,c))


class T_Empty(Pattern):

    def display(self):
        print "(EMPTY)",
        
    def M(self, a, c, e):
        if len(a) > 0:
            return Error("has attributes")
        if len(c) > 0:
            return Error("has children when should be empty")
        return Match()

    def M_consume(self, a, c, e):
        #if len(a) > 0:
        #    return Error("has attributes")
        #if len(c) > 0:
        #    return Error("has children when should be empty")
        return Match(Remainder(a,c))

    def M_interleave(self, a, c, e):
        return Match(Remainder(a,c))


class T_AnyString(Pattern):

    def display(self):
        print "(ANY-STRING)",
        
    def M(self, a, c, e):
        if len(a) > 0:
            return Error("has attributes")
        for node in c:
            if node.is_element():
                return Error("anyString but got element")
        return Match()

    def M_consume(self, a, c, e):
        if len(a) > 0:
            return Error("has attributes")
        if len(c)==0:
            return Error("anyString but no children")
        for pos in range(0,len(c)):
            if c[pos].is_element():
                if pos==0:
                    return Error("element where string required")
                else:
                    return Match(Remainder(a,c[pos:]))
        return Match(Remainder(a,[]))

    def M_interleave(self, a, c, e):
        c_2 = []
        taken = 0
        for pos in range(0, len(c)):
            if c[pos].is_element():
                c_2.append(c[pos])
            else:
                taken=1
        if taken:
            return Match(Remainder(a, c_2))
        else:
            return Error("anyString but no characters") # TODO maybe this is okay!?!?


class T_String(Pattern):
    def __init__(self, chardata, whitespace_normalize):
        self.chardata = chardata
        self.whitespace_normalize = whitespace_normalize
        
    def display(self):
        print "(STRING '%s')" % self.chardata

    def M(self, a, c, e):
        if len(a) > 0:
            return Error("has attributes")
        cdata = ""
        for node in c:
            if node.is_element():
                return Error("string but got element")
            else:
                 cdata = cdata + node.data
        if self.whitespace_normalize:
            if normalize(cdata) == normalize(self.chardata):
                return Match()
            else:
                return Error("character data '%s' did not match string '%s'" % (normalize(cdata), normalize(self.chardata)))
        else:
            if cdata == self.chardata:
                return Match()
            else:
                return Error("character data '%s' did not match string '%s'" % (cdata, self.chardata))

class T_Choice(Pattern):
    def __init__(self, pattern_1=None, pattern_2=None):
        self.pattern_1 = pattern_1
        self.pattern_2 = pattern_2

    def display(self):
        print "(CHOICE",
        self.pattern_1.display()
        self.pattern_2.display()
        print ")",
        
    def M(self, a, c, e):
        if debug_output:
            print
            print "choice M where c is [",
            for node in c:
                node.display()
            print "]"
            print "choice pattern 1 is",
            self.pattern_1.display()

        match_1 = self.pattern_1.M(a, c ,e)
        if not match_1.isError():
            return Match()
        if debug_output:
            print
            print "choice pattern 2 is",
            self.pattern_2.display()
            print
        match_2 = self.pattern_2.M(a, c, e)
        if not match_2.isError():
            return Match()
        return Error("both items of a choice failed", match_1, match_2)

    def M_consume(self, a, c, e):
        if debug_output:
            print
            print "choice M consume where c is [",
            for node in c:
                node.display()
            print "]"
            print "choice (consume) pattern 1 is",
            self.pattern_1.display()
            print
        match = Match()
        match_1 = self.pattern_1.M_consume(a, c ,e)
        if not match_1.isError():
            match.add(match_1)
            if debug_output:
                print "1 succeeded and left",
                match_1.display()
                print
        if debug_output:
            print "choice (consume) pattern 2 is",
            self.pattern_2.display()
            print
        match_2 = self.pattern_2.M_consume(a, c, e)
        if not match_2.isError():
            match.add(match_2)
            if debug_output:
                print "2 succeeded"
                match.display()
        if match_1.isError() and match_2.isError():
            return Error("both items of a choice failed", match_1, match_2)
        if debug_output:
            print "choice is returning",
            match.display()
            print
        return match

    def M_interleave(self, a, c, e):
        match = Match()
        match_1 = self.pattern_1.M_interleave(a, c ,e)
        if not match_1.isError():
            match.add(match_1)
        match_2 = self.pattern_2.M_interleave(a, c, e)
        if not match_2.isError():
            match.add(match_2)
        if match_1.isError() and match_2.isError():
            return Error("both items of a choice failed", match_1, match_2)
        return match


class T_Interleave(Pattern):
    def __init__(self, pattern_1=None, pattern_2=None):
        self.pattern_1 = pattern_1
        self.pattern_2 = pattern_2

    def display(self):
        print "(INTERLEAVE",
        self.pattern_1.display()
        self.pattern_2.display()
        print ")",

    def M(self, a, c, e):
        if debug_output:
            print "interleave M where c is [",
            for node in c:
                node.display()
            print "]"
            print "interleave pattern 1 is",
            self.pattern_1.display()
            print
        match_1 = self.pattern_1.M_interleave(a,c,e)
        if debug_output:
            print "match from consuming interleave pattern 1 is",
            match_1.display()
            print
        if match_1.isError():
            return Error("first pattern of interleave failed", match_1)
	for remainder in match_1.remainders:
	    a_2 = remainder.a
            c_2 = remainder.c
            match = self.pattern_2.M(a_2, c_2, e)
            if not match.isError():
                return Match()
        return Error("second pattern of interleave failed", match)

    def M_interleave(self, a, c, e):
        if debug_output:
            print
            print "interleave M_interleave where c is [",
            for node in c:
                node.display()
            print "]"
            print "interleave pattern 1 is",
            self.pattern_1.display()
            print
        match_1 = self.pattern_1.M_interleave(a,c,e)
        if debug_output:
            print "match from consuming interleave pattern 1 is",
            match_1.display()
            print
        if match_1.isError():
            return Error("first pattern of interleave failed", match_1)
        if debug_output:
            print "interleave pattern 2 is",
            self.pattern_2.display()
            print
            print "remainders are", match_1.remainders
        match = Match()
	for remainder in match_1.remainders:
	    a_2 = remainder.a
            c_2 = remainder.c
            if debug_output:
                print "about to see how interleave pattern 2 goes where c is [",
                for node in c_2:
                    node.display()
                print "]"
            match_2 = self.pattern_2.M_interleave(a_2, c_2, e)
            if debug_output:
                print "match from consuming interleave pattern 2 is",
                match_2.display()
            if not match.isError():
                match.add(match_2)
        return match
        

class T_OneOrMore(Pattern):
    def __init__(self, pattern=None):
        self.pattern = pattern

    def display(self):
        print "(ONE-OR-MORE",
        self.pattern.display()
        print ")",

    def M(self, a, c, e):
        if debug_output:
            print
            print "one or more M where c is [",
            for node in c:
                node.display()
            print "]"
            print "one or more pattern is",
            self.pattern.display()
            print
        group = T_Group(self.pattern, T_Choice(T_Empty(), T_OneOrMore(self.pattern)))
        match = group.M(a, c, e)
        if match.isError():
            return Error("oneOrMore failed")
        return Match()

    def M_consume(self, a, c, e):
        group = T_Group(self.pattern, T_Choice(T_Empty(), T_OneOrMore(self.pattern)))
        return group.M_consume(a, c, e)

    def M_interleave(self, a, c, e):
        group = T_Group(self.pattern, T_Choice(T_Empty(), T_OneOrMore(self.pattern)))
        return group.M_interleave(a, c, e)
        

class T_Group(Pattern):
    def __init__(self, pattern_1=None, pattern_2=None):
        self.pattern_1 = pattern_1
        self.pattern_2 = pattern_2

    def display(self):
        print "(GROUP",
        self.pattern_1.display()
        self.pattern_2.display()
        print ")",

    def M(self, a, c, e):
        if debug_output:
            print
            print "group %s M where" % T_Group.count,
            print "a is",
            print a
            print "c is [",
            for node in c:
                node.display()
            print "]"
            print "group pattern 1 is",
            self.pattern_1.display()
            print
        match_1 = self.pattern_1.M_consume(a,c,e)
        if debug_output:
            print "match from consuming group pattern 1 is",
            match_1.display()
            print
        if match_1.isError():
            return Error("first pattern of group failed", match_1)
        if debug_output:
            print "group pattern 2 is",
            self.pattern_2.display()
            print
	for remainder in match_1.remainders:
	    a_2 = remainder.a
            c_2 = remainder.c
            if debug_output:
                print "about to see how group pattern 2 goes where c is [",
                for node in c_2:
                    node.display()
                print "]"
                T_Group.count = T_Group.count + 1
                if T_Group.count > 100:
                    print "STOPPING TO AVOID INFINITE LOOP"
                    import sys; sys.exit(1)
            match = self.pattern_2.M(a_2, c_2, e)
            if not match.isError():
                if debug_output:
                    print "group pattern 2 succeeded"
                return Match()
            if debug_output:
                print "that remainder failed"
        if debug_output:
            print "second pattern of group failed"
        return Error("second pattern of group failed", match)

    def M_consume(self, a, c, e):
        match_1 = self.pattern_1.M_consume(a,c,e)
        if match_1.isError():
            return Error("first pattern of group failed", match_1)
        match = Match()
        for remainder in match_1.remainders:
            a_2 = remainder.a
            c_2 = remainder.c
            match_2 = self.pattern_2.M_consume(a_2, c_2, e)
            if not match_2.isError():
                match.add(match_2)
        return match

    def M_interleave(self, a, c, e):
        # TODO I'm not 100% what it means to interleave a group (eg does order matter?)
        match_1 = self.pattern_1.M_interleave(a,c,e)
        if match_1.isError():
            return Error("first pattern of group failed", match_1)
        match = Match()
        for remainder in match_1.remainders:
            a_2 = remainder.a
            c_2 = remainder.c
            match_2 = self.pattern_2.M_interleave(a_2, c_2, e)
            if not match_2.isError():
                match.add(match_2)
        return match

if debug_output:
    T_Group.count  = 0

class T_Grammar(Pattern):
    def __init__(self):
        self.start = None
        self.definitions = {}

    def display(self):
        print "(GRAMMAR",
        self.start.display()
        for definition in self.definitions.keys():
            print "(%s=" % definition,
            self.definitions[definition].display()
            print ")",
        print ")",

    def add_definition(self, name, definition):
        self.definitions[name] = definition

    def M(self, a, c, e):
        return self.start.M(a, c, Environment(self.definitions, e))


class T_Ref(Pattern):
    def __init__(self, name):
        self.name = name
        
    def display(self):
        print "(REF =%s)" % self.name,

    def M(self, a, c, e):
        # TODO currently assumes parent="false"
        if not e.e.has_key(self.name):
            return Error("ref to unknown pattern '%s'" % self.name)
        else:
            pattern = e.e[self.name]
            return pattern.M(a, c, e)

    def M_consume(self, a, c, e):
        # TODO currently assumes parent="false"
        if not e.e.has_key(self.name):
            return Error("ref to unknown pattern '%s'" % self.name)
        else:
            pattern = e.e[self.name]
            return pattern.M_consume(a, c, e)


class NameClass:
    """Common base class for name classes; defines nothing itself."""


class ExpandedName(NameClass):
    def __init__(self, namespaceURI=None, NCName=None):
        self.namespaceURI = namespaceURI
        self.NCName = NCName

    def display(self):
        print "(EXPANDED-NAME '%s' '%s')" % (self.namespaceURI, self.NCName),

    def C(self, n):
        if self.namespaceURI==n.namespaceURI and self.NCName==n.localName:
            return Match()
        else:
            return Error("expanded name doesn't match: %s^%s != %s^%s" % (self.namespaceURI, self.NCName, n.namespaceURI, n.localName))


########################################################################
### INSTANCE REPRESENTATION
#
# Basically the instance data model from section 2
#

class I_Node:
    """Common base class for the instance nodes; defines nothing itself."""


class I_Root(I_Node):
    def __init__(self):
        self.children = []

    def add_child(self, node):
        self.children.append(node)
        
    def is_whitespace(self):
        return 0

    def is_element(self):
        return 0

    def display(self):
        print "(ROOT",
        for child in self.children:
            child.display()
        print ")"


class I_ExpandedName:
    """Name of an instance node: a namespace URI plus a local name."""

    def __init__(self, namespaceURI, localName):
        self.namespaceURI, self.localName = namespaceURI, localName


class I_Element(I_Node):
    def __init__(self):
        self.expanded_name = None
        self.attributes = []
        self.children = []

    def add_child(self, node):
        self.children.append(node)

    def add_attribute(self, node):
        self.attributes.append(node)

    def is_whitespace(self):
        return 0

    def is_element(self):
        return 1

    def display(self):
        print "(%s" % self.expanded_name.localName,
        for attr in self.attributes:
            attr.display()
        for child in self.children:
            child.display()
        print ")",


class I_Attribute(I_Node):
    def __init__(self, expanded_name=None, value=None):
        self.expanded_name = expanded_name
        self.value = value

    def is_whitespace(self):
        return 0

    def is_element(self):
        return 1

    def display(self):
        print "(@%s" % self.expanded_name.localName,
        self.value[0].display()
        print ")",
        

class I_CharData(I_Node):
    def __init__(self, data):
        self.data = data
        
    def is_whitespace(self):
        for char in self.data:
            if char not in [chr(9),chr(10),chr(13),chr(32)]:
                return 0
        return 1

    def is_element(self):
        return 0

    def display(self):
        print "'%s'" % self.data,


########################################################################
### INSTANCE PARSING

# TODO wellformedness errors don't seem to get reported

def parse_Instance(location, baseURI=None):
    """Parse the XML document at `location` and return its instance tree.

    location -- anything urlopen() accepts
    baseURI  -- base passed to the parser; defaults to `location`

    The document is parsed with a namespace-aware expat parser that uses
    "^" as the separator between namespace URI and local name.  A parse
    error is reported on stderr (line and column only) and is not raised;
    the product of the root handler is returned in either case.  The
    stream is closed even if parsing raises some other exception.
    """
    if baseURI is None:
        baseURI = location

    import pyexpat
    parser = pyexpat.ParserCreate(namespace_separator="^")
    parser.SetBase(baseURI)
    parser.returns_unicode = 1

    i = I_RootHandler(parser, None)

    from urllib import urlopen
    f = urlopen(location)
    # Nested try blocks rather than try/except/finally, so the code also
    # runs on interpreters that predate the combined form.
    try:
        try:
            parser.ParseFile(f)
        except pyexpat.error:
            import sys
            sys.stderr.write(u"Error parsing file at line '%s' and column '%s'\n" % (parser.ErrorLineNumber, parser.ErrorColumnNumber) )
            sys.stderr.flush()
    finally:
        f.close()

    return i.product


class I_RootHandler(HandlerBase):
    """Parser handler that builds the I_Root of an instance.

    The finished tree is available as `product`.
    """

    def __init__(self, parser, parent):
        HandlerBase.__init__(self, parser, parent)
        self.product = I_Root()

    def set_handlers(self):
        """Point the parser's element and character callbacks at this
        handler."""
        parser = self.parser
        parser.StartElementHandler = self.child
        parser.CharacterDataHandler = self.char
        parser.EndElementHandler = self.end

    def add_child(self, node):
        """Append `node` to the root being built."""
        self.product.add_child(node)

    def child(self, name, atts):
        # The new handler is not kept here; presumably it registers
        # itself with the parser through HandlerBase -- confirm.
        I_ElementHandler(self.parser, self, name, atts)

    def char(self, data):
        self.add_child(I_CharData(data))

    def end(self, name):
        HandlerBase.end(self, name)


def _split_expanded_name(name, separator="^"):
    """Split a parser-reported name into (namespaceURI, localName).

    A name without the separator has no namespace and yields an empty
    URI.  Otherwise the split is made at the LAST separator, so a
    namespace URI that itself contains the separator character is kept
    whole.
    """
    cut = name.rfind(separator)
    if cut < 0:
        return "", name
    return name[:cut], name[cut + len(separator):]


class I_ElementHandler(HandlerBase):
    """Parser handler that builds one I_Element.

    The element's name and attributes are set up in the constructor;
    children are added as the parser reports them, and the finished
    element is handed to the parent handler when the element ends.
    """

    def __init__(self, parser, parent, name, atts):
        """
        name -- element name as reported by the parser ("uri^local", or
                just "local" when there is no namespace)
        atts -- mapping from attribute name (same format) to value
        """
        HandlerBase.__init__(self, parser, parent)
        self.product = I_Element()
        namespaceURI, localName = _split_expanded_name(name)
        self.product.expanded_name = I_ExpandedName(namespaceURI, localName)
        for attr in atts.keys():
            namespaceURI, localName = _split_expanded_name(attr)
            # The attribute value is stored as a one-item list of
            # character data.
            self.product.add_attribute(I_Attribute(I_ExpandedName(namespaceURI, localName), [I_CharData(atts[attr])]))

    def set_handlers(self):
        """Point the parser's element and character callbacks at this
        handler."""
        self.parser.StartElementHandler = self.child
        self.parser.CharacterDataHandler = self.char
        self.parser.EndElementHandler = self.end

    def child(self, name, atts):
        # The new handler is not kept here; presumably it registers
        # itself with the parser through HandlerBase -- confirm.
        I_ElementHandler(self.parser, self, name, atts)

    def char(self, data):
        self.product.add_child(I_CharData(data))

    def end(self, name):
        # Hand the finished element to the parent before the base class
        # processes the end of the element.
        self.parent.add_child(self.product)
        HandlerBase.end(self, name)

    def add_child(self, node):
        """Append `node` to the element being built."""
        self.product.add_child(node)



########################################################################
### MAIN LINE

import sys
if len(sys.argv)==3:
    match = validate(parse_TREX(sys.argv[1]),parse_Instance(sys.argv[2]))
    if match.isError():
        match.display()
    else:
        print "match"

