# Copyright (c) 2002, Daniel Krech, http://eikeon.com/
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#   * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
#   * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
#   * Neither the name of Daniel Krech nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
RDF/XML parser.
"""

from urlparse import urljoin, urldefrag
from urllib2 import urlopen, Request

from rdflib.URIRef import URIRef
from rdflib.Literal import Literal
from rdflib.BNode import BNode
from rdflib.Namespace import Namespace
from rdflib.exception import ParserError

from rdflib import __version__

# HTTP headers sent with the request built in Parser.parse_URI: the
# media types we are prepared to accept, and a User-agent naming this
# library and its version.
headers = {
    'Accept': ('text/xml, application/xml, application/rdf+xml, '
               'application/xml+rdf, text/plain, application/xhtml+xml, '
               'application/*, */*'),
    'User-agent': ('rdflib-%s (http://rdflib.net/; eikeon@eikeon.com)'
                   % __version__),
    }

## 3.4 The RDF Namespace

## The RDF Namespace URI is
## http://www.w3.org/1999/02/22-rdf-syntax-ns# and is typically used
## in XML with the prefix rdf although other prefix strings may be
## used.

# Namespace for the RDF vocabulary.  The constants below are obtained
# by indexing it with a local name.
RDFNS = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

## The namespace contains the following names only:

## RDF Description
## ID about bagID parseType resource
## li

# Syntax terms.  NODE_ID (rdf:nodeID) is defined here as well, although
# it does not appear in the list quoted above.
RDF = RDFNS["RDF"]
DESCRIPTION = RDFNS["Description"]
ID = RDFNS["ID"]
NODE_ID = RDFNS["nodeID"]
ABOUT = RDFNS["about"]
BAG_ID = RDFNS["bagID"]
PARSE_TYPE = RDFNS["parseType"]
RESOURCE = RDFNS["resource"]
LI = RDFNS["li"]

## The above terms are only used in the syntax and are not concepts in
## the graph.

## Seq Bag Alt Statement Property
## subject predicate object
## type value
## _n

# Terms that are concepts in the graph (container classes, reification
# vocabulary, rdf:type and rdf:value).
SEQ = RDFNS["Seq"]
BAG = RDFNS["Bag"]
ALT = RDFNS["Alt"]
STATEMENT = RDFNS["Statement"]
PROPERTY = RDFNS["Property"]
SUBJECT = RDFNS["subject"]
PREDICATE = RDFNS["predicate"]
OBJECT = RDFNS["object"]
TYPE = RDFNS["type"]
VALUE = RDFNS["value"]

# Collection vocabulary, used when handling parseType="Collection".
LIST = RDFNS["List"]
FIRST = RDFNS["first"]
REST = RDFNS["rest"]
NIL = RDFNS["nil"]

## where n is a non-negative integer. These are either RDF Classes
## (first 5) or Properties (remainder) in the graph.

## Any other names are not defined and SHOULD generate a warning when
## encountered in an application, but should otherwise behave
## normally, and treated as properties and/or classes as appropriate
## for their use.

## Throughout this document the terminology rdf:name will be used to
## indicate name is from the RDF namespace and it has a URI of the
## concatenation of the RDF Namespace URI and name. For example,
## rdf:type has the URI
## http://www.w3.org/1999/02/22-rdf-syntax-ns#type

## Note: In the 18 December 2001 Working Draft the names aboutEach and
## aboutEachPrefix were removed from the language and the RDF
## namespace by the RDF Core Working Group. See the resolution of
## issues rdfms-abouteach and rdfms-abouteachprefix for further
## information. The Working Group invites feedback from the community
## on the effects of this on existing implementations and documents
## and on the costs and benefits of adopting a new namespace URI to
## reflect this change (currently not proposed by the Working Group).


# The XML namespace.  BASE is the name under which DocumentHandler
# looks up an element's xml:base attribute.
XMLNS = Namespace("http://www.w3.org/XML/1998/namespace")
BASE = XMLNS["base"]

# OLD names: kept only so that their use can be reported as an error.
ABOUT_EACH = RDFNS["aboutEach"]
ABOUT_EACH_PREFIX = RDFNS["aboutEachPrefix"]


class BagID(URIRef):
    """URI of an rdf:bagID bag that also numbers the bag's members.

    The instance carries a counter, ``li``, of the membership
    properties handed out so far by next_li().
    """

    def __init__(self, val):
        # NOTE(review): super() is anchored at URIRef rather than BagID,
        # so any URIRef.__init__ is skipped -- confirm this is intended.
        super(URIRef, self).__init__(val)
        self.li = 0

    def next_li(self):
        """Advance the counter and return the matching rdf:_n property."""
        self.li = self.li + 1
        member = "_%s" % self.li
        return URIRef(RDFNS + member)

class ElementHandler(object):
    """Callbacks and parse state for the elements at one nesting level.

    DocumentHandler keeps a stack of these.  The ``start``, ``char``
    and ``end`` callbacks are filled in by whichever handler method
    processed the enclosing element; the remaining attributes hold the
    state gathered while the element is being processed.
    """

    def __init__(self):
        # Callbacks, assigned by DocumentHandler before they are used.
        self.start = None
        self.char = None
        self.end = None
        # Counter behind next_li().
        self.li = 0
        # State recorded by DocumentHandler's node / property callbacks.
        self.id = None
        self.bag_id = None
        self.base = None
        self.subject = None
        self.list = None
        # These two are assigned from outside by DocumentHandler; they
        # are declared here so that every handler always has them.
        self.predicate = None
        self.object = None

    def next_li(self):
        """Advance the rdf:li counter and return the matching rdf:_n."""
        self.li += 1
        return URIRef(RDFNS + "_%s" % self.li)

    
class DocumentHandler(object):
    def __init__(self, parser, add):
        """Attach this handler to an expat parser.

        parser -- object whose StartElementHandler, CharacterDataHandler
                  and EndElementHandler attributes are pointed at this
                  handler's dispatch methods
        add    -- callable, invoked as add(subject, predicate, object)
                  for every statement produced
        """
        self.parser, self.add = parser, add
        self.ids = {}    # remember IDs we have already seen
        self.bnode = {}  # rdf:nodeID value -> blank node

        # The bottom of the stack handles the document element; it
        # ignores character data and the end tag.
        root = ElementHandler()
        root.start = self.document_element_start
        root.char = lambda data: None
        root.end = lambda name: None
        self.stack = [root]

        parser.StartElementHandler = self.dispatch_start
        parser.CharacterDataHandler = self.dispatch_char
        parser.EndElementHandler = self.dispatch_end

    def add_reified(self, sid, s, p, o):
        """Add the four statements describing (s, p, o) as statement sid."""
        description = (
            (TYPE, STATEMENT),
            (SUBJECT, s),
            (PREDICATE, p),
            (OBJECT, o),
            )
        for predicate, value in description:
            self.add(sid, predicate, value)

    def error(self, message):
        """Report a syntax problem by raising ParserError(message)."""
        problem = ParserError(message)
        raise problem
    
    def get_current(self):
        return self.stack[-2]
    # Create a read only property called current so that self.current
    # give the current element handler.
    current = property(get_current)

    def get_next(self):
        return self.stack[-1]
    # Create a read only property that gives the element handler to be
    # used for the next element.
    next = property(get_next)

    def get_parent(self):
        if len(self.stack)>=3:
            return self.stack[-3]
        else:
            return None
    # Create a read only property that gives the current parent
    # element handler
    parent = property(get_parent)

    def absolutize(self, uri):
        """Return uri, joined onto the current element's base, as a URIRef."""
        base = self.current.base
        joined = urljoin(base, uri)
        return URIRef(joined)

    def dispatch_start(self, name, atts):
        """Expat start-element callback.

        Pushes a fresh ElementHandler for the element's children, works
        out the base URI in scope for the element, removes XML-reserved
        attributes from atts (in place) and finally calls the start
        callback of the handler responsible for this element.

        name -- name of the element being started
        atts -- mapping from attribute name to attribute value
        """
        self.stack.append(ElementHandler())

        # Base inherited from the enclosing element, falling back to
        # the base the parser was given.
        inherited = None
        parent = self.parent
        if parent:
            inherited = parent.base
        inherited = inherited or self.parser.GetBase()

        base = atts.get(BASE, None)
        if base:
            # A relative xml:base is resolved against the base already
            # in scope; an absolute one comes back from urljoin as is.
            if inherited:
                base = urljoin(inherited, base)
        else:
            base = inherited
        # TODO: Should base ever contain a fragment
        # TODO: If so, could localize this a bit more
        base, frag = urldefrag(base)
        self.current.base = base

        # Iterate over a snapshot of the names: entries are deleted
        # from atts inside the loop.
        for att in list(atts.keys()):
            if att.startswith(XMLNS) or att[0:3].lower() == "xml":
                del atts[att]
        self.current.start(name, atts)

    def dispatch_char(self, data):
        self.current.char(data)
    
    def dispatch_end(self, name):
        self.current.end(name)
        self.stack.pop()

    def document_element_start(self, name, atts):
        """Start callback used until an rdf:RDF element is found (5.2, 5.3).

        Children of rdf:RDF are handled as node elements; children of
        any other element are searched for rdf:RDF in turn.  Character
        data is ignored either way.
        """
        children = self.next
        children.char = lambda data: None
        if name == RDF:
            children.start = self.node_element_start
            children.end = self.node_element_end
        else:
            children.start = self.document_element_start  # keep looking
            children.end = lambda name: None

    def node_element_start(self, name, atts):
        """Start callback for a node element (grammar productions 5.4, 5.5).

        Works out the element's subject (from rdf:ID, rdf:nodeID or
        rdf:about, otherwise a fresh blank node), adds the statements
        implied by the element name and by its property attributes --
        reifying each into the bag when rdf:bagID is given -- and
        stores subject, id and bag_id on the current handler.  The
        handler for child elements is set up to treat them as property
        elements.

        name -- name of the element being started
        atts -- mapping from attribute name to attribute value

        Syntax problems are reported through self.error().
        """
        current = self.current
        children = self.next
        children.start = self.property_element_start
        children.char = self.property_element_char
        children.end = self.property_element_end

        # 5.4, 5.5
        # start_element(URI=anyURI - ( rdf:RDF | rdf:ID | rdf:about |
        # rdf:bagID | rdf:parseType | rdf:resource | rdf:li )
        # attributes=set((idAttr | aboutAttr )?, bagIdAttr?, propertyAttr*))
        # end_element()
        if name in [RDF, ID, ABOUT, BAG_ID, PARSE_TYPE, RESOURCE, LI]:
            self.error("%s not allowed under rdf:RDF" % name)

        if name in [ABOUT_EACH, ABOUT_EACH_PREFIX]:
            self.error("%s is no longer a valid nodeElement" % name)

        # Is there no test for this?
        if ABOUT in atts and ID in atts:
            self.error("Can not have both rdf:ID and rdf:ABOUT")

        # For element e, the processing of some of the attributes has
        # to be done before other work such as dealing with children
        # nodes or other attributes.  These can be processed in any
        # order.
        if ID in atts:
            # If there is an attribute a with a.URI = rdf:ID, then
            # e.subject := identifier(identifier=concat(e.base-uri,
            # "#", a.string-value), identifier-type="URI").
            subject = self.absolutize("#" + atts[ID])
            node_id = subject
            if subject in self.ids:
                self.error("two elements cannot use the same ID: '%s'" %
                           subject)
            # IDs can only appear once within a document
            self.ids[subject] = 1
        elif NODE_ID in atts:
            # rdf:nodeID: the same label always yields the same blank
            # node within this document.
            label = atts[NODE_ID]
            if label in self.bnode:
                subject = self.bnode[label]
            else:
                subject = BNode()
                self.bnode[label] = subject
            self.ids[subject] = 1
            node_id = None  # assumption rdf:nodeID acts like an rdf:about
        elif ABOUT in atts:
            # If there is an attribute a with a.URI = rdf:about then
            # e.subject := identifier(identifier=a.string-value,
            # identifier-type="URI").
            subject = self.absolutize(atts[ABOUT])
            node_id = None
        else:
            # If e.subject is empty, generate a local blank node
            # identifier i and n := identifier(identifier=i,
            # identifier-type="bnodeID"). e.subject := n.
            subject = BNode()
            node_id = None

        if BAG_ID in atts:
            bag_id = BagID(self.absolutize("#" + atts[BAG_ID]))
            # S5 Add the following statement to the model:
            #   n.string-value
            #   <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
            #   <http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag> .
            self.add(bag_id, TYPE, BAG)
        else:
            bag_id = None

        # S1 If e.URI != rdf:Description then the following statement
        # is added to the model: e.subject.string-value
        # <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <e.URI> .
        if name != DESCRIPTION:
            node_type = self.absolutize(name)
            self.add(subject, TYPE, node_type)
            if bag_id:
                s = BNode()
                self.add_reified(s, subject, TYPE, node_type)
                self.add(bag_id, bag_id.next_li(), s)

        # S2 If there is an attribute a in propertyAttr with a.URI =
        # rdf:type then the following statement is added to the model:
        # e.subject.string-value <a.URI> <a.string-value> .
        if TYPE in atts:
            declared_type = self.absolutize(atts[TYPE])
            self.add(subject, TYPE, declared_type)
            if bag_id:
                s = BNode()
                self.add_reified(s, subject, TYPE, declared_type)
                self.add(bag_id, bag_id.next_li(), s)

        # S3 For each attribute a matching propertyAttr (and not
        # rdf:type) then o := literal(literal-value=a.string-value,
        # literal-language=e.language) and the following statement is
        # added to the model: e.subject.string-value <a.URI>
        # o.string-value .
        for att in atts:
            if att in [ABOUT_EACH, ABOUT_EACH_PREFIX]:
                self.error("%s is forbidden as a property attribute name" %
                           att)
            if att == LI:
                self.error("%s not allowed as an attribute name here" % att)
            if att in ["about", "aboutEach", "aboutEachPrefix",
                       "ID", "bagID", "type", "resource", "parseType"]:
                self.error("%s required to be in RDF namespace" % att)

            # Check the following if against:
            # http://www.w3.org/2000/10/rdf-tests/rdfcore/rdfms-rdf-names-use/test-026
            if att != TYPE and att not in [RDF, DESCRIPTION, ID, ABOUT,
                                           BAG_ID, PARSE_TYPE, RESOURCE,
                                           LI, NODE_ID]:
                literal = Literal(atts[att])
                # TODO: lang
                predicate = self.absolutize(att)
                self.add(subject, predicate, literal)
                if bag_id:
                    s = BNode()
                    self.add_reified(s, subject, predicate, literal)
                    self.add(bag_id, bag_id.next_li(), s)

        # S4 Handle the propertyEltList children nodes in document order.

        # Does the bagID stuff really need to get done here... am
        # doing it at the same time as adding S1, S2 etc.

        current.subject = subject
        current.id = node_id
        current.bag_id = bag_id
        
        
    def node_element_end(self, name):
        self.parent.object = self.current.subject
        
    def property_element_start(self, name, atts):
        """Start callback for a property element (grammar production 5.8).

        Records the predicate (rdf:li becomes the next rdf:_n) and any
        rdf:ID on the current handler, validates the element and
        attribute names, decides what the statement's object will be,
        and sets up the handler for child elements according to
        rdf:parseType.  The statement itself is added later by
        property_element_end.

        name -- name of the element being started
        atts -- mapping from attribute name to attribute value

        Syntax problems are reported through self.error().
        """
        current = self.current
        children = self.next

        if name == LI:
            current.predicate = current.next_li()
        else:
            current.predicate = self.absolutize(name)

        current.object = None
        # 5.8 Production propertyElt resourcePropertyElt |
        # literalPropertyElt | parseTypeLiteralPropertyElt |
        # parseTypeResourcePropertyElt | parseTypeOtherPropertyElt |
        # emptyPropertyElt

        if ID in atts:
            current.id = self.absolutize("#" + atts[ID])
        else:
            current.id = None

        if name in [ABOUT_EACH, ABOUT_EACH_PREFIX]:
            self.error("%s is not a valid property element name" % name)

        if name in [RDF, DESCRIPTION, ID, ABOUT, BAG_ID, PARSE_TYPE, RESOURCE]:
            self.error("%s not allowed here" % name)

        if PARSE_TYPE in atts and RESOURCE in atts:
            self.error("Can not have both parseType and resource")

        for att in atts:
            if att in ["about", "aboutEach", "aboutEachPrefix", "ID",
                       "bagID", "type", "resource", "parseType"]:
                self.error("%s required to be in RDF namespace" % att)

        if PARSE_TYPE in atts:
            for att in atts:
                if att not in [ID, BAG_ID, PARSE_TYPE, RESOURCE]:
                    self.error("Property attr '%s' not allowed here" % att)
            parse_type = atts[PARSE_TYPE]
            if parse_type == "Resource":  # propertyEltList
                # The children are property elements describing a new
                # blank node, which is also this statement's object.
                current.subject = current.object = BNode()
                children.start = self.property_element_start
                children.char = self.property_element_char
                children.end = self.property_element_end
            elif parse_type == "Collection":
                children.start = self.node_element_start
                children.char = lambda data: None
                children.end = self.list_node_element_end
            else:
                # "Literal", and every other parseType value as well.
                current.object = Literal("")
                children.start = self.literal_element_start
                children.char = self.literal_element_char
                children.end = self.literal_element_end

        else:
            if len(atts) == 0 or (len(atts) == 1 and ID in atts):
                current.object = Literal("")
            else:
                if RESOURCE in atts:
                    r = self.absolutize(atts[RESOURCE])
                elif NODE_ID in atts:
                    label = atts[NODE_ID]
                    if label in self.bnode:
                        r = self.bnode[label]
                    else:
                        r = BNode()
                        self.bnode[label] = r
                else:
                    r = BNode()

                # Add the following statement to the model:
                # e.parent.subject.string-value <e.URI> r.string-value
                # .  and then if rdf:ID attribute i is given, the above
                # statement is reified with
                # identifier(identifier=concat(e.base-uri, "#",
                # i.string-value), identifier-type="URI") using the
                # reification rules in section 5.26.
                current.object = r  # property_element_end will do the above.

                if BAG_ID in atts:
                    bag_id = BagID(self.absolutize("#" + atts[BAG_ID]))
                    self.add(bag_id, TYPE, BAG)
                else:
                    bag_id = None

                # Remaining attributes are property attributes of r.
                for att in atts:
                    if att in [ID, BAG_ID, PARSE_TYPE, RESOURCE, NODE_ID]:
                        continue
                    if att in [RDF, DESCRIPTION, ABOUT, LI]:
                        # see 5.18
                        self.error(
                            "%s not allowed as a property attribute name"
                            % att)

                    if att == TYPE:
                        p, o = TYPE, Literal(atts[att])
                    else:
                        p, o = self.absolutize(att), Literal(atts[att])
                    self.add(r, p, o)
                    if bag_id:
                        s = BNode()
                        self.add(bag_id, bag_id.next_li(), s)
                        self.add_reified(s, r, p, o)
            children.start = self.node_element_start
            children.char = lambda data: None
            children.end = self.node_element_end

    def property_element_char(self, data):
        """Character-data callback for a property element.

        The first text seen becomes a Literal object; later text is
        appended to it.  Text is ignored once the object is something
        other than a Literal.
        """
        current = self.current
        if current.object is None:
            current.object = Literal(data)
        elif isinstance(current.object, Literal):
            current.object += data

    def property_element_end(self, name):
        current = self.current
        if self.next.end==self.list_node_element_end:
            self.add(current.list, REST, NIL)
            
        #e.parent.subject.string-value <e.URI> n.subject.string-value .
        if current.object!=None:
            self.add(self.parent.subject, current.predicate, current.object)
            if current.id:
                self.add_reified(current.id, self.parent.subject,
                                 current.predicate, current.object)

            if current.bag_id:
                s = current.bag_id
                self.add_reified(s, self.parent.subject,
                                 current.predicate, current.object)
                self.add(s, current.bag_id.next_li(), s)

            if self.parent.bag_id:
                s = current.id or BNode()
                self.add_reified(s, self.parent.subject,
                                 current.predicate, current.object)
                self.add(self.parent.bag_id, self.parent.bag_id.next_li(), s)
        current.subject = None

    def list_node_element_end(self, name):
        """End callback for a node element inside parseType="Collection".

        Adds a new rdf:List cell whose rdf:first is the element's
        subject.  The first cell becomes the parent's object; later
        cells are linked from the previous one through rdf:rest.  The
        parent's ``list`` always points at the newest cell.
        """
        member = self.current.subject
        owner = self.parent

        cell = BNode()
        self.add(cell, TYPE, LIST)
        if not owner.list:
            # First member: this cell is the head of the collection.
            self.add(cell, FIRST, member)
            owner.object = cell
        else:
            self.add(owner.list, REST, cell)
            self.add(cell, FIRST, member)
        owner.list = cell
            
    def literal_element_start(self, name, atts):
        self.error("XML Literals not yet implemented. Ignoring '%s' tag." 
                   % name)
        
        self.next.start = self.literal_element_start
        self.next.char = self.literal_element_char
        self.next.end = self.literal_element_end
        current = self.current
        current.object = ""

    def literal_element_char(self, data):
        current = self.current
        current.object += data
        
    def literal_element_end(self, name):
        current = self.current
        self.parent.object += current.object
        
# Mapping from unqualified names to their corresponding qualified
# names.  LaxDocumentHandler.dispatch_start uses it to rename
# attributes written without the RDF namespace before the normal
# processing runs.
lax_map = {
    "about" : ABOUT,
    "ID" : ID,
    "bagID" : BAG_ID,
    "type" : TYPE,
    "resource": RESOURCE,
    "parseType": PARSE_TYPE,
    }
class LaxDocumentHandler(DocumentHandler):
    """DocumentHandler variant that is forgiving about sloppy input.

    Attributes listed in lax_map are accepted without the RDF
    namespace, and errors are printed instead of raised.
    """

    def dispatch_start(self, name, atts):
        """Rename unqualified RDF attributes in atts, then dispatch."""
        for plain, qualified in lax_map.items():
            if plain in atts:
                atts[qualified] = atts[plain]
                del atts[plain]
        super(LaxDocumentHandler, self).dispatch_start(name, atts)

    def error(self, message):
        """Print the message and carry on."""
        print(message)


class Parser(object):
    """Front end that feeds RDF/XML through expat.

    Statements are reported by calling self.add(subject, predicate,
    object).  This class does not define ``add`` itself; presumably a
    subclass or mixin supplies it -- TODO confirm.

    ``strict`` is the default mode: true selects DocumentHandler, false
    selects LaxDocumentHandler.
    """

    def __init__(self):
        super(Parser, self).__init__()
        self.strict = 1

    def parse(self, file, baseURI, strict=None):
        """Parse RDF/XML from an open file and then close the file.

        file    -- open file-like object handed to expat's ParseFile;
                   it is closed even when parsing fails
        baseURI -- base URI given to the expat parser
        strict  -- true for strict parsing, false for lax parsing,
                   None (the default) to use self.strict
        """
        # Only fall back to the instance default when the caller gave
        # no value: an explicit false value must select lax parsing.
        if strict is None:
            strict = self.strict
        from xml.parsers.expat import ParserCreate
        try:
            parser = ParserCreate(namespace_separator="")
            parser.returns_unicode = 0
            # The handler registers its callbacks on the parser, which
            # is what keeps it alive; no reference is needed here.
            if strict:
                DocumentHandler(parser, self.add)
            else:
                LaxDocumentHandler(parser, self.add)
            parser.SetBase(baseURI)
            parser.ParseFile(file)
        finally:
            file.close()

    def parse_URI(self, location, baseURI=None, strict=None):
        """Fetch location over HTTP and parse the response.

        location -- URL to fetch, sent with the module-level headers
        baseURI  -- base URI for the document; defaults to location
        strict   -- as for parse()
        """
        if strict is None:
            strict = self.strict
        baseURI = baseURI or location
        req = Request(location, None, headers)
        stream = urlopen(req)
        self.parse(stream, baseURI, strict)


  
