/* * Copyright Aduna (http://www.aduna-software.com/) (c) 1997-2006. * * Licensed under the Aduna BSD-style license. */ package org.openrdf.rio.rdfxml; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Stack; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import info.aduna.net.ParsedURI; import info.aduna.xml.XMLUtil; import org.openrdf.model.vocabulary.RDF; import org.openrdf.rio.ParseLocationListener; import org.openrdf.rio.RDFHandlerException; import org.openrdf.rio.RDFParseException; /** * A filter on SAX events to make life easier on the RDF parser itself. This * filter does things like combining a call to startElement() that is directly * followed by a call to endElement() to a single call to emptyElement(). */ class SAXFilter implements ContentHandler { /*-----------* * Variables * *-----------*/ /** * The RDF parser to supply the filtered SAX events to. */ private RDFXMLParser rdfParser; /** * A Locator indicating a position in the text that is currently being parsed * by the SAX parser. */ private Locator locator; /** * Stack of ElementInfo objects. */ private Stack<ElementInfo> elInfoStack = new Stack<ElementInfo>(); /** * StringBuilder used to collect text during parsing. */ private StringBuilder charBuf = new StringBuilder(512); /** * The document's URI. */ private ParsedURI documentURI; /** * Flag indicating whether the parser parses stand-alone RDF documents. In * stand-alone documents, the rdf:RDF element is optional if it contains just * one element. */ private boolean parseStandAloneDocuments = true; /** * Variable used to defer reporting of start tags. Reporting start tags is * deferred to be able to combine a start tag and an immediately following * end tag to a single call to emptyElement(). */ private ElementInfo deferredElement = null; /** * New namespace mappings that have been reported for the next start tag by * the SAX parser, but that are not yet assigned to an ElementInfo object. */ private Map<String, String> newNamespaceMappings = new LinkedHashMap<String, String>(); /** * Flag indicating whether we're currently parsing RDF elements. */ private boolean inRDFContext; /** * The number of elements on the stack that are in the RDF context. */ private int rdfContextStackHeight; /** * Flag indicating whether we're currently parsing an XML literal. */ private boolean parseLiteralMode = false; /** * The number of elements on the stack that are part of an XML literal. */ private int xmlLiteralStackHeight; /** * The prefixes that are defined in the XML literal itself (this in contrast * to the namespaces from the XML literal's context). */ private List<String> xmlLiteralPrefixes = new ArrayList<String>(); /** * The prefixes that were used in an XML literal, but that were not defined * in it (but rather in the XML literal's context). */ private List<String> unknownPrefixesInXMLLiteral = new ArrayList<String>(); /*--------------* * Constructors * *--------------*/ public SAXFilter(RDFXMLParser rdfParser) { this.rdfParser = rdfParser; } /*---------* * Methods * *---------*/ public Locator getLocator() { return locator; } public void clear() { locator = null; elInfoStack.clear(); charBuf.setLength(0); documentURI = null; deferredElement = null; newNamespaceMappings.clear(); inRDFContext = false; rdfContextStackHeight = 0; parseLiteralMode = false; xmlLiteralStackHeight = 0; xmlLiteralPrefixes.clear(); unknownPrefixesInXMLLiteral.clear(); } public void setDocumentURI(String documentURI) { this.documentURI = createBaseURI(documentURI); } public void setParseStandAloneDocuments(boolean standAloneDocs) { parseStandAloneDocuments = standAloneDocs; } public boolean getParseStandAloneDocuments() { return parseStandAloneDocuments; } /*---------------------------------------* * Methods from interface ContentHandler * *---------------------------------------*/ public void setDocumentLocator(Locator loc) { locator = loc; ParseLocationListener pll = rdfParser.getParseLocationListener(); if (pll != null) { pll.parseLocationUpdate(loc.getLineNumber(), loc.getColumnNumber()); } } public void startDocument() { // ignore } public void endDocument() { // ignore } public void startPrefixMapping(String prefix, String uri) throws SAXException { try { if (deferredElement != null) { // This new prefix mapping must come from a new start tag reportDeferredStartElement(); } newNamespaceMappings.put(prefix, uri); if (parseLiteralMode) { // This namespace is introduced inside an XML literal xmlLiteralPrefixes.add(prefix); } rdfParser.getRDFHandler().handleNamespace(prefix, uri); } catch (RDFParseException e) { throw new SAXException(e); } catch (RDFHandlerException e) { throw new SAXException(e); } } public void endPrefixMapping(String prefix) { if (parseLiteralMode) { xmlLiteralPrefixes.remove(prefix); } } public void startElement(String namespaceURI, String localName, String qName, Attributes attributes) throws SAXException { try { if (deferredElement != null) { // The next call could set parseLiteralMode to true! reportDeferredStartElement(); } if (parseLiteralMode) { appendStartTag(qName, attributes); xmlLiteralStackHeight++; } else { ElementInfo parent = peekStack(); ElementInfo elInfo = new ElementInfo(parent, qName, namespaceURI, localName); elInfo.setNamespaceMappings(newNamespaceMappings); newNamespaceMappings.clear(); if (!inRDFContext && parseStandAloneDocuments && (!localName.equals("RDF") || !namespaceURI.equals(RDF.NAMESPACE))) { // Stand-alone document that does not start with an rdf:RDF root // element. Assume this root element is omitted. inRDFContext = true; } if (!inRDFContext) { // Check for presence of xml:base and xlm:lang attributes. for (int i = 0; i < attributes.getLength(); i++) { String attQName = attributes.getQName(i); if ("xml:base".equals(attQName)) { elInfo.setBaseURI(attributes.getValue(i)); } else if ("xml:lang".equals(attQName)) { elInfo.xmlLang = attributes.getValue(i); } } elInfoStack.push(elInfo); // Check if we are entering RDF context now. if (localName.equals("RDF") && namespaceURI.equals(RDF.NAMESPACE)) { inRDFContext = true; rdfContextStackHeight = 0; } } else { // We're parsing RDF elements. checkAndCopyAttributes(attributes, elInfo); // Don't report the new element to the RDF parser just yet. deferredElement = elInfo; } } } catch (RDFParseException e) { throw new SAXException(e); } catch (RDFHandlerException e) { throw new SAXException(e); } } private void reportDeferredStartElement() throws RDFParseException, RDFHandlerException { // Only useful for debugging. // if (deferredElement == null) { // throw new RuntimeException("no deferred start element available"); // } elInfoStack.push(deferredElement); rdfContextStackHeight++; rdfParser.setBaseURI(deferredElement.baseURI); rdfParser.setXMLLang(deferredElement.xmlLang); rdfParser.startElement(deferredElement.namespaceURI, deferredElement.localName, deferredElement.qName, deferredElement.atts); deferredElement = null; } public void endElement(String namespaceURI, String localName, String qName) throws SAXException { try { // FIXME: in parseLiteralMode we should also check if start- and // end-tags match but these start tags are not tracked yet. if (rdfParser.verifyData() && !parseLiteralMode) { // Verify that the end tag matches the start tag. ElementInfo elInfo; if (deferredElement != null) { elInfo = deferredElement; } else { elInfo = peekStack(); } if (!qName.equals(elInfo.qName)) { rdfParser.reportFatalError("expected end tag </'" + elInfo.qName + ">, found </" + qName + ">"); } } if (!inRDFContext) { elInfoStack.pop(); charBuf.setLength(0); return; } if (deferredElement == null && rdfContextStackHeight == 0) { // This end tag removes the element that signaled the start // of the RDF context (i.e. <rdf:RDF>) from the stack. inRDFContext = false; elInfoStack.pop(); charBuf.setLength(0); return; } // We're still in RDF context. if (parseLiteralMode && xmlLiteralStackHeight > 0) { appendEndTag(qName); xmlLiteralStackHeight--; return; } // Check for any deferred start elements if (deferredElement != null) { // Start element still deferred, this is an empty element rdfParser.setBaseURI(deferredElement.baseURI); rdfParser.setXMLLang(deferredElement.xmlLang); rdfParser.emptyElement(deferredElement.namespaceURI, deferredElement.localName, deferredElement.qName, deferredElement.atts); deferredElement = null; } else { if (parseLiteralMode) { // Insert any used namespace prefixes from the XML literal's // context that are not defined in the XML literal itself. insertUsedContextPrefixes(); } // Check if any character data has been collected in the charBuf String s = charBuf.toString().trim(); charBuf.setLength(0); if (s.length() > 0 || parseLiteralMode) { rdfParser.text(s); parseLiteralMode = false; } // Handle the end tag elInfoStack.pop(); rdfContextStackHeight--; rdfParser.endElement(namespaceURI, localName, qName); } } catch (RDFParseException e) { throw new SAXException(e); } catch (RDFHandlerException e) { throw new SAXException(e); } } public void characters(char[] ch, int start, int length) throws SAXException { try { if (inRDFContext) { if (deferredElement != null) { reportDeferredStartElement(); } if (parseLiteralMode) { // Characters like '<', '>', and '&' must be escaped to // prevent breaking the XML text. String s = new String(ch, start, length); s = XMLUtil.escapeCharacterData(s); charBuf.append(s); } else { charBuf.append(ch, start, length); } } } catch (RDFParseException e) { throw new SAXException(e); } catch (RDFHandlerException e) { throw new SAXException(e); } } public void ignorableWhitespace(char[] ch, int start, int length) { if (parseLiteralMode) { charBuf.append(ch, start, length); } } public void processingInstruction(String target, String data) { // ignore } public void skippedEntity(String name) { // ignore } private void checkAndCopyAttributes(Attributes attributes, ElementInfo elInfo) throws SAXException, RDFParseException { Atts atts = new Atts(attributes.getLength()); int attCount = attributes.getLength(); for (int i = 0; i < attCount; i++) { String qName = attributes.getQName(i); String value = attributes.getValue(i); // attributes starting with "xml" should be ignored, except for the // ones that are handled by this parser (xml:lang and xml:base). if (qName.startsWith("xml")) { if (qName.equals("xml:lang")) { elInfo.xmlLang = value; } else if (qName.equals("xml:base")) { elInfo.setBaseURI(value); } } else { String namespace = attributes.getURI(i); String localName = attributes.getLocalName(i); // A limited set of unqualified attributes must be supported by // parsers, as is specified in section 6.1.4 of the spec if ("".equals(namespace)) { if (localName.equals("ID") || localName.equals("about") || localName.equals("resource") || localName.equals("parseType") || localName.equals("type")) { rdfParser.reportWarning("use of unqualified attribute " + localName + " has been deprecated"); namespace = RDF.NAMESPACE; } } if (rdfParser.verifyData()) { if ("".equals(namespace)) { rdfParser.reportError("unqualified attribute '" + qName + "' not allowed"); } } Att att = new Att(namespace, localName, qName, value); atts.addAtt(att); } } elInfo.atts = atts; } public void setParseLiteralMode() { parseLiteralMode = true; xmlLiteralStackHeight = 0; // All currently known namespace prefixes are // new for this XML literal. xmlLiteralPrefixes.clear(); unknownPrefixesInXMLLiteral.clear(); } private ParsedURI createBaseURI(String uriString) { if (uriString.length() > 4 && uriString.substring(0, 4).equalsIgnoreCase("jar:")) { // uriString is e.g. // jar:http://www.foo.com/bar/baz.jar!/COM/foo/Quux.class // Treat the part up to and including the exclamation mark as the // scheme and // the rest as the path to enable 'correct' resolving of relative URIs int idx = uriString.indexOf('!'); if (idx != -1) { String scheme = uriString.substring(0, idx + 1); String path = uriString.substring(idx + 1); return new ParsedURI(scheme, null, path, null, null); } } ParsedURI uri = new ParsedURI(uriString); uri.normalize(); return uri; } /*---------------------------------* * Methods related to XML literals * *---------------------------------*/ /** * Appends a start tag to charBuf. This method is used during the parsing of * an XML Literal. */ private void appendStartTag(String qName, Attributes attributes) { // Write start of start tag charBuf.append("<" + qName); // Write any new namespace prefix definitions for (Map.Entry<String, String> entry : newNamespaceMappings.entrySet()) { String prefix = entry.getKey(); String namespace = entry.getValue(); appendNamespaceDecl(charBuf, prefix, namespace); } // Write attributes int attCount = attributes.getLength(); for (int i = 0; i < attCount; i++) { appendAttribute(charBuf, attributes.getQName(i), attributes.getValue(i)); } // Write end of start tag charBuf.append(">"); // Check for any used prefixes that are not // defined in the XML literal itself int colonIdx = qName.indexOf(':'); String prefix = (colonIdx > 0) ? qName.substring(0, colonIdx) : ""; if (!xmlLiteralPrefixes.contains(prefix) && !unknownPrefixesInXMLLiteral.contains(prefix)) { unknownPrefixesInXMLLiteral.add(prefix); } } /** * Appends an end tag to charBuf. This method is used during the parsing of * an XML Literal. */ private void appendEndTag(String qName) { charBuf.append("</" + qName + ">"); } /** * Inserts prefix mappings from an XML Literal's context for all prefixes * that are used in the XML Literal and that are not defined in the XML * Literal itself. */ private void insertUsedContextPrefixes() { int unknownPrefixesCount = unknownPrefixesInXMLLiteral.size(); if (unknownPrefixesCount > 0) { // Create a String with all needed context prefixes StringBuilder contextPrefixes = new StringBuilder(1024); ElementInfo topElement = peekStack(); for (int i = 0; i < unknownPrefixesCount; i++) { String prefix = unknownPrefixesInXMLLiteral.get(i); String namespace = topElement.getNamespace(prefix); if (namespace != null) { appendNamespaceDecl(contextPrefixes, prefix, namespace); } } // Insert this String before the first '>' character int endOfFirstStartTag = charBuf.indexOf(">"); charBuf.insert(endOfFirstStartTag, contextPrefixes.toString()); } unknownPrefixesInXMLLiteral.clear(); } private void appendNamespaceDecl(StringBuilder sb, String prefix, String namespace) { String attName = "xmlns"; if (!"".equals(prefix)) { attName += ":" + prefix; } appendAttribute(sb, attName, namespace); } private void appendAttribute(StringBuilder sb, String name, String value) { sb.append(" "); sb.append(name); sb.append("=\""); sb.append(XMLUtil.escapeDoubleQuotedAttValue(value)); sb.append("\""); } /*------------------------------------------* * Methods related to the ElementInfo stack * *------------------------------------------*/ private ElementInfo peekStack() { ElementInfo result = null; if (!elInfoStack.empty()) { result = elInfoStack.peek(); } return result; } /*----------------------------* * Internal class ElementInfo * *----------------------------*/ private class ElementInfo { public String qName; public String namespaceURI; public String localName; public Atts atts; public ElementInfo parent; private Map<String, String> namespaceMap; public ParsedURI baseURI; public String xmlLang; public ElementInfo(String qName, String namespaceURI, String localName) { this(null, qName, namespaceURI, localName); } public ElementInfo(ElementInfo parent, String qName, String namespaceURI, String localName) { this.parent = parent; this.qName = qName; this.namespaceURI = namespaceURI; this.localName = localName; if (parent != null) { // Inherit baseURI and xmlLang from parent this.baseURI = parent.baseURI; this.xmlLang = parent.xmlLang; } else { this.baseURI = documentURI; this.xmlLang = ""; } } public void setBaseURI(String uriString) { // Resolve the specified base URI against the inherited base URI baseURI = baseURI.resolve(createBaseURI(uriString)); } public void setNamespaceMappings(Map<String, String> namespaceMappings) { if (namespaceMappings.isEmpty()) { namespaceMap = null; } else { namespaceMap = new HashMap<String, String>(namespaceMappings); } } public String getNamespace(String prefix) { String result = null; if (namespaceMap != null) { result = namespaceMap.get(prefix); } if (result == null && parent != null) { result = parent.getNamespace(prefix); } return result; } } }