package lux.xml; import java.io.InputStream; import java.io.Reader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import javax.xml.namespace.NamespaceContext; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import javax.xml.stream.util.StreamReaderDelegate; import net.sf.saxon.Configuration; import net.sf.saxon.event.PipelineConfiguration; import net.sf.saxon.evpull.BracketedDocumentIterator; import net.sf.saxon.evpull.Decomposer; import net.sf.saxon.evpull.EventIterator; import net.sf.saxon.evpull.EventToStaxBridge; import net.sf.saxon.evpull.SingletonEventIterator; import net.sf.saxon.om.DocumentInfo; import net.sf.saxon.om.NodeInfo; import org.codehaus.stax2.XMLInputFactory2; import com.ctc.wstx.api.WstxInputProperties; /** * Reads XML and passes events to a brigade of StAXHandlers. Essentially * turns StAX into push model parser a'la SAX. * * @author sokolov * */ public class XmlReader { private XMLInputFactory inputFactory; private ArrayList<StAXHandler> handlers = new ArrayList<StAXHandler>(); private boolean stripNamespaces; /** * Consume the character stream, generating events for the handlers. * * @param reader source of xml StAX events * @throws XMLStreamException if the reader does */ public void read (Reader reader) throws XMLStreamException { XMLStreamReader xmlStreamReader = getXMLInputFactory().createXMLStreamReader(reader); if (stripNamespaces) { xmlStreamReader = new NamespaceStrippingXMLStreamReader(xmlStreamReader); } read (xmlStreamReader); } /** * Consume the byte stream, generating events for the handlers. * * @param in source of xml StAX events * @throws XMLStreamException if the reader does */ public void read (InputStream in) throws XMLStreamException { XMLStreamReader xmlStreamReader = getXMLInputFactory().createXMLStreamReader(in); if (stripNamespaces) { xmlStreamReader = new NamespaceStrippingXMLStreamReader(xmlStreamReader); } read (xmlStreamReader); } public void read (InputStream in, String systemID) throws XMLStreamException { XMLStreamReader xmlStreamReader = getXMLInputFactory().createXMLStreamReader(systemID, in); if (stripNamespaces) { xmlStreamReader = new NamespaceStrippingXMLStreamReader(xmlStreamReader); } read (xmlStreamReader); } public void read (NodeInfo node) throws XMLStreamException { Configuration configuration = node.getConfiguration(); if (configuration == null) { // TODO: index a text fragment throw new XMLStreamException ("Attempt to parse non-XML node"); } PipelineConfiguration pipe = configuration.makePipelineConfiguration(); pipe.setHostLanguage(Configuration.XQUERY); XMLStreamReader xmlStreamReader; // copied from net.sf.saxon.xqj.SaxonXQItem if (node instanceof DocumentInfo) { EventIterator eventIterator = new Decomposer(node, pipe); xmlStreamReader = new EventToStaxBridge(eventIterator, pipe); } else { EventIterator contentIterator = new SingletonEventIterator(node); EventIterator eventIterator = new BracketedDocumentIterator(contentIterator); eventIterator = new Decomposer(eventIterator, pipe); xmlStreamReader = new EventToStaxBridge(eventIterator, pipe); } if (stripNamespaces) { xmlStreamReader = new NamespaceStrippingXMLStreamReader(xmlStreamReader); } read (xmlStreamReader); } private XMLInputFactory getXMLInputFactory () { if (inputFactory == null) { // We require Woodstox for its superior character-offset reporting, which // is broken and incomplete in the default (sun) StAX parser in the Oracle JVM. inputFactory = XMLInputFactory2.newInstance(); inputFactory.setProperty (XMLInputFactory.IS_COALESCING, false); inputFactory.setProperty (XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, true); inputFactory.setProperty (XMLInputFactory2.P_REPORT_PROLOG_WHITESPACE, false); inputFactory.setProperty (XMLInputFactory2.RESOLVER, new GentleXmlResolver()); // this doesn't seem to do anything? // inputFactory.setProperty (WstxInputProperties.P_NORMALIZE_LFS, false); inputFactory.setProperty (WstxInputProperties.P_TREAT_CHAR_REFS_AS_ENTS, true); // must set this to 1 in order to get TREAT_CHAR_REFS_AS_ENTS to report entities? inputFactory.setProperty (WstxInputProperties.P_MIN_TEXT_SEGMENT, Integer.valueOf(1)); } return inputFactory; } public void addHandler (StAXHandler handler) { handlers.add (handler); } public List<StAXHandler> getHandlers () { return handlers; } /** * Consume the XML stream, generating events for the handlers. * * @param in source of xml StAX events * @throws XMLStreamException if the reader does */ public void read (XMLStreamReader in) throws XMLStreamException { boolean gotEndDocument = false; // wrap every pass in start/end document?? sendEvent (in, XMLStreamConstants.START_DOCUMENT); while (in.hasNext()) { int event = in.next(); if (event == XMLStreamConstants.START_DOCUMENT) { continue; } sendEvent(in, event); if (event == XMLStreamConstants.END_DOCUMENT) { gotEndDocument = true; } } if (! gotEndDocument) { sendEvent (in, XMLStreamConstants.END_DOCUMENT); } } private void sendEvent(XMLStreamReader in, int event) throws XMLStreamException { for (StAXHandler handler : handlers) { handler.handleEvent (in, event); } } public void reset () { for (StAXHandler handler : handlers) { handler.reset (); } } /** * when true, all namespace information is stripped from the reported events. * The result is as if all namespace declarations and prefixes were removed from the document. * @return whether namespace information is stripped */ public boolean isStripNamespaces() { return stripNamespaces; } public void setStripNamespaces(boolean stripNamespaces) { this.stripNamespaces = stripNamespaces; } class NamespaceStrippingXMLStreamReader extends StreamReaderDelegate implements NamespaceContext { public NamespaceStrippingXMLStreamReader(XMLStreamReader xmlStreamReader) { super (xmlStreamReader); } @Override public String getPrefix () { return ""; } @Override public String getNamespaceURI() { return ""; } @Override public int getNamespaceCount() { return 0; } @Override public String getNamespaceURI(int i) { return ""; } @Override public String getNamespaceURI(String s) { return ""; } @Override public String getAttributePrefix (int i) { return ""; } @Override public String getAttributeNamespace (int i) { return ""; } @Override public NamespaceContext getNamespaceContext () { return this; // return super.getNamespaceContext(); } @Override public String getPrefix(String namespaceURI) { return ""; } @Override public Iterator<String> getPrefixes(String namespaceURI) { return new Iterator<String>() { @Override public boolean hasNext() { return false; } @Override public String next() { return null; } @Override public void remove() { } }; } } } /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */