/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.filehandler; import org.w3c.dom.*; import javax.xml.parsers.*; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import java.io.File; import java.io.FileNotFoundException; import java.io.Reader; /** * Builds a DOM {@link org.w3c.dom.Document} using a * {@link javax.xml.stream.XMLStreamReader}. * * @version $Revision: 1.00 $, $Date: 2004/12/11 00:00:00 $ * @author Tatu Saloranta * * Major revisions by Damon May -- this class searches for a start tag with a given name and * produces the DOM tree that represents that node. */ public class Stax2DomBuilder { // // // Configuration settings: /** * Whether ignorable white space should be ignored, ie not added * in the resulting JDOM tree. If true, it will be ignored; if false, * it will be added in the tree. Default value if false. */ protected boolean mCfgIgnoreWs = false; protected boolean mNsAware = true; // // Trivial caching... protected String mLastPrefix = null; protected String mLastLocalName = null; protected String mLastQName = null; //Default document for use in generating DOM nodes protected Document _defaultDocument = null; protected Reader _fileReader = null; // streamreader persisted for multiple sequential checks of the same file protected XMLStreamReader _docStreamReader = null; /** * Creates a new XMLStreamReader for the file and prepares to use it. This should * be the only constructor exposed, since _docStreamReader is required by the parsing methods * @param file * @throws FileNotFoundException * @throws XMLStreamException */ public Stax2DomBuilder(File file) throws FileNotFoundException, XMLStreamException { _fileReader = new java.io.FileReader(file); javax.xml.stream.XMLInputFactory f = javax.xml.stream.XMLInputFactory.newInstance(); _docStreamReader = f.createXMLStreamReader(_fileReader); } /** * prepares to read from the given streamreader * @param xmlStreamReader */ public Stax2DomBuilder(XMLStreamReader xmlStreamReader) { _docStreamReader = xmlStreamReader; } /** * Method used to change whether the build methods will add ignorable * (element) white space in the DOM tree or not. *<p> * Whether all-whitespace text segment is ignorable white space or * not is based on DTD read in, as per XML specifications (white space * is only significant in mixed content or pure text elements). */ public void setIgnoreWhitespace(boolean state) { mCfgIgnoreWs = state; } /** * Creates a default Document for node creation, if necessary, and returns it * @return */ protected Document getDefaultDocument() { if (_defaultDocument == null) { try { _defaultDocument = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument(); } catch (Exception e) { e.printStackTrace(System.err); } } return _defaultDocument; } /** * This cover method will create a {@link org.w3c.dom.Document} instance using * the default JAXP mechanism and * populate using the given StAX stream reader. * * @return <code>Document</code> - DOM document object. * @throws XMLStreamException If the reader threw such exception (to * indicate a parsing or I/O problem) */ public Node findTreeForName(String nodeName, String endNodeNameToStop) throws ParserConfigurationException, XMLStreamException { return findTreeForName(getDefaultDocument(), nodeName, endNodeNameToStop); // return findTreeForName(DocumentBuilderFactory.newInstance().newDocumentBuilder(), nodeName); } /** * This cover method uses the passed-in DocumentBuilder to build a new doc * @param docbuilder * @param nodeName * @return * @throws XMLStreamException */ public Node findTreeForName(DocumentBuilder docbuilder, String nodeName, String endNodeNameToStop) throws XMLStreamException { Document doc = docbuilder.newDocument(); return findTreeForName(doc, nodeName, endNodeNameToStop); } /** * This method uses the class <code>XMLStreamReader</code> and builds up * a DOM tree for the first occurrence of a tag with name nodeName, using the * passed-in <code>Document</code>. Recursion has been eliminated by using nodes' * parent/child relationship; this improves performance somewhat * (classic recursion-by-iteration-and-explicit stack transformation) * * @param doc DOM <code>Document</code> for use in constructing Nodes * @param nodeName the name of the tag to match * @param endElementNameToStop When we see an end tag that matches this name, * if we're not currently building a tree we'll stop. If null, search whole doc * @return a Node containing the tree representing the first occurrence * of a tag with name nodeName in the document. If none found, return null */ public Node findTreeForName(Document doc, String nodeName, String endElementNameToStop) throws XMLStreamException { checkReaderSettings(_docStreamReader); //Top level node. In XmlBeans 2.1, they introduced a check to make sure you don't //have more than one node at the top level. Their check is a little strange in that //it always seems to stop you from appending a node to a document. So we can't just //add our top node directly to the document, we have to create this one level of indirection. Node top = doc.createElement("dummy"); Node current = top; //this will hold the root of the tree we want to keep, if we find it Node matchingTreeRoot = null; //reflects whether we're inside the tree we actually want to return. //If so, great, build everything normally. If not, refuse to do anything //for any element. boolean insideTree = false; main_loop: while (true) { int evtType = _docStreamReader.next(); Node child = null; switch (evtType) { case XMLStreamConstants.CDATA: if (insideTree) child = doc.createCDATASection(_docStreamReader.getText()); break; case XMLStreamConstants.SPACE: if (mCfgIgnoreWs) { continue main_loop; } /* Oh great. DOM is brain-dead in that ignorable white space * can not be added, even though it is legal, and often * reported by StAX/SAX impls... */ if (current == top) { // better just ignore, thus... continue; } // fall through case XMLStreamConstants.CHARACTERS: if (insideTree) child = doc.createTextNode(_docStreamReader.getText()); break; case XMLStreamConstants.COMMENT: if (insideTree) child = doc.createComment(_docStreamReader.getText()); break; //probably shouldn't get here case XMLStreamConstants.END_DOCUMENT: { break main_loop; } //check whether we've found the end tag of the root node of our tree, or //the end tag that indicates we should stop. //if so, we're done case XMLStreamConstants.END_ELEMENT: current = current.getParentNode(); if (current == null) { current = top; } //rely on the fact that our root tree node is assigned directly to the doc, //so stop if the parent is the doc itself. //Also stop if we're NOT building a tree right now and we hit our sentinel String thisEndNodeName = _docStreamReader.getLocalName(); if ((insideTree && current == top) || (!insideTree && thisEndNodeName.equals(endElementNameToStop))) { break main_loop; } continue main_loop; case XMLStreamConstants.ENTITY_DECLARATION: case XMLStreamConstants.NOTATION_DECLARATION: /* Shouldn't really get these, but maybe some stream readers * do provide the info. If so, better ignore it -- DTD event * should have most/all we need. */ continue main_loop; case XMLStreamConstants.ENTITY_REFERENCE: if (insideTree) child = doc.createEntityReference(_docStreamReader.getLocalName()); break; case XMLStreamConstants.PROCESSING_INSTRUCTION: if (insideTree) child = doc.createProcessingInstruction(_docStreamReader.getPITarget(), _docStreamReader.getPIData()); break; case XMLStreamConstants.START_ELEMENT: // Ok, need to add a new element... { String ln = _docStreamReader.getLocalName(); Element newElem; boolean thisIsRootNode = false; if (!insideTree) { //check to see if we should start the tree up now if (nodeName.equals(ln)) { insideTree = true; //indicate that, once the node is built, it should be saved //as the root of our tree thisIsRootNode = true; } } //if we're not inside the tree, don't bother if (!insideTree) continue main_loop; if (mNsAware) { String elemPrefix = _docStreamReader.getPrefix(); // Doh, DOM requires a silly qualified name... if (elemPrefix != null && elemPrefix.length() > 0) { newElem = doc.createElementNS(_docStreamReader.getNamespaceURI(), getQualified(elemPrefix, ln)); } else { newElem = doc.createElementNS(_docStreamReader.getNamespaceURI(), ln); } } else { // if non-ns-aware, things are simpler: newElem = doc.createElement(ln); } if (thisIsRootNode) { matchingTreeRoot = newElem; } /* No need to check namespace bindings, unlikes with some * other frameworks (JDOM) */ // And then the attributes: for (int i = 0, len = _docStreamReader.getAttributeCount(); i < len; ++i) { ln = _docStreamReader.getAttributeLocalName(i); if (mNsAware) { String prefix = _docStreamReader.getAttributePrefix(i); if (prefix != null && prefix.length() > 0) { ln = getQualified(prefix, ln); } Attr attr = doc.createAttributeNS(_docStreamReader.getAttributeNamespace(i), ln); attr.setValue(_docStreamReader.getAttributeValue(i)); newElem.setAttributeNodeNS(attr); } else { Attr attr = doc.createAttribute(ln); attr.setValue(_docStreamReader.getAttributeValue(i)); newElem.setAttributeNode(attr); } } // And then 'push' new element... current.appendChild(newElem); current = newElem; continue main_loop; } case XMLStreamConstants.START_DOCUMENT: /* This should only be received at the beginning of document... * so, should we indicate the problem or not? */ /* For now, let it pass: maybe some (broken) readers pass * that info as first event in beginning of doc? */ continue main_loop; case XMLStreamConstants.DTD: /* !!! Note: StAX does not expose enough information about * doctype declaration (specifically, public and system id!); * (altough StAX2 would...) * * Worse, DOM1/2 do not specify a way to create the DocType * node, even if StAX provided it. This is pretty silly, * all in all. */ continue main_loop; // Should never get these, from a stream reader: /* (commented out entries are just FYI; default catches * them all) */ //case XMLStreamConstants.ATTRIBUTE: //case XMLStreamConstants.NAMESPACE: default: throw new XMLStreamException("Unrecognized iterator event type: "+_docStreamReader.getEventType()+"; should not receive such types (broken stream reader?)"); } if (child != null && insideTree) { current.appendChild(child); } } return matchingTreeRoot; } // // // Overridable helper methods: protected String getQualified(String prefix, String localName) { /* This mostly/only helps with empty/text-only elements... * might make sense to do 'real' caching... */ if (localName == mLastLocalName && prefix == mLastPrefix) { return mLastQName; } String qn = prefix + ":" + localName; mLastQName = qn; return qn; } protected void checkReaderSettings(XMLStreamReader r) throws XMLStreamException { Object o = r.getProperty(XMLInputFactory.IS_NAMESPACE_AWARE); /* StAX defaults to namespace aware, so let's use similar * logics (although all compliant implementations really should * return a valid value) */ if ((o instanceof Boolean) && !((Boolean) o).booleanValue()) { mNsAware = false; } else { mNsAware = true; } } /** * Free up resources, in this case just the streamreader */ public void dispose() { try { if (_docStreamReader != null) _docStreamReader.close(); } catch (Exception e) {} try { if (_fileReader != null) _fileReader.close(); } catch (Exception e) {} } }