// Stax2DomBuilder.java example
//
// Explorer
// msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fhcrc.cpl.toolbox.filehandler;

import org.w3c.dom.*;

import javax.xml.parsers.*;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.Reader;

/**
 * Builds DOM nodes from the events of a
 * {@link javax.xml.stream.XMLStreamReader}.
 *
 * Rather than materializing a whole document, this class scans forward through
 * the stream for a start tag with a given local name and produces the DOM
 * subtree rooted at that element. The stream reader is kept between calls, so
 * successive calls continue from wherever the previous call stopped.
 *
 * @version $Revision: 1.00 $, $Date: 2004/12/11 00:00:00 $
 * @author  Tatu Saloranta
 *
 * Major revisions by Damon May -- this class searches for a start tag with a given name and
 * produces the DOM tree that represents that node.
 */
public class Stax2DomBuilder
{
    // // // Configuration settings:

    /**
     * Whether white space reported by the reader as a SPACE event should be
     * dropped, ie not added to the resulting DOM tree. If true, it is dropped;
     * if false, it is added as a text node. Default value is false.
     */
    protected boolean mCfgIgnoreWs = false;

    /**
     * Whether elements and attributes are created with the namespace-aware DOM
     * methods. Refreshed from the reader's properties at the start of every search.
     */
    protected boolean mNsAware = true;

    // // Trivial one-entry cache for getQualified():

    protected String mLastPrefix = null;
    protected String mLastLocalName = null;
    protected String mLastQName = null;

    //Default document for use in generating DOM nodes
    protected Document _defaultDocument = null;

    //Reader opened by the File constructor; null when the stream reader was supplied by the caller
    protected Reader _fileReader = null;

    // streamreader persisted for multiple sequential checks of the same file
    protected XMLStreamReader _docStreamReader = null;

    /**
     * Creates a new XMLStreamReader for the file and prepares to use it.
     *
     * NOTE(review): FileReader decodes with the JVM's default charset, so an
     * encoding named in the XML declaration is not honored -- confirm that this
     * is acceptable for the files being read.
     *
     * @param file the XML file to read
     * @throws FileNotFoundException if the file cannot be opened for reading
     * @throws XMLStreamException if the stream reader cannot be created
     */
    public Stax2DomBuilder(File file) throws FileNotFoundException, XMLStreamException
    {
        _fileReader = new java.io.FileReader(file);
        boolean readerCreated = false;
        try
        {
            _docStreamReader = XMLInputFactory.newInstance().createXMLStreamReader(_fileReader);
            readerCreated = true;
        }
        finally
        {
            //Nobody can call dispose() on a half-constructed instance, so release
            //the file handle here if the stream reader could not be created
            if (!readerCreated)
            {
                try
                {
                    _fileReader.close();
                }
                catch (java.io.IOException ignored)
                {
                    //the failure that brought us here is the one worth reporting
                }
            }
        }
    }

    /**
     * Prepares to read from the given streamreader. The reader is used as-is,
     * from its current position.
     *
     * @param xmlStreamReader the stream reader to pull events from
     */
    public Stax2DomBuilder(XMLStreamReader xmlStreamReader)
    {
        _docStreamReader = xmlStreamReader;
    }

    /**
     * Method used to change whether the build methods will add ignorable
     * (element) white space in the DOM tree or not.
     *<p>
     * Only events that the stream reader itself reports as
     * {@link XMLStreamConstants#SPACE} are affected; white space delivered as
     * ordinary CHARACTERS events is always kept.
     *
     * @param state true to drop SPACE events, false to keep them as text nodes
     */
    public void setIgnoreWhitespace(boolean state) {
        mCfgIgnoreWs = state;
    }

    /**
     * Creates a default Document for node creation, if necessary, and returns it.
     * The document is created once, using the default JAXP mechanism, and reused
     * by later calls.
     *
     * @return the shared default Document, or null if it could not be created
     * (in which case the failure has been printed to System.err)
     */
    protected Document getDefaultDocument()
    {
        if (_defaultDocument == null)
        {
            try
            {
                _defaultDocument = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            }
            catch (Exception e) { e.printStackTrace(System.err); }
        }
        return _defaultDocument;
    }

    /**
     * Cover method that searches for the next element named nodeName, building
     * its nodes with the shared default Document (see {@link #getDefaultDocument()}).
     *
     * @param nodeName local name of the start tag to look for
     * @param endNodeNameToStop local name of an end tag that ends the search
     * when it is seen outside of a matching tree; null to search to the end of the document
     * @return the root of the tree built for the first matching element, or null if none was found
     * @throws ParserConfigurationException if the default Document could not be created
     * @throws XMLStreamException If the reader threw such exception (to
     *   indicate a parsing or I/O problem)
     */
    public Node findTreeForName(String nodeName, String endNodeNameToStop)
        throws ParserConfigurationException, XMLStreamException
    {
        Document doc = getDefaultDocument();
        if (doc == null)
        {
            //getDefaultDocument() has already reported the cause; fail with the declared
            //exception instead of a NullPointerException further down
            throw new ParserConfigurationException("Could not create a default DOM Document for node construction");
        }
        return findTreeForName(doc, nodeName, endNodeNameToStop);
    }

    /**
     * This cover method uses the passed-in DocumentBuilder to build a new doc,
     * which is then used to create the nodes of the returned tree.
     *
     * @param docbuilder builder used to create the Document that owns the new nodes
     * @param nodeName local name of the start tag to look for
     * @param endNodeNameToStop local name of an end tag that ends the search
     * when it is seen outside of a matching tree; null to search to the end of the document
     * @return the root of the tree built for the first matching element, or null if none was found
     * @throws XMLStreamException if the reader reports a parsing or I/O problem
     */
    public Node findTreeForName(DocumentBuilder docbuilder, String nodeName, String endNodeNameToStop)
        throws XMLStreamException
    {
        Document doc = docbuilder.newDocument();
        return findTreeForName(doc, nodeName, endNodeNameToStop);
    }

    /**
     * This method uses the class <code>XMLStreamReader</code> and builds up
     * a DOM tree for the first occurrence of a tag with name nodeName, using the
     * passed-in <code>Document</code>. Recursion has been eliminated by using nodes'
     * parent/child relationship (the "current" node acts as the top of the stack).
     *<p>
     * The search starts at the reader's current position and leaves the reader
     * just after the event that ended it, so the method can be called repeatedly
     * to walk through successive matches. Once the reader has no more events,
     * further calls simply return null.
     *
     * @param doc DOM <code>Document</code> for use in constructing Nodes
     * @param nodeName the local name of the tag to match; must not be null
     * @param endElementNameToStop When we see an end tag that matches this name,
     * if we're not currently building a tree we'll stop.  If null, search whole doc
     * @return a Node containing the tree representing the first occurrence
     * of a tag with name nodeName in the document.  If none found, return null.
     * The returned node is parented by a placeholder element named "dummy".
     * @throws XMLStreamException if the reader reports a parsing or I/O problem,
     * or delivers an event type that a stream reader should never produce
     */
    public Node findTreeForName(Document doc, String nodeName, String endElementNameToStop)
        throws XMLStreamException
    {
        checkReaderSettings(_docStreamReader);

        //Top level node.  In XmlBeans 2.1, they introduced a check to make sure you don't
        //have more than one node at the top level.  Their check is a little strange in that
        //it always seems to stop you from appending a node to a document.  So we can't just
        //add our top node directly to the document, we have to create this one level of indirection.
        Node top = doc.createElement("dummy");
        Node current = top;

        //this will hold the root of the tree we want to keep, if we find it
        Node matchingTreeRoot = null;

        //reflects whether we're inside the tree we actually want to return.
        //If so, great, build everything normally.  If not, refuse to do anything
        //for any element.
        boolean insideTree = false;

        main_loop:

        //hasNext() guards against a reader that an earlier search already ran to
        //END_DOCUMENT; calling next() on it would throw NoSuchElementException
        while (_docStreamReader.hasNext()) {
            int evtType = _docStreamReader.next();
            Node child = null;

            switch (evtType) {
            case XMLStreamConstants.CDATA:
                if (insideTree)
                    child = doc.createCDATASection(_docStreamReader.getText());
                break;

            case XMLStreamConstants.SPACE:
                if (mCfgIgnoreWs) {
                    continue main_loop;
                }
                /* White space outside of the tree being built has nowhere
                 * to go, so drop it.
                 */
                if (current == top) {
                    continue main_loop;
                }
                // fall through: keep the white space as an ordinary text node

            case XMLStreamConstants.CHARACTERS:
                if (insideTree)
                    child = doc.createTextNode(_docStreamReader.getText());
                break;

            case XMLStreamConstants.COMMENT:
                if (insideTree)
                    child = doc.createComment(_docStreamReader.getText());
                break;

            //end of input: nothing (more) to find
            case XMLStreamConstants.END_DOCUMENT:
                break main_loop;

            //check whether we've found the end tag of the root node of our tree, or
            //the end tag that indicates we should stop.
            //if so, we're done
            case XMLStreamConstants.END_ELEMENT:
            {
                //'pop' the element.  Outside of the tree, current is the detached
                //placeholder, whose parent is null, so clamp back to the placeholder
                current = current.getParentNode();
                if (current == null) {
                    current = top;
                }
                //The root of our tree is appended directly to the placeholder, so popping
                //back to the placeholder while building means the tree is complete.
                //Also stop if we're NOT building a tree right now and we hit our sentinel
                String thisEndNodeName = _docStreamReader.getLocalName();
                if ((insideTree && current == top) ||
                    (!insideTree && thisEndNodeName.equals(endElementNameToStop)))
                {
                    break main_loop;
                }
                continue main_loop;
            }

            case XMLStreamConstants.ENTITY_DECLARATION:
            case XMLStreamConstants.NOTATION_DECLARATION:
                /* Shouldn't really get these, but maybe some stream readers
                 * do provide the info. If so, better ignore it -- DTD event
                 * should have most/all we need.
                 */
                continue main_loop;

            case XMLStreamConstants.ENTITY_REFERENCE:
                if (insideTree)
                    child = doc.createEntityReference(_docStreamReader.getLocalName());
                break;

            case XMLStreamConstants.PROCESSING_INSTRUCTION:
                if (insideTree)
                    child = doc.createProcessingInstruction(_docStreamReader.getPITarget(),
                                                            _docStreamReader.getPIData());
                break;

            case XMLStreamConstants.START_ELEMENT:
            {
                String ln = _docStreamReader.getLocalName();

                //set when this element is the one that starts the tree we want
                boolean thisIsRootNode = false;

                if (!insideTree)
                {
                    //if this isn't the tag we're looking for, don't bother
                    if (!nodeName.equals(ln))
                        continue main_loop;
                    insideTree = true;
                    thisIsRootNode = true;
                }

                Element newElem = createElementFromReader(doc, ln);
                if (thisIsRootNode)
                {
                    matchingTreeRoot = newElem;
                }

                // And then 'push' new element...
                current.appendChild(newElem);
                current = newElem;

                continue main_loop;
            }

            case XMLStreamConstants.START_DOCUMENT:
                /* This should only be received at the beginning of document.
                 * Let it pass: maybe some (broken) readers pass
                 * that info as first event in beginning of doc?
                 */
                continue main_loop;

            case XMLStreamConstants.DTD:
                /* !!! Note: StAX does not expose enough information about
                 *  doctype declaration (specifically, public and system id!);
                 *  (although StAX2 would...)
                 *
                 * Worse, DOM1/2 do not specify a way to create the DocType
                 * node, even if StAX provided it.
                 */
                continue main_loop;

            // Should never get these, from a stream reader:
            //   XMLStreamConstants.ATTRIBUTE, XMLStreamConstants.NAMESPACE
            // (the default branch catches them all)
            default:
                throw new XMLStreamException("Unrecognized iterator event type: "+_docStreamReader.getEventType()+"; should not receive such types (broken stream reader?)");
            }

            if (child != null && insideTree) {
                current.appendChild(child);
            }
        }
        return matchingTreeRoot;
    }

    /**
     * Creates an Element, with all of its attributes, for the START_ELEMENT
     * event the stream reader is currently positioned on.
     *
     * @param doc Document used to create the element and attribute nodes
     * @param localName local name of the current element
     * @return the new element; it is not yet attached to any parent
     */
    private Element createElementFromReader(Document doc, String localName)
    {
        Element newElem;
        if (mNsAware) {
            // DOM requires a qualified name whenever there is a prefix
            String elemPrefix = _docStreamReader.getPrefix();
            String elemName = (elemPrefix != null && elemPrefix.length() > 0)
                ? getQualified(elemPrefix, localName)
                : localName;
            newElem = doc.createElementNS(_docStreamReader.getNamespaceURI(), elemName);
        } else { // if non-ns-aware, things are simpler:
            newElem = doc.createElement(localName);
        }

        /* Namespace declarations are not copied; only the attributes are. */
        for (int i = 0, len = _docStreamReader.getAttributeCount(); i < len; ++i) {
            String attrName = _docStreamReader.getAttributeLocalName(i);
            if (mNsAware) {
                String attrPrefix = _docStreamReader.getAttributePrefix(i);
                if (attrPrefix != null && attrPrefix.length() > 0) {
                    attrName = getQualified(attrPrefix, attrName);
                }
                Attr attr = doc.createAttributeNS(_docStreamReader.getAttributeNamespace(i), attrName);
                attr.setValue(_docStreamReader.getAttributeValue(i));
                newElem.setAttributeNodeNS(attr);
            } else {
                Attr attr = doc.createAttribute(attrName);
                attr.setValue(_docStreamReader.getAttributeValue(i));
                newElem.setAttributeNode(attr);
            }
        }
        return newElem;
    }

    // // // Overridable helper methods:

    /**
     * Returns the qualified name <code>prefix:localName</code>, remembering the
     * most recent result so that an immediately repeated request does not build
     * a new String.
     *
     * @param prefix namespace prefix
     * @param localName local part of the name
     * @return the qualified name
     */
    protected String getQualified(String prefix, String localName)
    {
        /* This mostly/only helps with runs of identically named elements...
         * might make sense to do 'real' caching...
         */
        if (mLastQName != null
            && prefix != null && prefix.equals(mLastPrefix)
            && localName != null && localName.equals(mLastLocalName)) {
            return mLastQName;
        }
        String qn = prefix + ":" + localName;
        //remember the key along with the value, otherwise the cache can never hit
        mLastPrefix = prefix;
        mLastLocalName = localName;
        mLastQName = qn;
        return qn;
    }

    /**
     * Updates {@link #mNsAware} from the reader's
     * {@link XMLInputFactory#IS_NAMESPACE_AWARE} property.
     *
     * @param r the stream reader to query
     * @throws XMLStreamException declared for the benefit of overriding
     * implementations; this implementation does not throw it itself
     */
    protected void checkReaderSettings(XMLStreamReader r)
        throws XMLStreamException
    {
        Object o = r.getProperty(XMLInputFactory.IS_NAMESPACE_AWARE);
        /* StAX defaults to namespace aware, so anything other than an
         * explicit Boolean false (including a missing value) counts as aware.
         */
        mNsAware = !((o instanceof Boolean) && !((Boolean) o).booleanValue());
    }

    /**
     * Free up resources: the streamreader and, if this instance opened the
     * file itself, the underlying file reader. Failures while closing are
     * deliberately ignored, since there is nothing useful a caller could do
     * about them.
     */
    public void dispose()
    {
        try
        {
            if (_docStreamReader != null)
                _docStreamReader.close();
        }
        catch (Exception ignored)
        {
            //best effort: still try to close the file reader below
        }
        try
        {
            if (_fileReader != null)
                _fileReader.close();
        }
        catch (Exception ignored)
        {
            //best effort: nothing left to clean up
        }
    }
}