/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package groovy.util; import groovy.xml.FactorySupport; import groovy.xml.QName; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.DTDHandler; import org.xml.sax.EntityResolver; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.SAXNotSupportedException; import org.xml.sax.XMLReader; import javax.xml.XMLConstants; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** * A helper class for parsing XML into a tree of Node instances for a * simple way of processing XML. This parser does not preserve the XML * InfoSet - if that's what you need try using W3C DOM, dom4j, JDOM, XOM etc. * This parser ignores comments and processing instructions and converts * the XML into a Node for each element in the XML with attributes * and child Nodes and Strings. This simple model is sufficient for * most simple use cases of processing XML. * <p> * Example usage: * <pre class="groovyTestCase"> * def xml = '<root><one a1="uno!"/><two>Some text!</two></root>' * def rootNode = new XmlParser().parseText(xml) * assert rootNode.name() == 'root' * assert rootNode.one[0].@a1 == 'uno!' * assert rootNode.two.text() == 'Some text!' * rootNode.children().each { assert it.name() in ['one','two'] } * </pre> * * @author <a href="mailto:james@coredevelopers.net">James Strachan</a> * @author Paul King */ public class XmlParser implements ContentHandler { private StringBuilder bodyText = new StringBuilder(); private final List<Node> stack = new ArrayList<Node>(); private Locator locator; private final XMLReader reader; private Node parent; private boolean trimWhitespace = false; private boolean keepIgnorableWhitespace = false; private boolean namespaceAware; /** * Creates a non-validating and non-namespace-aware <code>XmlParser</code> which does not allow DOCTYPE declarations in documents. * * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created. * @throws SAXException for SAX errors. */ public XmlParser() throws ParserConfigurationException, SAXException { this(false, true); } /** * Creates a <code>XmlParser</code> which does not allow DOCTYPE declarations in documents. * * @param validating <code>true</code> if the parser should validate documents as they are parsed; false otherwise. * @param namespaceAware <code>true</code> if the parser should provide support for XML namespaces; <code>false</code> otherwise. * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created. * @throws SAXException for SAX errors. */ public XmlParser(boolean validating, boolean namespaceAware) throws ParserConfigurationException, SAXException { this(validating, namespaceAware, false); } /** * Creates a <code>XmlParser</code>. * * @param validating <code>true</code> if the parser should validate documents as they are parsed; false otherwise. * @param namespaceAware <code>true</code> if the parser should provide support for XML namespaces; <code>false</code> otherwise. * @param allowDocTypeDeclaration <code>true</code> if the parser should provide support for DOCTYPE declarations; <code>false</code> otherwise. * @throws ParserConfigurationException if no parser which satisfies the requested configuration can be created. * @throws SAXException for SAX errors. */ public XmlParser(boolean validating, boolean namespaceAware, boolean allowDocTypeDeclaration) throws ParserConfigurationException, SAXException { SAXParserFactory factory = FactorySupport.createSaxParserFactory(); factory.setNamespaceAware(namespaceAware); this.namespaceAware = namespaceAware; factory.setValidating(validating); setQuietly(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); setQuietly(factory, "http://apache.org/xml/features/disallow-doctype-decl", !allowDocTypeDeclaration); reader = factory.newSAXParser().getXMLReader(); } public XmlParser(XMLReader reader) { this.reader = reader; } public XmlParser(SAXParser parser) throws SAXException { reader = parser.getXMLReader(); } private static void setQuietly(SAXParserFactory factory, String feature, boolean value) { try { factory.setFeature(feature, value); } catch (ParserConfigurationException ignored) { } catch (SAXNotRecognizedException ignored) { } catch (SAXNotSupportedException ignored) { } } /** * Returns the current trim whitespace setting. * * @return true if whitespace will be trimmed */ public boolean isTrimWhitespace() { return trimWhitespace; } /** * Sets the trim whitespace setting value. * * @param trimWhitespace the desired setting value */ public void setTrimWhitespace(boolean trimWhitespace) { this.trimWhitespace = trimWhitespace; } /** * Returns the current keep ignorable whitespace setting. * * @return true if ignorable whitespace will be kept (default false) */ public boolean isKeepIgnorableWhitespace() { return keepIgnorableWhitespace; } /** * Sets the keep ignorable whitespace setting value. * * @param keepIgnorableWhitespace the desired new value */ public void setKeepIgnorableWhitespace(boolean keepIgnorableWhitespace) { this.keepIgnorableWhitespace = keepIgnorableWhitespace; } /** * Parses the content of the given file as XML turning it into a tree * of Nodes. * * @param file the File containing the XML to be parsed * @return the root node of the parsed tree of Nodes * @throws SAXException Any SAX exception, possibly * wrapping another exception. * @throws IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. */ public Node parse(File file) throws IOException, SAXException { InputSource input = new InputSource(new FileInputStream(file)); input.setSystemId("file://" + file.getAbsolutePath()); getXMLReader().parse(input); return parent; } /** * Parse the content of the specified input source into a tree of Nodes. * * @param input the InputSource for the XML to parse * @return the root node of the parsed tree of Nodes * @throws SAXException Any SAX exception, possibly * wrapping another exception. * @throws IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. */ public Node parse(InputSource input) throws IOException, SAXException { getXMLReader().parse(input); return parent; } /** * Parse the content of the specified input stream into a tree of Nodes. * <p> * Note that using this method will not provide the parser with any URI * for which to find DTDs etc * * @param input an InputStream containing the XML to be parsed * @return the root node of the parsed tree of Nodes * @throws SAXException Any SAX exception, possibly * wrapping another exception. * @throws IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. */ public Node parse(InputStream input) throws IOException, SAXException { InputSource is = new InputSource(input); getXMLReader().parse(is); return parent; } /** * Parse the content of the specified reader into a tree of Nodes. * <p> * Note that using this method will not provide the parser with any URI * for which to find DTDs etc * * @param in a Reader to read the XML to be parsed * @return the root node of the parsed tree of Nodes * @throws SAXException Any SAX exception, possibly * wrapping another exception. * @throws IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. */ public Node parse(Reader in) throws IOException, SAXException { InputSource is = new InputSource(in); getXMLReader().parse(is); return parent; } /** * Parse the content of the specified URI into a tree of Nodes. * * @param uri a String containing a uri pointing to the XML to be parsed * @return the root node of the parsed tree of Nodes * @throws SAXException Any SAX exception, possibly * wrapping another exception. * @throws IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. */ public Node parse(String uri) throws IOException, SAXException { InputSource is = new InputSource(uri); getXMLReader().parse(is); return parent; } /** * A helper method to parse the given text as XML. * * @param text the XML text to parse * @return the root node of the parsed tree of Nodes * @throws SAXException Any SAX exception, possibly * wrapping another exception. * @throws IOException An IO exception from the parser, * possibly from a byte stream or character stream * supplied by the application. */ public Node parseText(String text) throws IOException, SAXException { return parse(new StringReader(text)); } /** * Determine if namespace handling is enabled. * * @return true if namespace handling is enabled */ public boolean isNamespaceAware() { return namespaceAware; } /** * Enable and/or disable namespace handling. * * @param namespaceAware the new desired value */ public void setNamespaceAware(boolean namespaceAware) { this.namespaceAware = namespaceAware; } // Delegated XMLReader methods //------------------------------------------------------------------------ /* (non-Javadoc) * @see org.xml.sax.XMLReader#getDTDHandler() */ public DTDHandler getDTDHandler() { return this.reader.getDTDHandler(); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#getEntityResolver() */ public EntityResolver getEntityResolver() { return this.reader.getEntityResolver(); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#getErrorHandler() */ public ErrorHandler getErrorHandler() { return this.reader.getErrorHandler(); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#getFeature(java.lang.String) */ public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException { return this.reader.getFeature(uri); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#getProperty(java.lang.String) */ public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException { return this.reader.getProperty(uri); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler) */ public void setDTDHandler(final DTDHandler dtdHandler) { this.reader.setDTDHandler(dtdHandler); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver) */ public void setEntityResolver(final EntityResolver entityResolver) { this.reader.setEntityResolver(entityResolver); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) */ public void setErrorHandler(final ErrorHandler errorHandler) { this.reader.setErrorHandler(errorHandler); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean) */ public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException { this.reader.setFeature(uri, value); } /* (non-Javadoc) * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object) */ public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException { reader.setProperty(uri, value); } // ContentHandler interface //------------------------------------------------------------------------- public void startDocument() throws SAXException { parent = null; } public void endDocument() throws SAXException { stack.clear(); } public void startElement(String namespaceURI, String localName, String qName, Attributes list) throws SAXException { addTextToNode(); Object nodeName = getElementName(namespaceURI, localName, qName); int size = list.getLength(); Map<Object, String> attributes = new LinkedHashMap<Object, String>(size); for (int i = 0; i < size; i++) { Object attributeName = getElementName(list.getURI(i), list.getLocalName(i), list.getQName(i)); String value = list.getValue(i); attributes.put(attributeName, value); } parent = createNode(parent, nodeName, attributes); stack.add(parent); } public void endElement(String namespaceURI, String localName, String qName) throws SAXException { addTextToNode(); if (!stack.isEmpty()) { stack.remove(stack.size() - 1); if (!stack.isEmpty()) { parent = stack.get(stack.size() - 1); } } } public void characters(char buffer[], int start, int length) throws SAXException { bodyText.append(buffer, start, length); } public void startPrefixMapping(String prefix, String namespaceURI) throws SAXException { } public void endPrefixMapping(String prefix) throws SAXException { } public void ignorableWhitespace(char buffer[], int start, int len) throws SAXException { if (keepIgnorableWhitespace) characters(buffer, start, len); } public void processingInstruction(String target, String data) throws SAXException { } public Locator getDocumentLocator() { return locator; } public void setDocumentLocator(Locator locator) { this.locator = locator; } public void skippedEntity(String name) throws SAXException { } // Implementation methods //------------------------------------------------------------------------- protected XMLReader getXMLReader() { reader.setContentHandler(this); return reader; } protected void addTextToNode() { if (parent == null) { // TODO store this on root node? reset bodyText? return; } String text = bodyText.toString(); if (!trimWhitespace && keepIgnorableWhitespace) { parent.children().add(text); } else if (!trimWhitespace && text.trim().length() > 0) { parent.children().add(text); } else if (text.trim().length() > 0) { parent.children().add(text.trim()); } bodyText = new StringBuilder(); } /** * Creates a new node with the given parent, name, and attributes. The * default implementation returns an instance of * <code>groovy.util.Node</code>. * * @param parent the parent node, or null if the node being created is the * root node * @param name an Object representing the name of the node (typically * an instance of {@link QName}) * @param attributes a Map of attribute names to attribute values * @return a new Node instance representing the current node */ protected Node createNode(Node parent, Object name, Map attributes) { return new Node(parent, name, attributes); } /** * Return a name given the namespaceURI, localName and qName. * * @param namespaceURI the namespace URI * @param localName the local name * @param qName the qualified name * @return the newly created representation of the name */ protected Object getElementName(String namespaceURI, String localName, String qName) { String name = localName; String prefix = ""; if ((name == null) || (name.length() < 1)) { name = qName; } if (namespaceURI == null || namespaceURI.length() <= 0) { return name; } if (qName != null && qName.length() > 0 && namespaceAware) { int index = qName.lastIndexOf(":"); if (index > 0) { prefix = qName.substring(0, index); } } return new QName(namespaceURI, name, prefix); } }