package com.cloudhopper.commons.xml; /* * #%L * ch-commons-xbean * %% * Copyright (C) 2012 Cloudhopper by Twitter * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ // java imports import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; // third party imports public class XmlParser { private static final Logger logger = LoggerFactory.getLogger(XmlParser.class); //private Map _redirectMap = new HashMap(); private SAXParser _parser; private boolean trimText; private ArrayList<XPath> includeXPaths; private ArrayList<XPath> excludeXPaths; //private Map _observerMap; //private Stack _observers = new Stack(); //private String _xpath; //private Object _xpaths; //private String _dtd; public XmlParser() { SAXParserFactory factory = SAXParserFactory.newInstance(); try { _parser = factory.newSAXParser(); //_parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", validating); _parser.getXMLReader().setFeature("http://xml.org/sax/features/namespaces", true); _parser.getXMLReader().setFeature("http://xml.org/sax/features/namespace-prefixes", false); } catch (Exception e) { throw new Error(e.toString(), e); } // by default, trim "text" values of whitespace to leave them empty trimText = true; // by default empty include and exclude xpaths includeXPaths = new ArrayList<XPath>(); excludeXPaths = new ArrayList<XPath>(); //boolean validating_dft = factory.getClass().toString().startsWith("org.apache.xerces."); //String validating_prop = System.getProperty("org.mortbay.xml.XmlParser.Validating", validating_dft ? "true" : "false"); //boolean validating = Boolean.valueOf(validating_prop).booleanValue(); //setValidating(validating); } public void setTrimText(boolean value) { this.trimText = value; } public boolean getTrimText() { return this.trimText; } /** public XmlParser(boolean validating) { setValidating(validating); } */ /* ------------------------------------------------------------ */ /** public void setValidating(boolean validating) { try { SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(validating); _parser = factory.newSAXParser(); try { if (validating) _parser.getXMLReader().setFeature("http://apache.org/xml/features/validation/schema", validating); } catch (Exception e) { if (validating) logger.warn("Schema validation may not be supported: ", e); //else //logger.ignore(e); } _parser.getXMLReader().setFeature("http://xml.org/sax/features/validation", validating); _parser.getXMLReader().setFeature("http://xml.org/sax/features/namespaces", true); _parser.getXMLReader().setFeature("http://xml.org/sax/features/namespace-prefixes", false); } catch (Exception e) { logger.warn(e); throw new Error(e.toString()); } } */ /* ------------------------------------------------------------ */ /** * @param name * @param entity */ /** public synchronized void redirectEntity(String name, URL entity) { if (entity != null) _redirectMap.put(name, entity); } */ /** * Adds an include XPath to filter nodes that are returned while parsing * the XML document. For example, adding "/configuration/testA" means that * only nodes "configuration", "testA", and any of its children will be * returned as nodes. All other nodes will essentially be "hidden" from * the returned DOM tree. * @param xpath The XPath filter to include during parsing such as "/configuration/testA" */ public void addIncludeXPath(String xpath) { // FIXME: is this a valid xpath? // FIXME: is this xpath already added? includeXPaths.add(XPath.parse(xpath)); } /** * Adds an exclude XPath to filter nodes that are returned while parsing * the XML document. For example, adding "/configuration/testA" means that * node "/configuration/testA" and any of its children will be excluded * as nodes returned while parsing. All other nodes will be returned, but * this one will essentially be "hidden". * @param xpath The XPath filter to exclude during parsing such as "/configuration/testA" */ public void addExcludeXPath(String xpath) { // FIXME: is this a valid xpath? // FIXME: is this xpath already added? excludeXPaths.add(XPath.parse(xpath)); } /* ------------------------------------------------------------ */ /** public String getDTD() { return _dtd; } */ /* ------------------------------------------------------------ */ /** * Add a ContentHandler. Add an additional _content handler that is triggered on a tag name. SAX * events are passed to the ContentHandler provided from a matching start element to the * corresponding end element. Only a single _content handler can be registered against each tag. * * @param trigger Tag local or q name. * @param observer SAX ContentHandler */ /** public synchronized void addContentHandler(String trigger, ContentHandler observer) { if (_observerMap == null) _observerMap = new HashMap(); _observerMap.put(trigger, observer); } */ /* ------------------------------------------------------------ */ public synchronized Node parse(InputSource source) throws IOException, SAXException { // _dtd=null; Handler handler = new Handler(); XMLReader reader = _parser.getXMLReader(); reader.setContentHandler(handler); reader.setErrorHandler(handler); reader.setEntityResolver(handler); if (logger.isDebugEnabled()) logger.debug("parsing: sid=" + source.getSystemId() + ",pid=" + source.getPublicId()); _parser.parse(source, handler); if (handler.error != null) throw handler.error; Node root = (Node)handler.root; handler.reset(); return root; } /** * Parse XML from a String. */ public synchronized Node parse(String xml) throws IOException, SAXException { ByteArrayInputStream is = new ByteArrayInputStream(xml.getBytes()); return parse(is); } /** * Parse XML from File. */ public synchronized Node parse(File file) throws IOException, SAXException { return parse(new InputSource(file.toURI().toURL().toString())); } /** * Parse XML from InputStream. */ public synchronized Node parse(InputStream in) throws IOException, SAXException { //_dtd=null; Handler handler = new Handler(); XMLReader reader = _parser.getXMLReader(); reader.setContentHandler(handler); reader.setErrorHandler(handler); reader.setEntityResolver(handler); _parser.parse(new InputSource(in), handler); if (handler.error != null) throw handler.error; Node root = (Node)handler.root; handler.reset(); return root; } private class Handler extends DefaultHandler { Node root = null; SAXParseException error; private Node context; private int depth; private boolean noop; private int noopDepth; Handler() { reset(); } void reset() { root = null; error = null; context = null; depth = -1; noop = false; noopDepth = -1; } @Override public void processingInstruction( String target, String value ) { //logger.debug("processing instr!"); } @Override public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException { // figure out tag String tag = (uri == null || uri.equals("")) ? qName : localName; // debug //logger.trace("startElement: tag=" + tag); // always increment our depth depth++; // // if NOOP, then skip // if (noop) { //logger.trace("in noop mode, skipping this element"); return; } // create the current node we're parsing Node node = new Node(tag, attrs); // always set the new node's parent to the current context // if this happens to be the root node, then it'll be null node.setParent(context); // debug //logger.trace("node: " + tag + ", path: " + node.getPath()); // // is this node in our excludeXPath? // if (excludeXPaths != null && excludeXPaths.size() > 0) { String path = node.getPath(); boolean match = false; for (int i = 0; !match && i < excludeXPaths.size(); i++) { XPath xpath = excludeXPaths.get(i); match = xpath.matches(path, false); } // if there was a match, then we want to exclude this node and any children nodes if (match) { //logger.debug("turning on noop mode (due to exclusion match)"); noop = true; noopDepth = depth; return; } } // // is this node in our includeXPath list? // if (includeXPaths != null && includeXPaths.size() > 0) { String path = node.getPath(); boolean match = false; for (int i = 0; !match && i < includeXPaths.size(); i++) { XPath xpath = includeXPaths.get(i); match = xpath.matches(path); } // if no match, then we do NOT want to include this node if (!match) { //logger.debug("turning on noop mode (due to NO inclusion match)"); noop = true; noopDepth = depth; return; } } // is this the first node (root)? if (depth == 0) { root = node; } else { // add this node as a child to the current context context.addChild(node); } // we're done, so set the current context to the current node context = node; } @Override public void endElement(String uri, String localName, String qName) throws SAXException { // always decrement our depth depth--; // // check if we should turn off noop // if (noop) { if (depth < noopDepth) { //logger.trace("turning off noop mode"); noop = false; } else { //logger.trace("skipping end of element since in noop"); } return; } // reset the context to this context's parent context = context.getParent(); } @Override public void ignorableWhitespace(char buf[], int offset, int len) throws SAXException { // do nothing //logger.debug("ignorable whitespace included!"); } @Override public void characters(char buf[], int offset, int len) throws SAXException { // // if NOOP, then skip // if (noop) { //logger.trace("in noop mode, skipping characters"); return; } if (buf == null || buf.length <= 0) { // do nothing return; } // convert to string String text = new String(buf, offset, len); // check if "trim" feature is turned on if (trimText) { text = text.trim(); } // now, only set a text value if its not empty if (text != null && !text.isEmpty()) { context.appendText(text); } } @Override public void warning(SAXParseException ex) { logger.warn("WARNING @ " + getLocationString(ex) + " : " + ex.toString(), ex); } @Override public void error(SAXParseException ex) throws SAXException { // Save error and continue to report other errors if (error == null) error = ex; logger.error("ERROR @ " + getLocationString(ex) + " : " + ex.toString(), ex); } @Override public void fatalError(SAXParseException ex) throws SAXException { error = ex; logger.error("FATAL @ " + getLocationString(ex) + " : " + ex.toString(), ex); throw ex; } private String getLocationString(SAXParseException ex) { return ex.getSystemId() + " line:" + ex.getLineNumber() + " col:" + ex.getColumnNumber(); } @Override public InputSource resolveEntity(String pid, String sid) { //logger.debug("resolveEntity(" + pid + ", " + sid + ")"); /** if (logger.isDebugEnabled()) logger.debug("resolveEntity(" + pid + ", " + sid + ")"); if (sid!=null && sid.endsWith(".dtd")) _dtd=sid; URL entity = null; if (pid != null) entity = (URL) _redirectMap.get(pid); if (entity == null) entity = (URL) _redirectMap.get(sid); if (entity == null) { String dtd = sid; if (dtd.lastIndexOf('/') >= 0) dtd = dtd.substring(dtd.lastIndexOf('/') + 1); if (logger.isDebugEnabled()) logger.debug("Can't exact match entity in redirect map, trying " + dtd); entity = (URL) _redirectMap.get(dtd); } if (entity != null) { try { InputStream in = entity.openStream(); if (logger.isDebugEnabled()) logger.debug("Redirected entity " + sid + " --> " + entity); InputSource is = new InputSource(in); is.setSystemId(sid); return is; } catch (IOException e) { logger.warn(e); } } */ return null; } } /** * XML Attribute. */ public static class Attribute { private String _name; private String _value; Attribute(String n, String v) { _name = n; _value = v; } public String getName() { return _name; } public String getValue() { return _value; } } /** * XML Node. Represents an XML element with optional attributes and ordered content. */ public static class Node { private String tag; private Attribute[] attrs; private StringBuilder tempTextBuffer; private String text; Node parent; private ArrayList<Node> children; private String path; //private boolean _lastString = false; Node (String tag, Attributes attrs) { this.tag = tag; if (attrs == null) { // make it a zero array this.attrs = new Attribute[0]; } else { this.attrs = new Attribute[attrs.getLength()]; for (int i = 0; i < attrs.getLength(); i++) { String name = attrs.getLocalName(i); if (name == null || name.equals("")) name = attrs.getQName(i); this.attrs[i] = new Attribute(name, attrs.getValue(i)); } } } /** * Tests if this node contains a "text" value. If the XmlParser's "trimText" * feature is turned on, then whitespace characters will be trimmed and * this text value will be null (not set). * @return True if a text value exists, otherwise false. */ public boolean hasText() { return (this.tempTextBuffer != null || this.text != null); } /** * Clears the temporary buffer and sets the text. * @param text */ protected void setText(String text) { // reset text buffer to empty this.tempTextBuffer = null; // set the text this.text = text; } /** * Helper method for appending text since this creates a temporary * StringBuilder buffer while we keep appending text. This buffer will * be cleared the first time getText() is called. * @param text */ protected void appendText(String text) { // if the text buffer is null, create a new temporary instance if (this.tempTextBuffer == null) { this.tempTextBuffer = new StringBuilder(); // if we still have text, add it to the buffer if (this.text != null) { this.tempTextBuffer.append(this.text); } } // now append the text this.tempTextBuffer.append(text); } public String getText() { // if the text buffer isn't null, then we need to cache our full text if (this.tempTextBuffer != null) { this.text = this.tempTextBuffer.toString(); // clear the buffer this.tempTextBuffer = null; } return this.text; } public String getTag() { return this.tag; } /** * Lazily gets path of the node such as "/root/nodeA/subNodeB". The path * is only generated on the first call to this method, then its permanently * stored in this node for quick lookup. */ public String getPath() { if (path == null) { if (getParent() != null && getParent().getTag() != null) path = getParent().getPath() + "/" + getTag(); else path = "/" + getTag(); } return path; } public boolean hasParent() { return (this.parent != null); } protected void setParent(Node parent) { this.parent = parent; } public Node getParent() { return this.parent; } protected void addChild(Node child) { if (this.children == null) { this.children = new ArrayList<Node>(); } this.children.add(child); } public int getChildrenSize() { if (this.children == null) { return 0; } else { return this.children.size(); } } public boolean hasChildren() { return (this.children != null && this.children.size() > 0); } /** * Returns list of children or null if none exist. */ public ArrayList<Node> getChildren() { return this.children; } public Node getChild(int index) throws IndexOutOfBoundsException { return this.children.get(index); } /** * Get the first child node with the tag. * @param tag Tag of element * @return Node or null. */ public Node getChild(String tag) { if (this.children != null) { for (Node node : this.children) { if (node.getTag().equals(tag)) { return node; } } } return null; } public boolean hasAttributes() { return (this.attrs != null && this.attrs.length > 0); } /** * Get an array of element attributes or null if none exist */ public Attribute[] getAttributes() { return this.attrs; } /** * Get the first attribute with the name. * @param name Name of attribute * @return Attribute or null. */ public Attribute getAttribute(String name) { if (this.attrs != null) { for (Attribute attr : this.attrs) { if (attr.getName().equals(name)) { return attr; } } } return null; } /* ------------------------------------------------------------ */ /** public void clear() { if (_list != null) _list.clear(); _list = null; } */ /* ------------------------------------------------------------ */ /** * Get a tag as a string. * * @param tag The tag to get * @param tags IF true, tags are included in the value. * @param trim If true, trim the value. * @return results of get(tag).toString(tags). */ /** public String getString(String tag, boolean tags, boolean trim) { Node node = get(tag); if (node == null) return null; String s = node.toString(tags); if (s != null && trim) s = s.trim(); return s; } */ /** public synchronized String toString() { return toString(true); } public synchronized String toString(boolean tag) { StringBuilder buf = new StringBuilder(); toString(buf, tag); return buf.toString(); } public synchronized String toString(boolean tag, boolean trim) { String s = toString(tag); if (s != null && trim) s = s.trim(); return s; } private synchronized void toString(StringBuilder buf, boolean tag) { if (tag) { buf.append("<"); buf.append(_tag); if (_attrs != null) { for (int i = 0; i < _attrs.length; i++) { buf.append(' '); buf.append(_attrs[i].getName()); buf.append("=\""); buf.append(_attrs[i].getValue()); buf.append("\""); } } } if (_list != null) { if (tag) buf.append(">"); for (int i = 0; i < _list.size(); i++) { Object o = _list.get(i); if (o == null) continue; if (o instanceof Node) ((Node) o).toString(buf, tag); else buf.append(o.toString()); } if (tag) { buf.append("</"); buf.append(_tag); buf.append(">"); } } else if (tag) buf.append("/>"); } */ /* ------------------------------------------------------------ */ /** * Iterator over named child nodes. * * @param tag The tag of the nodes. * @return Iterator over all child nodes with the specified tag. */ /** public Iterator iterator(final String tag) { return new Iterator() { int c = 0; Node _node; * public boolean hasNext() { if (_node != null) return true; while (_list != null && c < _list.size()) { Object o = _list.get(c); if (o instanceof Node) { Node n = (Node) o; if (tag.equals(n._tag)) { _node = n; return true; } } c++; } return false; } public Object next() { try { if (hasNext()) return _node; throw new NoSuchElementException(); } finally { _node = null; c++; } } public void remove() { throw new UnsupportedOperationException("Not supported"); } }; } */ } }