/** * Copyright (C) 2010 Orbeon, Inc. * * This program is free software; you can redistribute it and/or modify it under the terms of the * GNU Lesser General Public License as published by the Free Software Foundation; either version * 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details. * * The full text of the license is available at http://www.gnu.org/copyleft/lesser.html */ package org.orbeon.oxf.xml.dom4j; import org.orbeon.dom.*; import org.orbeon.dom.io.*; import org.orbeon.oxf.common.OXFException; import org.orbeon.oxf.pipeline.api.TransformerXMLReceiver; import org.orbeon.oxf.processor.generator.DOMGenerator; import org.orbeon.oxf.resources.URLFactory; import org.orbeon.oxf.util.StringUtils; import org.orbeon.oxf.util.StringBuilderWriter; import org.orbeon.oxf.xml.*; import org.orbeon.oxf.xml.XMLUtils; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.AttributesImpl; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.io.StringReader; import java.net.URL; import java.util.*; // TODO: move this to Scala/remove unneeded stuff public class Dom4jUtils { /** * 03/30/2005 d : Currently DOM4J doesn't really support read only documents. ( No real * checks in place. If/when DOM4J adds real support then NULL_DOCUMENT should be made a * read only document. */ public static final Document NULL_DOCUMENT; static { NULL_DOCUMENT = DocumentFactory.createDocument(); Element nullElement = DocumentFactory.createElement("null"); nullElement.addAttribute(XMLConstants.XSI_NIL_QNAME, "true"); NULL_DOCUMENT.setRootElement(nullElement); } private static SAXReader createSAXReader(XMLParsing.ParserConfiguration parserConfiguration) throws SAXException { return new SAXReader(XMLParsing.newXMLReader(parserConfiguration)); } private static SAXReader createSAXReader() throws SAXException { return createSAXReader(XMLParsing.ParserConfiguration.XINCLUDE_ONLY); } /** * Convert a dom4j document to a string. * * @param document document to convert * @return resulting string */ public static String domToString(final Document document) { final Element rootElement = document.getRootElement(); return domToString((Branch) rootElement); } /** * Convert a dom4j element to a string. * * @param element element to convert * @return resulting string */ public static String domToString(final Element element) { return domToString((Branch) element); } /** * Convert a dom4j node to a string. * * @param node node to convert * @return resulting string */ public static String nodeToString(final Node node) { final String ret; if (node instanceof Document) { ret = domToString((Branch) ((Document) node).getRootElement()); } else if (node instanceof Element) { ret = domToString((Branch) node); } else if (node instanceof Text) { ret = node.getText(); } else { ret = domToString(node, null); } return ret; } /** * Convert an XML string to a prettified XML string. */ public static String prettyfy(String xmlString) { try { return domToPrettyString(readDom4j(xmlString)); } catch (Exception e) { throw new OXFException(e); } } /** * Convert a dom4j document to a pretty string, for formatting/debugging purposes only. * * @param document document to convert * @return resulting string */ public static String domToPrettyString(final Document document) { return domToPrettyString(document.getRootElement()); } public static String domToPrettyString(final Element element) { return domToString(element, OutputFormat.apply(true, true, true)); } /** * Convert a dom4j document to a compact string, with all text being trimmed. * * @param document document to convert * @return resulting string */ public static String domToCompactString(final Document document) { return domToString(document.getRootElement(), OutputFormat.apply(false, false, true)); } private static String domToString(final Branch branch) { return domToString(branch, OutputFormat.apply(false, false, false)); } private static String domToString(final Node node, final OutputFormat format) { final StringBuilderWriter writer = new StringBuilderWriter(); final XMLWriter xmlWriter = new XMLWriter(writer, format == null ? XMLWriter.DefaultFormat() : format); xmlWriter.write(node); writer.close(); return writer.toString(); } /** * Read a document from a URL. * * @param urlString URL * @param parserConfiguration parser configuration * @return document */ public static Document readFromURL(String urlString, XMLParsing.ParserConfiguration parserConfiguration) { InputStream is = null; try { final URL url = URLFactory.createURL(urlString); is = url.openStream(); return readDom4j(is, urlString, parserConfiguration); } catch (Exception e) { throw new OXFException(e); } finally { if (is != null) { try { is.close(); } catch (IOException e) { throw new OXFException("Exception while closing stream", e); } } } } public static Document readDom4j(Reader reader) throws SAXException, DocumentException { return createSAXReader().read(reader); } public static Document readDom4j(Reader reader, String uri) throws SAXException, DocumentException { return createSAXReader().read(reader, uri); } /* * Replacement for DocumentHelper.parseText. DocumentHelper.parseText is not used since it creates work for GC * (because it relies on JAXP). */ public static Document readDom4j(String xmlString, XMLParsing.ParserConfiguration parserConfiguration) throws SAXException, DocumentException { final StringReader stringReader = new StringReader(xmlString); return createSAXReader(parserConfiguration).read(stringReader); } public static Document readDom4j(String xmlString) throws SAXException, DocumentException { return readDom4j(xmlString, XMLParsing.ParserConfiguration.PLAIN); } public static Document readDom4j(InputStream inputStream, String uri, XMLParsing.ParserConfiguration parserConfiguration) throws SAXException, DocumentException { return createSAXReader(parserConfiguration).read(inputStream, uri); } public static Document readDom4j(InputStream inputStream) throws SAXException, DocumentException { return createSAXReader(XMLParsing.ParserConfiguration.PLAIN).read(inputStream); } public static String makeSystemId(final Element e) { final LocationData ld = (LocationData) e.getData(); final String ldSid = ld == null ? null : ld.file(); return ldSid == null ? DOMGenerator.DefaultContext : ldSid; } /** * Go over the Node and its children and make sure that there are no two contiguous text nodes so as to ensure that * XPath expressions run correctly. As per XPath 1.0 (http://www.w3.org/TR/xpath): * * "As much character data as possible is grouped into each text node: a text node never has an immediately * following or preceding sibling that is a text node." * * dom4j Text and CDATA nodes are combined together. * * @param nodeToNormalize Node hierarchy to normalize * @return the input node, normalized */ public static Node normalizeTextNodes(Node nodeToNormalize) { final List<Node> nodesToDetach = new ArrayList<Node>(); nodeToNormalize.accept(new VisitorSupport() { public void visit(Element element) { final List children = element.content(); Node previousNode = null; StringBuilder sb = null; for (Iterator i = children.iterator(); i.hasNext();) { final Node currentNode = (Node) i.next(); if (previousNode != null) { if (previousNode instanceof Text && currentNode instanceof Text) { final Text previousNodeText = (Text) previousNode; if (sb == null) sb = new StringBuilder(previousNodeText.getText()); sb.append(currentNode.getText()); nodesToDetach.add(currentNode); } else if (previousNode instanceof Text) { // Update node if needed if (sb != null) { previousNode.setText(sb.toString()); } previousNode = currentNode; sb = null; } else { previousNode = currentNode; sb = null; } } else { previousNode = currentNode; sb = null; } } // Update node if needed if (previousNode != null && sb != null) { previousNode.setText(sb.toString()); } } }); // Detach nodes only in the end so as to not confuse the acceptor above for (final Node currentNode: nodesToDetach) { currentNode.detach(); } return nodeToNormalize; } public static DocumentSource getDocumentSource(final Document d) { /* * Saxon's error handler is expensive for the service it provides so we just use our * singleton instead. * * Wrt expensive, delta in heap dump info below is amount of bytes allocated during the * handling of a single request to '/' in the examples app. i.e. The trace below was * responsible for creating 200k of garbage during the handing of a single request to '/'. * * delta: 213408 live: 853632 alloc: 4497984 trace: 380739 class: byte[] * * TRACE 380739: * java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:39) * java.nio.ByteBuffer.allocate(ByteBuffer.java:312) * sun.nio.cs.StreamEncoder$CharsetSE.<init>(StreamEncoder.java:310) * sun.nio.cs.StreamEncoder$CharsetSE.<init>(StreamEncoder.java:290) * sun.nio.cs.StreamEncoder$CharsetSE.<init>(StreamEncoder.java:274) * sun.nio.cs.StreamEncoder.forOutputStreamWriter(StreamEncoder.java:69) * java.io.OutputStreamWriter.<init>(OutputStreamWriter.java:93) * java.io.PrintWriter.<init>(PrintWriter.java:109) * java.io.PrintWriter.<init>(PrintWriter.java:92) * org.orbeon.saxon.StandardErrorHandler.<init>(StandardErrorHandler.java:22) * org.orbeon.saxon.event.Sender.sendSAXSource(Sender.java:165) * org.orbeon.saxon.event.Sender.send(Sender.java:94) * org.orbeon.saxon.IdentityTransformer.transform(IdentityTransformer.java:31) * org.orbeon.oxf.xml.XMLUtils.getDigest(XMLUtils.java:453) * org.orbeon.oxf.xml.XMLUtils.getDigest(XMLUtils.java:423) * org.orbeon.oxf.processor.generator.DOMGenerator.<init>(DOMGenerator.java:93) * * Before mod * * 1.4.2_06-b03 P4 2.6 Ghz / 50 th tc 4.1.30 10510 ms ( 150 mb ), 7124 ( 512 mb ) 2.131312472239924 ( 150 mb ), 1.7474380872589803 ( 512 mb ) * * after mod * * 1.4.2_06-b03 P4 2.6 Ghz / 50 th tc 4.1.30 9154 ms ( 150 mb ), 6949 ( 512 mb ) 1.7316203642295738 ( 150 mb ), 1.479365288194895 ( 512 mb ) * */ final LocationDocumentSource lds = new LocationDocumentSource(d); final XMLReader rdr = lds.getXMLReader(); rdr.setErrorHandler(XMLParsing.ERROR_HANDLER); return lds; } public static byte[] getDigest(Document document) { final DocumentSource ds = getDocumentSource(document); return DigestContentHandler.getDigest(ds); } /** * Clean-up namespaces. Some tools generate namespace "un-declarations" or * the form xmlns:abc="". While this is needed to keep the XML infoset * correct, it is illegal to generate such declarations in XML 1.0 (but it * is legal in XML 1.1). Technically, this cleanup is incorrect at the DOM * and SAX level, so this should be used only in rare occasions, when * serializing certain documents to XML 1.0. */ public static Document adjustNamespaces(Document document, boolean xml11) { if (xml11) return document; final LocationSAXWriter writer = new LocationSAXWriter(); final LocationSAXContentHandler ch = new LocationSAXContentHandler(); writer.setContentHandler(new NamespaceCleanupXMLReceiver(ch, xml11)); writer.write(document); return ch.getDocument(); } /** * Return a Map of namespaces in scope on the given element. */ public static Map<String, String> getNamespaceContext(Element element) { final Map<String, String> namespaces = new HashMap<String, String>(); for (Element currentNode = element; currentNode != null; currentNode = currentNode.getParent()) { final List currentNamespaces = currentNode.declaredNamespaces(); for (Iterator j = currentNamespaces.iterator(); j.hasNext();) { final Namespace namespace = (Namespace) j.next(); if (!namespaces.containsKey(namespace.prefix())) { namespaces.put(namespace.prefix(), namespace.uri()); // TODO: Intern namespace strings to save memory; should use NamePool later // namespaces.put(namespace.getPrefix().intern(), namespace.getURI().intern()); } } } // It seems that by default this may not be declared. However, it should be: "The prefix xml is by definition // bound to the namespace name http://www.w3.org/XML/1998/namespace. It MAY, but need not, be declared, and MUST // NOT be bound to any other namespace name. Other prefixes MUST NOT be bound to this namespace name, and it // MUST NOT be declared as the default namespace." namespaces.put(XMLConstants.XML_PREFIX, XMLConstants.XML_URI); return namespaces; } /** * Return a Map of namespaces in scope on the given element, without the default namespace. */ public static Map<String, String> getNamespaceContextNoDefault(Element element) { final Map<String, String> namespaces = getNamespaceContext(element); namespaces.remove(""); return namespaces; } /** * Extract a QName from an Element and an attribute name. The prefix of the QName must be in * scope. Return null if the attribute is not found. */ public static QName extractAttributeValueQName(Element element, String attributeName) { return extractTextValueQName(element, element.attributeValue(attributeName), true); } /** * Extract a QName from an Element and an attribute QName. The prefix of the QName must be in * scope. Return null if the attribute is not found. */ public static QName extractAttributeValueQName(Element element, QName attributeQName) { return extractTextValueQName(element, element.attributeValue(attributeQName), true); } public static QName extractAttributeValueQName(Element element, QName attributeQName, boolean unprefixedIsNoNamespace) { return extractTextValueQName(element, element.attributeValue(attributeQName), unprefixedIsNoNamespace); } /** * Extract a QName from an Element's string value. The prefix of the QName must be in scope. * Return null if the text is empty. */ public static QName extractTextValueQName(Element element, boolean unprefixedIsNoNamespace) { return extractTextValueQName(element, element.getStringValue(), unprefixedIsNoNamespace); } /** * Extract a QName from an Element's string value. The prefix of the QName must be in scope. * Return null if the text is empty. * * @param element Element containing the attribute * @param qNameString QName to analyze * @param unprefixedIsNoNamespace if true, an unprefixed value is in no namespace; if false, it is in the default namespace * @return a QName object or null if not found */ public static QName extractTextValueQName(Element element, String qNameString, boolean unprefixedIsNoNamespace) { return extractTextValueQName(getNamespaceContext(element), qNameString, unprefixedIsNoNamespace); } /** * Extract a QName from a string value, given namespace mappings. Return null if the text is empty. * * @param namespaces prefix -> URI mappings * @param qNameString QName to analyze * @param unprefixedIsNoNamespace if true, an unprefixed value is in no namespace; if false, it is in the default namespace * @return a QName object or null if not found */ public static QName extractTextValueQName(Map<String, String> namespaces, String qNameString, boolean unprefixedIsNoNamespace) { if (qNameString == null) return null; qNameString = StringUtils.trimAllToEmpty(qNameString); if (qNameString.length() == 0) return null; final int colonIndex = qNameString.indexOf(':'); final String prefix; final String localName; final String namespaceURI; if (colonIndex == -1) { prefix = ""; localName = qNameString; if (unprefixedIsNoNamespace) { namespaceURI = ""; } else { final String nsURI = namespaces.get(prefix); namespaceURI = nsURI == null ? "" : nsURI; } } else if (colonIndex == 0) { throw new OXFException("Empty prefix for QName: " + qNameString); } else { prefix = qNameString.substring(0, colonIndex); localName = qNameString.substring(colonIndex + 1); namespaceURI = namespaces.get(prefix); if (namespaceURI == null) { throw new OXFException("No namespace declaration found for prefix: " + prefix); } } return QName.get(localName, Namespace$.MODULE$.apply(prefix, namespaceURI)); } /** * Decode a String containing an exploded QName (also known as a "Clark name") into a QName. */ public static QName explodedQNameToQName(String qName) { int openIndex = qName.indexOf("{"); if (openIndex == -1) return QName.get(qName); String namespaceURI = qName.substring(openIndex + 1, qName.indexOf("}")); String localName = qName.substring(qName.indexOf("}") + 1); return QName.get(localName, Namespace$.MODULE$.apply("p1", namespaceURI)); } // TODO ORBEON: remove uses, just use DocumentFactory /** * Create a copy of a dom4j Node. * * @param source source Node * @return copy of Node */ public static Node createCopy(Node source) { return (source instanceof Element) ? ((Element) source).createCopy() : (Node) source.clone(); } /** * Return a new document with a copy of newRoot as its root. */ public static Document createDocumentCopyElement(final Element newRoot) { return DocumentFactory.createDocument(newRoot.createCopy()); } /** * Return a new document with all parent namespaces copied to the new root element, assuming they are not already * declared on the new root element. The element passed is deep copied. * * @param newRoot element which must become the new root element of the document * @return new document */ public static Document createDocumentCopyParentNamespaces(final Element newRoot) { return createDocumentCopyParentNamespaces(newRoot, false); } /** * Return a new document with all parent namespaces copied to the new root element, assuming they are not already * declared on the new root element. * * @param newRoot element which must become the new root element of the document * @param detach if true the element is detached, otherwise it is deep copied * @return new document */ public static Document createDocumentCopyParentNamespaces(final Element newRoot, boolean detach) { final Element parentElement = newRoot.getParent(); final Document document; { if (detach) { // Detach document = DocumentFactory.createDocument(); document.setRootElement((Element) newRoot.detach()); } else { // Copy document = createDocumentCopyElement(newRoot); } } copyMissingNamespaces(parentElement, document.getRootElement()); return document; } public static void copyMissingNamespaces(Element sourceElement, Element destinationElement) { final Map<String, String> parentNamespaceContext = Dom4jUtils.getNamespaceContext(sourceElement); final Map<String, String> rootElementNamespaceContext = Dom4jUtils.getNamespaceContext(destinationElement); for (final String prefix: parentNamespaceContext.keySet()) { // NOTE: Don't use rootElement.getNamespaceForPrefix() because that will return the element prefix's // namespace even if there are no namespace nodes if (rootElementNamespaceContext.get(prefix) == null) { final String uri = parentNamespaceContext.get(prefix); destinationElement.addNamespace(prefix, uri); } } } /** * Return a new document with a copy of newRoot as its root and all parent namespaces copied to the new root * element, except those with the prefixes appearing in the Map, assuming they are not already declared on the new * root element. */ public static Document createDocumentCopyParentNamespaces(final Element newRoot, Set<String> prefixesToFilter) { final Document document = Dom4jUtils.createDocumentCopyElement(newRoot); final Element rootElement = document.getRootElement(); final Element parentElement = newRoot.getParent(); final Map<String, String> parentNamespaceContext = Dom4jUtils.getNamespaceContext(parentElement); final Map<String, String> rootElementNamespaceContext = Dom4jUtils.getNamespaceContext(rootElement); for (final String prefix: parentNamespaceContext.keySet()) { // NOTE: Don't use rootElement.getNamespaceForPrefix() because that will return the element prefix's // namespace even if there are no namespace nodes if (rootElementNamespaceContext.get(prefix) == null && ! prefixesToFilter.contains(prefix)) { final String uri = parentNamespaceContext.get(prefix); rootElement.addNamespace(prefix, uri); } } return document; } /** * Return a copy of the given element which includes all the namespaces in scope on the element. * * @param sourceElement element to copy * @return copied element */ public static Element copyElementCopyParentNamespaces(final Element sourceElement) { final Element newElement = sourceElement.createCopy(); copyMissingNamespaces(sourceElement.getParent(), newElement); return newElement; } /** * Workaround for Java's lack of an equivalent to C's __FILE__ and __LINE__ macros. Use * carefully as it is not fast. * * Perhaps in 1.5 we will find a better way. * * @return LocationData of caller. */ public static LocationData getLocationData() { return getLocationData(1, false); } public static LocationData getLocationData(final int depth, boolean isDebug) { // Enable this with a property for debugging only, as it is time consuming if (!isDebug && !org.orbeon.oxf.properties.Properties.instance().getPropertySet() .getBoolean("oxf.debug.enable-java-location-data", false)) return null; // Compute stack trace and extract useful information final Exception e = new Exception(); final StackTraceElement[] stkTrc = e.getStackTrace(); final int depthToUse = depth + 1; final String sysID = stkTrc[depthToUse].getFileName(); final int line = stkTrc[depthToUse].getLineNumber(); return new LocationData(sysID, line, -1); } /** * Visit a subtree of a dom4j document. * * @param container element containing the elements to visit * @param visitorListener listener to call back */ public static void visitSubtree(Element container, VisitorListener visitorListener) { visitSubtree(container, visitorListener, false); } /** * Visit a subtree of a dom4j document. * * @param container element containing the elements to visit * @param visitorListener listener to call back * @param mutable whether the source tree can mutate while being visited */ public static void visitSubtree(Element container, VisitorListener visitorListener, boolean mutable) { // If the source tree can mutate, copy the list first, otherwise dom4j might throw exceptions final List<Node> content = mutable ? new ArrayList<Node>(container.content()) : container.content(); // Iterate over the content for (final Node childNode : content) { if (childNode instanceof Element) { final Element childElement = (Element) childNode; visitorListener.startElement(childElement); visitSubtree(childElement, visitorListener, mutable); visitorListener.endElement(childElement); } else if (childNode instanceof Text) { visitorListener.text((Text) childNode); } else { // Ignore as we don't need other node types for now } } } public static String elementToDebugString(Element element) { // Open start tag final StringBuilder sb = new StringBuilder("<"); sb.append(element.getQualifiedName()); // Attributes if any for (Iterator i = element.attributeIterator(); i.hasNext();) { final Attribute currentAttribute = (Attribute) i.next(); sb.append(' '); sb.append(currentAttribute.getQualifiedName()); sb.append("=\""); sb.append(currentAttribute.getValue()); sb.append('\"'); } final boolean isEmptyElement = element.elements().isEmpty() && element.getText().length() == 0; if (isEmptyElement) { // Close empty element sb.append("/>"); } else { // Close start tag sb.append('>'); sb.append("[...]"); // Close element with end tag sb.append("</"); sb.append(element.getQualifiedName()); sb.append('>'); } return sb.toString(); } public static String attributeToDebugString(Attribute attribute) { return attribute.getQualifiedName() + "=\"" + attribute.getValue() + '\"'; } /** * Convert dom4j attributes to SAX attributes. * * @param element dom4j Element * @return SAX Attributes */ public static AttributesImpl getSAXAttributes(Element element) { final AttributesImpl result = new AttributesImpl(); for (Iterator i = element.attributeIterator(); i.hasNext();) { final Attribute attribute = (Attribute) i.next(); result.addAttribute(attribute.getNamespaceURI(), attribute.getName(), attribute.getQualifiedName(), XMLReceiverHelper.CDATA, attribute.getValue()); } return result; } public static Document createDocument(DebugXML debugXML) { final TransformerXMLReceiver identity = TransformerUtils.getIdentityTransformerHandler(); final LocationDocumentResult result = new LocationDocumentResult(); identity.setResult(result); final XMLReceiverHelper helper = new XMLReceiverHelper(new ForwardingXMLReceiver(identity) { @Override public void startDocument() {} @Override public void endDocument() {} }); try { identity.startDocument(); debugXML.toXML(helper); identity.endDocument(); } catch (SAXException e) { throw new OXFException(e); } return result.getDocument(); } /** * Encode a QName to an exploded QName (also known as a "Clark name") String. */ public static String qNameToExplodedQName(QName qName) { return (qName == null) ? null : XMLUtils.buildExplodedQName(qName.getNamespaceURI(), qName.getName()); } // http://www.w3.org/TR/xpath-30/#doc-xpath30-URIQualifiedName public static String buildURIQualifiedName(QName qName) { return XMLUtils.buildURIQualifiedName(qName.getNamespaceURI(), qName.getName()); } public interface VisitorListener { void startElement(Element element); void endElement(Element element); void text(Text text); } public interface DebugXML { void toXML(XMLReceiverHelper helper); } }