/**********************************************************************************
 *
 * Copyright (c) 2003, 2004, 2007, 2008 The Sakai Foundation
 *
 * Licensed under the Educational Community License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.opensource.org/licenses/ECL-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************************/

package edu.indiana.lib.twinpeaks.util;

import java.io.*;
import java.util.*;

import javax.xml.parsers.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.*;

import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.xml.sax.*;

/**
 * Static helpers for building, searching, parsing and serializing W3C DOM
 * documents.  The class cannot be instantiated; every method is static.
 */
public class DomUtils {

  private static final org.apache.commons.logging.Log _log = LogUtils.getLog(DomUtils.class);

  /**
   * Default encoding (NekoHTML).  Only referenced by the disabled NekoHTML
   * code kept in comments further down.
   */
  private static final String ENCODING_OPTION = "http://cyberneko.org/html/properties/default-encoding";

  private DomUtils() {
  }

  public final static String INPUT_ENCODING = "iso-8859-1";
  public final static String ENCODING       = "UTF-8";

  /**
   * Create a new element
   * @param document Document to contain the new element
   * @param name the element name
   * @return new Element (created by the document, not yet attached to it)
   */
  public static Element createElement(Document document, String name) {
    return document.createElement(name);
  }

  /**
   * Add a new element to the given parent
   * @param parent the parent Element
   * @param name the child name
   * @return new Element (already appended to the parent)
   */
  public static Element createElement(Element parent, String name) {
    Element element = parent.getOwnerDocument().createElement(name);

    parent.appendChild(element);
    return element;
  }

  /**
   * Add Text object to an Element.
   * @param element the containing element
   * @param text the text to add
   */
  public static void addText(Element element, String text) {
    element.appendChild(element.getOwnerDocument().createTextNode(text));
  }

  /**
   * Add an entity to a specified Element.
   * (eg <code>DomUtils.addEntity(element, "nbsp");</code>)
   * @param element the containing element
   * @param entity the entity to add
   */
  public static void addEntity(Element element, String entity) {
    element.appendChild(element.getOwnerDocument().createEntityReference(entity));
  }

  /**
   * "Normalize" XML text node content to create a simple string
   * @param update Text to normalize
   * @return Normalized text (an empty string if <code>update</code> is null)
   * @see #normalizeText(String, String)
   */
  public static String normalizeText(String update) {
    return normalizeText(null, update);
  }

  /**
   * "Normalize" XML text node content to create a simple string.
   * Each part is trimmed, the parts are joined with a single space, and the
   * result is handed to <code>StringUtils.replace(text, "\\s", " ")</code>
   * (presumably a regular expression replacement of whitespace/pagination
   * characters - verify against StringUtils).
   *
   * @param original Original text (may be null)
   * @param update Text to add to the original string (may be null)
   * @return Concatenated contents (trimmed, with a space separator); an empty
   *         string when both arguments are null
   */
  public static String normalizeText(String original, String update) {
    if (original == null) {
      return (update == null) ? "" : StringUtils.replace(update.trim(), "\\s", " ");
    }

    if (update == null) {
      /*
       * Nothing to append.  This used to throw a NullPointerException
       * (reachable from textSearch() when a child element has no text).
       */
      return StringUtils.replace(original.trim(), "\\s", " ");
    }

    StringBuilder result = new StringBuilder(original.trim());

    result.append(' ').append(update.trim());
    return StringUtils.replace(result.toString(), "\\s", " ");
  }

  /**
   * Get the text associated with this element, at this level only
   * @param parent the node containing text
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  public static String getText(Node parent) {
    return textSearch(parent, false);
  }

  /**
   * Get the text associated with a specified element, at this level only
   * @param parent the node containing text (must be an Element)
   * @param elementName Element with the text we want to fetch
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  public static String getText(Node parent, String elementName) {
    Element element = getElement((Element) parent, elementName);

    return (element == null) ? null : textSearch(element, false);
  }

  /**
   * Get the text associated with a specified element, at this level
   * only - namespace aware
   *
   * @param namespace Namespace URI
   * @param parent the node containing text (must be an Element)
   * @param elementName Element with the text we want to fetch
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  public static String getTextNS(String namespace, Node parent, String elementName) {
    Element element = getElementNS(namespace, (Element) parent, elementName);

    return (element == null) ? null : textSearch(element, false);
  }

  /**
   * Get the text associated with a specified element, at this level
   * only - namespace aware
   *
   * @param namespace Namespace URI
   * @param parent the node containing text
   * @param elementName Element with the text we want to fetch
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  /*
  public static String getTextNS(String namespace, Element parent, String elementName) {
    Element element = getElementNS(namespace, parent, elementName);
    if (element == null) {
      return null;
    }
    return textSearch(element, false);
  }
  */

  /**
   * Get the text associated with this element and with its child elements.
   * Equivalent to <code>textSearch(parent, true)</code>.
   * @param parent the node containing text
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  public static String getAllTextAtNode(Node parent) {
    return textSearch(parent, true);
  }

  /**
   * Get the text associated with this element at this level only, or
   * additionally the text of its child elements.
   * <p>
   * Note: child elements are read with {@link #getText(Node)}, so the
   * "recursive" search descends exactly one level below
   * <code>parent</code>.  NOTE(review): the method names suggest a full-depth
   * search may have been intended - confirm before changing.
   *
   * @param parent the node containing text (null is allowed)
   * @param recursiveSearch Search the child elements as well?
   * @return Text (trimmed of leading/trailing whitespace), null if
   *         <code>parent</code> is null or has neither text nodes nor (when
   *         searching child elements) element children.  A child element
   *         without text contributes nothing, but turns an otherwise null
   *         result into an empty string.
   */
  public static String textSearch(Node parent, boolean recursiveSearch) {
    if (parent == null) {
      return null;
    }

    String text = null;

    for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
      switch (child.getNodeType()) {
        case Node.TEXT_NODE:
          text = normalizeText(text, child.getNodeValue());
          break;

        case Node.ELEMENT_NODE:
          if (recursiveSearch) {
            /* getText() may return null; normalizeText() tolerates that */
            text = normalizeText(text, getText(child));
          }
          break;

        default:
          break;
      }
    }
    return (text == null) ? null : text.trim();
  }

  /**
   * Get the first text node associated with this element
   * @param parent the node containing text
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  public static String getFirstText(Node parent) {
    return getTextNodeByNumber(parent, 1);
  }

  /**
   * Get the specified text node associated with this element
   * @param parent the node containing text (null is allowed)
   * @param number The text node to fetch (1st, 2nd, etc)
   * @return Text (trimmed of leading/trailing whitespace, null if none)
   */
  public static String getTextNodeByNumber(Node parent, int number) {
    if (parent == null) {
      return null;
    }

    int textNodesSeen = 0;

    for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
      /* Only text nodes are counted; other node types are skipped */
      if ((child.getNodeType() == Node.TEXT_NODE) && (++textNodesSeen == number)) {
        return child.getNodeValue().trim();
      }
    }
    return null;
  }

  /**
   * Get any text associated with this element and its children (direct
   * child elements only).  Null if none.
   * @param parent the node containing text (null is allowed)
   * @return Text
   */
  public static String getAllText(Node parent) {
    String text = null;

    if (parent != null) {
      for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
        short type = child.getNodeType();

        if (type == Node.TEXT_NODE) {
          text = normalizeText(text, child.getNodeValue());
        } else if (type == Node.ELEMENT_NODE) {
          String childText = getText(child);

          if (childText != null) {
            text = normalizeText(text, childText);
          }
        }
      }
    }
    return text;
  }

  /**
   * Get an Attribute from an Element.  Returns an empty String if none found
   * @param element the containing Element
   * @param name the attribute name
   * @return Attribute as a String
   */
  public static String getAttribute(Element element, String name) {
    return element.getAttribute(name);
  }

  /**
   * Set an Attribute in an Element
   * @param element the containing Element
   * @param name the attribute name
   * @param value the attribute value
   */
  public static void setAttribute(Element element, String name, String value) {
    element.setAttribute(name, value);
  }

  /**
   * Return a list of named Elements.
   * @param element the containing Element
   * @param name the tag name
   * @return NodeList of matching elements
   */
  public static NodeList getElementList(Element element, String name) {
    return element.getElementsByTagName(name);
  }

  /**
   * Return a list of specified namespace:Elements
   * @param namespace Namespace URI
   * @param element the containing Element
   * @param name the tag name
   * @return NodeList of matching elements
   */
  public static NodeList getElementListNS(String namespace, Element element, String name) {
    return element.getElementsByTagNameNS(namespace, name);
  }

  /**
   * Return a list of named Elements with a specific attribute value.
   * @param element the containing Element
   * @param name the tag name
   * @param attribute Attribute name
   * @param value Attribute value
   * @return List of matching elements
   */
  public static List selectElementsByAttributeValue(Element element, String name,
                                                    String attribute, String value) {
    return selectElementsByAttributeValue(element, name, attribute, value, false);
  }

  /**
   * Return the first named Element with a specific attribute value.
   * @param element the containing Element
   * @param name the tag name
   * @param attribute Attribute name
   * @param value Attribute value
   * @return The first matching Element (null if none)
   */
  public static Element selectFirstElementByAttributeValue(Element element, String name,
                                                           String attribute, String value) {
    List resultList = selectElementsByAttributeValue(element, name, attribute, value, true);

    return resultList.isEmpty() ? null : (Element) resultList.get(0);
  }

  /**
   * Return a list of named Elements with a specific attribute value.
   * @param element the containing Element
   * @param name the tag name
   * @param attribute Attribute name
   * @param value Attribute value
   * @param returnFirst Return only the first matching value?
   * @return List of matching elements (empty if none)
   */
  public static List selectElementsByAttributeValue(Element element, String name,
                                                    String attribute, String value,
                                                    boolean returnFirst) {
    return filterByAttributeValue(element.getElementsByTagName(name), attribute, value, returnFirst);
  }

  /**
   * Return a list of named Elements with a specific attribute
   * value (namespace aware)
   *
   * @param namespace Namespace URI
   * @param element the containing Element
   * @param name the tag name
   * @param attribute Attribute name
   * @param value Attribute value
   * @return List of matching elements
   */
  public static List selectElementsByAttributeValueNS(String namespace, Element element,
                                                      String name,
                                                      String attribute, String value) {
    return selectElementsByAttributeValueNS(namespace, element, name, attribute, value, false);
  }

  /**
   * Return the first named Element with a specific attribute
   * value (namespace aware)
   *
   * @param namespace The namespace URI
   * @param element the containing Element
   * @param name the tag name
   * @param attribute Attribute name
   * @param value Attribute value
   * @return The first matching Element (null if none)
   */
  public static Element selectFirstElementByAttributeValueNS(String namespace, Element element,
                                                             String name,
                                                             String attribute, String value) {
    List resultList = selectElementsByAttributeValueNS(namespace, element, name,
                                                       attribute, value, true);

    return resultList.isEmpty() ? null : (Element) resultList.get(0);
  }

  /**
   * Return a list of named Elements with a specific attribute
   * value (namespace aware)
   *
   * @param namespace Namespace URI
   * @param element the containing Element
   * @param name the tag name
   * @param attribute Attribute name
   * @param value Attribute value
   * @param returnFirst Return only the first matching value?
   * @return List of matching elements (empty if none)
   */
  public static List selectElementsByAttributeValueNS(String namespace, Element element,
                                                      String name,
                                                      String attribute, String value,
                                                      boolean returnFirst) {
    return filterByAttributeValue(element.getElementsByTagNameNS(namespace, name),
                                  attribute, value, returnFirst);
  }

  /**
   * Collect the elements of a NodeList whose attribute equals the given value.
   * Shared by the plain and namespace-aware attribute selectors.
   *
   * @param candidates Elements to examine
   * @param attribute Attribute name
   * @param value Required attribute value (a null value never matches)
   * @param returnFirst Stop after the first match?
   * @return Matching elements, in document order (empty if none)
   */
  private static List<Element> filterByAttributeValue(NodeList candidates, String attribute,
                                                      String value, boolean returnFirst) {
    List<Element> matches = new ArrayList<Element>();
    int length = candidates.getLength();

    for (int i = 0; i < length; i++) {
      Element candidate = (Element) candidates.item(i);

      if (getAttribute(candidate, attribute).equals(value)) {
        matches.add(candidate);

        if (returnFirst) {
          break;
        }
      }
    }
    return matches;
  }

  /**
   * Return the first named Element found.
   * @param element the containing Element
   * @param name the tag name
   * @return matching Element (null if none)
   */
  public static Element getElement(Element element, String name) {
    NodeList nodeList = getElementList(element, name);

    return (nodeList.getLength() == 0) ? null : (Element) nodeList.item(0);
  }

  /**
   * Return the first named Element found - namespace aware
   * @param namespace Namespace URI
   * @param element the containing Element
   * @param name the tag name
   * @return matching Element (null if none)
   */
  public static Element getElementNS(String namespace, Element element, String name) {
    NodeList nodeList = getElementListNS(namespace, element, name);

    return (nodeList.getLength() == 0) ? null : (Element) nodeList.item(0);
  }

  /**
   * Remove this node from its parent.
   * <p>
   * This was an instance method, which made it unreachable (the class has
   * a private constructor and is never instantiated); it is now static like
   * every other helper here.
   *
   * @param node the node to remove (must currently have a parent)
   * @return Node removed
   */
  public static Node removeNode(Node node) {
    return node.getParentNode().removeChild(node);
  }

  /**
   * Search up the tree (through the ancestors) for a given node
   * @param currentNode Starting point for our search
   * @param tagName Node name to look up
   * @return matching Node (null if none)
   */
  public static Node getPreviousNodeByName(Node currentNode, String tagName) {
    Node node = currentNode.getParentNode();

    while ((node != null) && (!node.getNodeName().equals(tagName))) {
      node = node.getParentNode();
    }
    return node;
  }

  /**
   * Search earlier siblings for a given node
   * @param currentNode Starting point for our search
   * @param tagName Node name to look up
   * @return matching Node (null if none)
   */
  public static Node getPreviousSiblingByName(Node currentNode, String tagName) {
    Node node = currentNode.getPreviousSibling();

    while ((node != null) && (!node.getNodeName().equals(tagName))) {
      node = node.getPreviousSibling();
    }
    return node;
  }

  /**
   * Search our next siblings for a given node
   * @param currentNode Starting point for our search
   * @param tagName Node name to look up
   * @return matching Node (null if none)
   */
  public static Node getNextSiblingByName(Node currentNode, String tagName) {
    Node node = currentNode.getNextSibling();

    while ((node != null) && (!node.getNodeName().equals(tagName))) {
      node = node.getNextSibling();
    }
    return node;
  }

  /**
   * Search across the tree for a given sibling
   * @param currentNode Starting point for our search
   * @param tagName Node name to look up
   * @return matching Node (null if none)
   * @deprecated Replaced by {@link #getNextSiblingByName(Node currentNode, String tagName)}
   */
  @Deprecated
  public static Node getNextNodeByName(Node currentNode, String tagName) {
    return getNextSiblingByName(currentNode, tagName);
  }

  /**
   * Search for a named (direct) child of a given node
   * @param currentNode Starting point for our search
   * @param tagName Node name to look up
   * @return matching Node (null if none)
   */
  public static Node getChildSiblingByName(Node currentNode, String tagName) {
    Node node = currentNode.getFirstChild();

    while ((node != null) && (!node.getNodeName().equals(tagName))) {
      node = node.getNextSibling();
    }
    return node;
  }

  /**
   * Get a DOM Document builder.
   * @return The DocumentBuilder
   * @throws DomException
   */
  public static DocumentBuilder getXmlDocumentBuilder() throws DomException {
    return getXmlDocumentBuilder(false);
  }

  /**
   * Get a DOM Document builder - namespace aware
   * @return The DocumentBuilder
   * @throws DomException
   */
  public static DocumentBuilder getXmlDocumentBuilderNS() throws DomException {
    return getXmlDocumentBuilder(true);
  }

  /**
   * Get a DOM Document builder.
   * <p>
   * NOTE(review): the factory is used with its default configuration, so
   * DTD and external entity processing are not restricted here.  If any
   * caller parses untrusted XML, consider enabling secure processing /
   * disabling external entities - confirm that no caller relies on DTDs first.
   *
   * @param namespaceAware true if we're to handle namespace details
   * @return The DocumentBuilder
   * @throws DomException if the builder cannot be created
   */
  public static DocumentBuilder getXmlDocumentBuilder(boolean namespaceAware)
                                                      throws DomException {
    try {
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

      factory.setNamespaceAware(namespaceAware);
      _log.debug("DOM parse: namespace aware = " + namespaceAware);

      return factory.newDocumentBuilder();

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Start a new XML Document (with root name = xml)
   * @return the Document
   * @throws DomException
   */
  public static Document createXmlDocument() throws DomException {
    return createXmlDocument("xml");
  }

  /**
   * Start a new XML Document.
   * @param rootName The name of the Document root Element (created here)
   * @return the Document
   * @throws DomException
   */
  public static Document createXmlDocument(String rootName) throws DomException {
    try {
      Document document = getXmlDocumentBuilder().newDocument();

      document.appendChild(document.createElement(rootName));
      return document;

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Copy an XML document, adding it as a child of the target document root
   * @param source Document to copy
   * @param target Document to contain copy
   */
  public static void copyDocument(Document source, Document target) {
    Node node = target.importNode(source.getDocumentElement(), true);

    target.getDocumentElement().appendChild(node);
  }

  /**
   * Copy a Node from one source document, adding it to the document
   * root of a different, target Document
   * @param source Node to copy
   * @param target Document to contain copy
   */
  public static void copyDocumentNode(Node source, Document target) {
    Node node = target.importNode(source, true);

    target.getDocumentElement().appendChild(node);
  }

  /**
   * Parse XML text (from an input stream) into a Document.
   * @param xmlStream The XML text stream
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlStream(InputStream xmlStream) throws DomException {
    try {
      return getXmlDocumentBuilder().parse(new InputSource(xmlStream));

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Parse XML text (from an input stream) into a Document - namespace aware.
   * @param xmlStream The XML text stream
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlStreamNS(InputStream xmlStream) throws DomException {
    try {
      return getXmlDocumentBuilderNS().parse(new InputSource(xmlStream));

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Parse XML text (from a Reader) into a Document.
   * @param xmlReader The XML Reader
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlReader(Reader xmlReader) throws DomException {
    try {
      return getXmlDocumentBuilder().parse(new InputSource(xmlReader));

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Parse XML text (from a raw byte array) into a Document.
   * @param xml The XML text
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlBytes(byte[] xml) throws DomException {
    return parseXmlStream(new ByteArrayInputStream(xml));
  }

  /**
   * Parse XML text (from a raw byte array) into a Document - namespace aware.
   * @param xml The XML text
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlBytesNS(byte[] xml) throws DomException {
    return parseXmlStreamNS(new ByteArrayInputStream(xml));
  }

  /**
   * Parse XML text (from a string) into a Document.
   * <p>
   * The text is handed to the parser as characters.  It used to be converted
   * to bytes with the platform default charset first, which let the parser
   * mis-decode non-ASCII content whenever that charset differed from the
   * encoding the parser assumed.
   *
   * @param xml The XML text
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlString(String xml) throws DomException {
    return parseXmlReader(new StringReader(xml));
  }

  /**
   * Parse an XML file into a Document.
   * @param filename - The filename to parse (passed to the parser as a URI)
   * @return DOM Document
   * @throws DomException
   */
  public static Document parseXmlFile(String filename) throws DomException {
    try {
      return getXmlDocumentBuilder().parse(filename);

    } catch (Exception exception) {
      throw wrap(exception);
    }
  }

  /**
   * Set up and configure an HTML DOM parser.  We specify a
   * default encoding value to be used when no encoding information
   * is available in the HTML document itself.
   *
   * An appropriate META tag will override this default:
   * <code>
   *    <meta http-equiv="Content-Type" content="text/html; charset=XXXX">
   * </code>
   *
   * @return The parser
   */
  /*******************************************************************************
   * We originally used the Neko HTML parser here.  This was a boon as it
   * gracefully handled both HTML and XML (which it wrapped in HTML and
   * BODY tags).  Sadly, it is closely tied to Xerces,
   *
   * At a future date, we'll look for an appropriate substitute.  At present,
   * parsing only XML is good enough (the Sirsi Web2 Bridge is the only
   * supported search source, and it's an XML API to SingleSearch).

  private static org.cyberneko.html.parsers.DOMParser newHtmlDomParser()
                                  throws SAXNotRecognizedException,
                                         SAXNotSupportedException {
    org.cyberneko.html.parsers.DOMParser domParser;

    domParser = new org.cyberneko.html.parsers.DOMParser();
    domParser.setProperty(ENCODING_OPTION, INPUT_ENCODING);

    return domParser;
  }
  *******************************************************************************/

  /**
   * Parse HTML from a Reader
   * @param reader Reader input
   * @return DOM Document
   * @throws DomException
   */
  /*******************************************************************************
   *
   * See notes on Neko HTML (above)
   *
  public static Document parseHtmlReader(Reader reader) throws DomException {
    return parseHtmlFromInputSource(new InputSource(reader));
  }
  *******************************************************************************/

  /**
   * Parse HTML from an InputSource
   * @param in InputSource
   * @return DOM Document
   * @throws DomException
   */
  /*******************************************************************************
   *
   * See notes on Neko HTML (above)
   *
  public static Document parseHtmlFromInputSource(InputSource in) throws DomException {
    try {
      org.cyberneko.html.parsers.DOMParser domParser;

      domParser = newHtmlDomParser();
      domParser.parse(in);

      return domParser.getDocument();

    } catch (Exception e) {
      throw new DomException(e.toString());
    }
  }
  *******************************************************************************/

  /**
   * Parse HTML text (from a raw byte array) into a Document.  The text is
   * handled by the (namespace aware) XML parser, so it must be well-formed.
   * <p>
   * NOTE(review): this variant is namespace aware while
   * {@link #parseHtmlString(String)} is not - confirm whether that difference
   * is intentional.
   *
   * @param html The HTML text
   * @return DOM Document
   * @throws DomException
   *<p>
   * This used to be:
   * <code>parseHtmlStream(new ByteArrayInputStream(html));</code>
   */
  public static Document parseHtmlBytes(byte[] html) throws DomException {
    return parseXmlStreamNS(new ByteArrayInputStream(html));
  }

  /**
   * Parse HTML text (from a String) into a Document.  The text is handled
   * by the XML parser, so it must be well-formed.
   * @param html The HTML text
   * @return DOM Document
   * @throws DomException
   *<p>
   * This used to be:
   * <code>return parseHtmlReader(new StringReader(html));</code>
   */
  public static Document parseHtmlString(String html) throws DomException {
    return parseXmlReader(new StringReader(html));
  }

  /**
   * Write formatted XML text to supplied OutputStream.
   * @param node Node to write
   * @param target stream to write to
   * @throws DomException
   */
  public static void serializeXml(Node node, OutputStream target) throws DomException {
    transformIndented(node, new StreamResult(target));
  }

  /**
   * Write formatted XML text to supplied Writer.
   * @param node the Node to write
   * @param writer Writer the document is written to
   * @throws DomException
   */
  public static void serializeXml(Node node, Writer writer) throws DomException {
    transformIndented(node, new StreamResult(writer));
  }

  /**
   * Serialize a node to the given result with indentation requested.
   * Shared by both serializeXml() variants.
   *
   * @param node Node to write
   * @param result Destination for the serialized text
   * @throws DomException if the transformer cannot be created or fails
   */
  private static void transformIndented(Node node, StreamResult result) throws DomException {
    try {
      Transformer transformer = TransformerFactory.newInstance().newTransformer();

      transformer.setOutputProperty(OutputKeys.INDENT, "yes");
      transformer.transform(new DOMSource(node), result);

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Write formatted XML text to a String.
   * <p>
   * For a Document, the document (root) element is what gets serialized.
   * The text is collected as characters; it used to be encoded to UTF-8
   * bytes and then decoded with the platform default charset, which could
   * corrupt non-ASCII content.
   *
   * @param object The XML Document, HTML Document, or Element to write
   * @return String containing the formatted document text
   * @throws DomException on any failure, including an unsupported or null
   *         <code>object</code>
   */
  public static String serialize(Object object) throws DomException {
    try {
      Node node;

      if (object instanceof Document) {
        node = ((Document) object).getDocumentElement();
      } else if (object instanceof Element) {
        node = (Node) object;
      } else {
        throw new IllegalArgumentException("Unexpected object for serialzation: "
                                           + object.toString());
      }

      StringWriter writer = new StringWriter();

      serializeXml(node, writer);
      return writer.toString();

    } catch (Exception e) {
      throw wrap(e);
    }
  }

  /**
   * Convert any exception into a DomException.  The message is the original
   * exception's toString() text (as before); the original exception is also
   * attached as the cause so its stack trace is not lost.
   *
   * @param e The exception to convert
   * @return DomException describing <code>e</code>
   */
  private static DomException wrap(Exception e) {
    DomException wrapped = new DomException(e.toString());

    try {
      wrapped.initCause(e);
    } catch (IllegalStateException ignored) {
      /* DomException already recorded a cause of its own - leave it alone */
    }
    return wrapped;
  }
}