// Copyright 2010 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.enterprise.connector.util; import com.google.common.base.Charsets; import com.google.common.collect.ImmutableMap; import org.w3c.dom.CharacterData; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.EntityResolver; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; /** * Utility functions for parsing XML. * * @since 2.8 */ public class XmlParseUtil { private XmlParseUtil() { // prevents instantiation } private static Logger LOGGER = Logger.getLogger(XmlParseUtil.class.getName()); private static final String XHTML_DTD_ID = "-//W3C//DTD XHTML 1.0 Transitional//EN"; private static final String XHTML_DTD_FILE = "/xhtml1-transitional.dtd"; private static final String HTML_LAT1_ID = "-//W3C//ENTITIES Latin 1 for XHTML//EN"; private static final String HTML_LAT1_FILE = "/xhtml-lat1.ent"; private static final String HTML_SYMBOL_ID = "-//W3C//ENTITIES Symbols for XHTML//EN"; private static final String HTML_SYMBOL_FILE = "/xhtml-symbol.ent"; private static final String HTML_SPECIAL_ID = "-//W3C//ENTITIES Special for XHTML//EN"; private static final String HTML_SPECIAL_FILE = "/xhtml-special.ent"; private static final String WEBAPP_DTD_ID = "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"; private static final String WEBAPP_DTD_FILE = "/web-app_2_3.dtd"; private static final String XHTML_STRICT_DTD_ID = "-//W3C//DTD XHTML 1.0 Strict//EN"; private static final String XHTML_STRICT_DTD_FILE = "/xhtml1-strict.dtd"; private static Map<String, String> LOCAL_DTDS = ImmutableMap.<String, String> builder() .put(XHTML_DTD_ID, XHTML_DTD_FILE) .put(HTML_LAT1_ID, HTML_LAT1_FILE) .put(HTML_SYMBOL_ID, HTML_SYMBOL_FILE) .put(HTML_SPECIAL_ID, HTML_SPECIAL_FILE) .put(WEBAPP_DTD_ID, WEBAPP_DTD_FILE) .put(XHTML_STRICT_DTD_ID, XHTML_STRICT_DTD_FILE) .build(); /** * An {@link EntityResolver} implementation that resolves * entities using a selection of locally stored DTDs. This resolver * throws an exception if an unknown external entity is found. * * @since 3.2.14 */ public static final EntityResolver catalogEntityResolver = new CatalogEntityResolver(); /** * An {@link EntityResolver} implementation that always throws an exception. * * @since 3.2.14 */ public static final EntityResolver nonEntityResolver = new NonEntityResolver(); /** * An {@link EntityResolver} implementation that resolves * entities using a selection of locally stored DTDs. * * @deprecated Use {@link #catalogEntityResolver} for better security */ @Deprecated public static class LocalEntityResolver implements EntityResolver { @Override public InputSource resolveEntity(String publicId, String systemId) { return resolveLocalEntity(publicId); } } private static class CatalogEntityResolver implements EntityResolver { @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException { InputSource result = resolveLocalEntity(publicId); if (result == null) { throw resolveNotSupported(publicId, systemId); } else { return result; } } } private static class NonEntityResolver implements EntityResolver { @Override public InputSource resolveEntity(String publicId, String systemId) throws SAXException { throw resolveNotSupported(publicId, systemId); } } private static InputSource resolveLocalEntity(String publicId) { String filename = LOCAL_DTDS.get(publicId); if (filename != null) { URL url = XmlParseUtil.class.getResource(filename); if (url != null) { return new InputSource(url.toString()); } } return null; } private static SAXException resolveNotSupported(String publicId, String systemId) { return new SAXException("Error resolving " + ((publicId == null) ? "" : publicId + "-") + systemId + ". External entity resolution is not supported."); } private static final String STRICT_HTML_PREFIX = "<!DOCTYPE html PUBLIC \"" + XHTML_STRICT_DTD_ID + "\" \"\">" + "<html xmlns=\"http://www.w3.org/1999/xhtml\">" + "<head><title/></head><body><table>"; private static final String HTML_SUFFIX = "</table></body></html>"; /** * A simple {@link ErrorHandler} implementation that always * throws the {@link SAXParseException}. */ public static class ThrowingErrorHandler implements ErrorHandler { @Override public void error(SAXParseException exception) throws SAXException { throw exception; } @Override public void fatalError(SAXParseException exception) throws SAXException { throw exception; } @Override public void warning(SAXParseException exception) throws SAXException { throw exception; } } /** * Parses a form snippet using the XHTML Strict DTD, the * appropriate HTML context, and a validating parser. * * @param formSnippet the form snippet * @throws Exception if an unexpected error occrs */ public static void validateXhtml(String formSnippet) throws Exception { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setValidating(true); factory.setXIncludeAware(false); DocumentBuilder builder = factory.newDocumentBuilder(); builder.setErrorHandler(new ThrowingErrorHandler()); builder.setEntityResolver(catalogEntityResolver); String html = STRICT_HTML_PREFIX + formSnippet + HTML_SUFFIX; builder.parse(new ByteArrayInputStream(html.getBytes(Charsets.UTF_8))); } private static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); static { factory.setValidating(false); factory.setXIncludeAware(false); } /** * Parse an XML String to a {@code org.w3c.dom.Document}. * * @param fileContent the XML string * @param errorHandler the error handle for SAX parser * @param entityResolver the entity resolver to use * @return A result Document object, {@code null} on error */ public static Document parse(String fileContent, SAXParseErrorHandler errorHandler, EntityResolver entityResolver) { InputStream in = stringToInputStream(fileContent); return parse(in, errorHandler, entityResolver); } /** * Get a root {@code org.w3c.dom.Element} from the XML request body. * * @param xmlBody the XML request body as a String * @param rootTagName the root Element tag name * @return a result Element object if successful, {@code null} on error */ public static Element parseAndGetRootElement(String xmlBody, String rootTagName) { InputStream in = stringToInputStream(xmlBody); return parseAndGetRootElement(in, rootTagName); } private static InputStream stringToInputStream(String fileContent) { return new ByteArrayInputStream(fileContent.getBytes(Charsets.UTF_8)); } /** * Parse an input stream to a {@code org.w3c.dom.Document}. * * @param in the input stream * @param errorHandler the error handle for SAX parser * @param entityResolver the entity resolver to use * @return a result Document object, {@code null} on error */ public static Document parse(InputStream in, SAXParseErrorHandler errorHandler, EntityResolver entityResolver) { try { DocumentBuilder builder = factory.newDocumentBuilder(); builder.setErrorHandler(errorHandler); builder.setEntityResolver(entityResolver); Document document = builder.parse(in); return document; } catch (ParserConfigurationException pce) { LOGGER.log(Level.SEVERE, "Parse exception", pce); } catch (SAXException se) { LOGGER.log(Level.SEVERE, "SAX Exception", se); } catch (IOException ioe) { LOGGER.log(Level.SEVERE, "IO Exception", ioe); } return null; } /** * Get a root {@code org.w3c.dom.Element} from an XML input stream. * * @param in the input stream * @param rootTagName the root Element tag name * @return a result Element object if successful, {@code null} on error */ public static Element parseAndGetRootElement(InputStream in, String rootTagName) { SAXParseErrorHandler errorHandler = new SAXParseErrorHandler(); Document document = parse(in, errorHandler, nonEntityResolver); if (document == null) { LOGGER.log(Level.WARNING, "XML parsing exception!"); return null; } NodeList nodeList = document.getElementsByTagName(rootTagName); if (nodeList == null || nodeList.getLength() == 0) { LOGGER.log(Level.WARNING, "Empty node: " + rootTagName); return null; } return (Element) nodeList.item(0); } /** * Get the attribute value of a given attribute name for * the first XML {@code org.w3c.dom.Element} of given name. * * @param elem the parent XML Element * @param name the name of the child text Element * @param attrName the attribute name * @return attribute value of named child Element */ public static String getFirstAttribute(Element elem, String name, String attrName) { NodeList nodeList = elem.getElementsByTagName(name); if (nodeList.getLength() == 0) { return null; } return (((Element) nodeList.item(0)).getAttribute(attrName)); } /** * Get the attribute values of a given name/value pair for * the first XML {@code org.w3c.dom.Element} of given name. * * @param elem the parent XML Element * @param name the name of the child text Element * @return attribute name and value Map of named child Element */ public static Map<String, String> getAllAttributes(Element elem, String name) { Map<String, String> attributes = new TreeMap<String, String>(); NodeList nodeList = elem.getElementsByTagName(name); int length = nodeList.getLength(); for (int n = 0; n < length; ++n) { attributes.put(((Element) nodeList.item(n)).getAttribute("name"), ((Element) nodeList.item(n)).getAttribute("value")); } return attributes; } /** * Get text data of an optional XML {@code org.w3c.dom.Element} of given name. * <p> * Note that this differs from {@link #getFirstElementByTagName} in how it * handles missing elements vs. empty elements. Specifically, if the named * element does not exist, this returns {@code null}. However, if the named * element does exist, but is empty (<tag></tag> or * <tag/>), this returns the empty string. In both cases * {@code getFirstElementByTagName} would return {@code null}. * * @param elem the parent XML Element * @param name the name of the child text Element * @return text data of the named child element. * Returns {@code null} if the named element does not exist. * Returns the empty string if the named element exists, but is empty. */ public static String getOptionalElementByTagName(Element elem, String name) { return getElementByTagName(elem, name, ""); } /** * Get text data of first XML {@code org.w3c.dom.Element} of given name. * * @param elem the parent XML Element * @param name the name of the child text Element * @return text data of named child Element */ public static String getFirstElementByTagName(Element elem, String name) { return getElementByTagName(elem, name, null); } /** * Get text data of first XML {@code org.w3c.dom.Element} of given name. * * @param elem the parent XML Element * @param name the name of the child text Element * @param emptyValue value to return if element exists, but is empty * @return text data of named child Element */ private static String getElementByTagName(Element elem, String name, String emptyValue) { NodeList nodeList = elem.getElementsByTagName(name); if (nodeList.getLength() == 0) { return null; } NodeList children = nodeList.item(0).getChildNodes(); if (children.getLength() == 0 || children.item(0).getNodeType() != Node.TEXT_NODE) { return emptyValue; } return children.item(0).getNodeValue(); } /** * Get a list of all child text Elements of given name directly * under a given {@code org.w3c.dom.Element}. * * @param elem the parent Element * @param name the given name of searched child Elements * @return a List of values of those child text Elements */ public static List<String> getAllElementsByTagName(Element elem, String name) { NodeList nodeList = elem.getElementsByTagName(name); List<String> result = new ArrayList<String>(); for (int i = 0; i < nodeList.getLength(); ++i) { NodeList children = nodeList.item(i).getChildNodes(); if (children.getLength() == 0 || children.item(0).getNodeType() != Node.TEXT_NODE) { continue; } result.add(children.item(0).getNodeValue()); } return result; } /** * Extracts the first CDATA child from the given {@code org.w3c.dom.Element}. * * @param elem the parent Element * @return the String value of the CDATA section, or {@code null} if none * found */ public static String getCdata(Element elem) { NodeList nodes = elem.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node node = nodes.item(i); if (node.getNodeType() == Node.CDATA_SECTION_NODE) { CharacterData cdataNode = (CharacterData) node; return cdataNode.getData(); } } return null; } }