/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. */ package org.apache.shindig.gadgets.parse; import com.google.common.collect.Lists; import com.google.inject.ImplementedBy; import com.google.inject.Inject; import com.google.inject.Provider; import org.apache.shindig.common.cache.Cache; import org.apache.shindig.common.cache.CacheProvider; import org.apache.shindig.common.util.HashUtil; import org.apache.shindig.gadgets.GadgetException; import org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser; import org.w3c.dom.Attr; import org.w3c.dom.DOMException; import org.w3c.dom.DOMImplementation; import org.w3c.dom.Document; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import java.util.LinkedList; /** * Parser for arbitrary HTML content */ @ImplementedBy(NekoSimplifiedHtmlParser.class) public abstract class GadgetHtmlParser { public static final String PARSED_DOCUMENTS = "parsedDocuments"; public static final String PARSED_FRAGMENTS = "parsedFragments"; private Cache<String, Document> documentCache; private Cache<String, DocumentFragment> fragmentCache; private Provider<HtmlSerializer> serializerProvider = new DefaultSerializerProvider(); protected final DOMImplementation documentFactory; protected GadgetHtmlParser(DOMImplementation documentFactory) { this.documentFactory = documentFactory; } @Inject public void setCacheProvider(CacheProvider cacheProvider) { documentCache = cacheProvider.createCache(PARSED_DOCUMENTS); fragmentCache = cacheProvider.createCache(PARSED_FRAGMENTS); } @Inject public void setSerializerProvider(Provider<HtmlSerializer> serProvider) { this.serializerProvider = serProvider; } /** * @param content * @return true if we detect a preamble of doctype or html */ protected static boolean attemptFullDocParseFirst(String content) { String normalized = content.substring(0, Math.min(100, content.length())).toUpperCase(); return normalized.contains("<!DOCTYPE") || normalized.contains("<HTML"); } public Document parseDom(String source) throws GadgetException { Document document = null; String key = null; // Avoid checksum overhead if we arent caching boolean shouldCache = shouldCache(); if (shouldCache) { // TODO - Consider using the source if its under a certain size key = HashUtil.checksum(source.getBytes()); document = documentCache.getElement(key); } if (document == null) { try { document = parseDomImpl(source); } catch (DOMException e) { // DOMException is a RuntimeException document = errorDom(e); HtmlSerialization.attach(document, serializerProvider.get(), source); return document; } catch (NullPointerException e) { throw new GadgetException(GadgetException.Code.INTERNAL_SERVER_ERROR, "Caught exception in parseDomImpl", e); } HtmlSerialization.attach(document, serializerProvider.get(), source); Node html = document.getDocumentElement(); Node head = null; Node body = null; LinkedList<Node> beforeHead = Lists.newLinkedList(); LinkedList<Node> beforeBody = Lists.newLinkedList(); while (html.hasChildNodes()) { Node child = html.removeChild(html.getFirstChild()); if (child.getNodeType() == Node.ELEMENT_NODE && "head".equalsIgnoreCase(child.getNodeName())) { if (head == null) { head = child; } else { // Concatenate <head> elements together. transferChildren(head, child); } } else if (child.getNodeType() == Node.ELEMENT_NODE && "body".equalsIgnoreCase(child.getNodeName())) { if (body == null) { body = child; } else { // Concatenate <body> elements together. transferChildren(body, child); } } else if (head == null) { beforeHead.add(child); } else if (body == null) { beforeBody.add(child); } else { // Both <head> and <body> are present. Append to tail of <body>. body.appendChild(child); } } // Ensure head tag exists if (head == null) { // beforeHead contains all elements that should be prepended to <body>. Switch them. LinkedList<Node> temp = beforeBody; beforeBody = beforeHead; beforeHead = temp; // Add as first element head = document.createElement("head"); html.insertBefore(head, html.getFirstChild()); } else { // Re-append head node. html.appendChild(head); } // Ensure body tag exists. if (body == null) { // Add immediately after head. body = document.createElement("body"); html.insertBefore(body, head.getNextSibling()); } else { // Re-append body node. html.appendChild(body); } // Leftovers: nodes before the first <head> node found and the first <body> node found. // Prepend beforeHead to the front of <head>, and beforeBody to beginning of <body>, // in the order they were found in the document. prependToNode(head, beforeHead); prependToNode(body, beforeBody); // One exception. <style>/<link rel="stylesheet" nodes from <body> end up at the end of <head>, // since doing so is HTML compliant and can never break rendering due to ordering concerns. LinkedList<Node> styleNodes = Lists.newLinkedList(); NodeList bodyKids = body.getChildNodes(); for (int i = 0; i < bodyKids.getLength(); ++i) { Node bodyKid = bodyKids.item(i); if (bodyKid.getNodeType() == Node.ELEMENT_NODE && isStyleElement((Element)bodyKid)) { styleNodes.add(bodyKid); } } for (Node styleNode : styleNodes) { head.appendChild(body.removeChild(styleNode)); } // Finally, reprocess all script nodes for OpenSocial purposes, as these // may be interpreted (rightly, from the perspective of HTML) as containing text only. reprocessScriptForOpenSocial(html); if (shouldCache) { documentCache.addElement(key, document); } } if (shouldCache) { Document copy = (Document)document.cloneNode(true); HtmlSerialization.copySerializer(document, copy); return copy; } return document; } protected void transferChildren(Node to, Node from) { while (from.hasChildNodes()) { to.appendChild(from.removeChild(from.getFirstChild())); } } protected void prependToNode(Node to, LinkedList<Node> from) { while (!from.isEmpty()) { to.insertBefore(from.removeLast(), to.getFirstChild()); } } private boolean isStyleElement(Element elem) { return "style".equalsIgnoreCase(elem.getNodeName()) || ("link".equalsIgnoreCase(elem.getNodeName()) && ("stylesheet".equalsIgnoreCase(elem.getAttribute("rel")) || elem.getAttribute("type").toLowerCase().contains("css"))); } /** * Parses a snippet of markup and appends the result as children to the * provided node. * * @param source markup to be parsed * @param result Node to append results to * @throws GadgetException */ public void parseFragment(String source, Node result) throws GadgetException { boolean shouldCache = shouldCache(); String key = null; if (shouldCache) { key = HashUtil.checksum(source.getBytes()); DocumentFragment cachedFragment = fragmentCache.getElement(key); if (cachedFragment != null) { copyFragment(cachedFragment, result); return; } } DocumentFragment fragment = null; try { fragment = parseFragmentImpl(source); } catch (DOMException e) { // DOMException is a RuntimeException appendParseException(result, e); return; } reprocessScriptForOpenSocial(fragment); if (shouldCache) { fragmentCache.addElement(key, fragment); } copyFragment(fragment, result); } private void copyFragment(DocumentFragment source, Node dest) { Document destDoc = dest.getOwnerDocument(); NodeList nodes = source.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node clone = destDoc.importNode(nodes.item(i), true); dest.appendChild(clone); } } protected Document errorDom(DOMException e) { // Create a bare-bones DOM whose body is just error text. // We do this to echo information to the developer that originally // supplied the data, since doing so is more useful than simply // returning a black-box HTML error code stemming from an NPE or other condition downstream. // The method is protected to allow overriding of this behavior. Document doc = documentFactory.createDocument(null, null, null); Node html = doc.createElement("html"); html.appendChild(doc.createElement("head")); Node body = doc.createElement("body"); appendParseException(body, e); html.appendChild(body); doc.appendChild(html); return doc; } private void appendParseException(Node node, DOMException e) { node.appendChild(node.getOwnerDocument().createTextNode( GadgetException.Code.HTML_PARSE_ERROR.toString() + ": " + e.toString())); } protected boolean shouldCache() { return documentCache != null && documentCache.getCapacity() != 0; } private void reprocessScriptForOpenSocial(Node root) throws GadgetException { LinkedList<Node> nodeQueue = Lists.newLinkedList(); nodeQueue.add(root); while (!nodeQueue.isEmpty()) { Node next = nodeQueue.removeFirst(); if (next.getNodeType() == Node.ELEMENT_NODE && "script".equalsIgnoreCase(next.getNodeName())) { Attr typeAttr = (Attr)next.getAttributes().getNamedItem("type"); if (typeAttr != null && SocialDataTags.SCRIPT_TYPE_TO_OSML_TAG.get(typeAttr.getValue()) != null) { // The underlying parser impl may have already parsed these. // Only re-parse with the coalesced text children if that's all there are. boolean parseOs = true; StringBuilder sb = new StringBuilder(); NodeList scriptKids = next.getChildNodes(); for (int i = 0; parseOs && i < scriptKids.getLength(); ++i) { Node scriptKid = scriptKids.item(i); if (scriptKid.getNodeType() != Node.TEXT_NODE) { parseOs = false; } sb.append(scriptKid.getTextContent()); } if (parseOs) { // Clean out the script node. while (next.hasChildNodes()) { next.removeChild(next.getFirstChild()); } DocumentFragment osFragment = parseFragmentImpl(sb.toString()); while (osFragment.hasChildNodes()) { Node osKid = osFragment.removeChild(osFragment.getFirstChild()); osKid = next.getOwnerDocument().adoptNode(osKid); if (osKid.getNodeType() == Node.ELEMENT_NODE) { next.appendChild(osKid); } } } } } // Enqueue children for inspection. NodeList children = next.getChildNodes(); for (int i = 0; i < children.getLength(); ++i) { nodeQueue.add(children.item(i)); } } } /** * TODO: remove the need for parseDomImpl as a parsing method. Gadget HTML is * tag soup handled in custom fashion, or is a legitimate fragment. In either case, * we can simply use the fragment parsing implementation and patch up in higher-level calls. * @param source a piece of HTML * @return a Document parsed from the HTML * @throws GadgetException */ protected abstract Document parseDomImpl(String source) throws GadgetException; /** * @param source a snippet of HTML markup * @return a DocumentFragment containing the parsed elements * @throws GadgetException */ protected abstract DocumentFragment parseFragmentImpl(String source) throws GadgetException; private static class DefaultSerializerProvider implements Provider<HtmlSerializer> { public HtmlSerializer get() { return new DefaultHtmlSerializer(); } } }