GadgetHtmlParser.java example

Explorer
gatein-shindig-master
- extras
  - src
    - main
      - java
        org
        apache
        shindig
        extras
        ShindigExtrasGuiceModule.java
        as
        ActivityStreamsGuiceModule.java
        core
        model
        ActionLinkImpl.java
        ActivityEntryImpl.java
        ActivityObjectImpl.java
        ActivityStreamImpl.java
        MediaLinkImpl.java
        opensocial
        model
        ActionLink.java
        ActivityEntry.java
        ActivityObject.java
        ActivityStream.java
        MediaLink.java
        service
        ActivityStreamsHandler.java
        spi
        ActivityStreamService.java
        sample
        ActivityStreamsJsonDbService.java
- java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package org.apache.shindig.gadgets.parse;

import com.google.common.collect.Lists;
import com.google.inject.ImplementedBy;
import com.google.inject.Inject;
import com.google.inject.Provider;

import org.apache.shindig.common.cache.Cache;
import org.apache.shindig.common.cache.CacheProvider;
import org.apache.shindig.common.util.HashUtil;
import org.apache.shindig.gadgets.GadgetException;
import org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser;
import org.w3c.dom.Attr;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import java.util.LinkedList;
import java.util.Locale;

/**
 * Parser for arbitrary HTML content
 */
@ImplementedBy(NekoSimplifiedHtmlParser.class)
public abstract class GadgetHtmlParser {

  /** Cache name handed to the {@link CacheProvider} for the parsed-document cache. */
  public static final String PARSED_DOCUMENTS = "parsedDocuments";
  /** Cache name handed to the {@link CacheProvider} for the parsed-fragment cache. */
  public static final String PARSED_FRAGMENTS = "parsedFragments";

  // Both caches remain null until setCacheProvider is called; shouldCache()
  // inspects documentCache to decide whether caching is active.
  private Cache<String, Document> documentCache;
  private Cache<String, DocumentFragment> fragmentCache;
  // Starts out producing DefaultHtmlSerializer instances; setSerializerProvider replaces it.
  private Provider<HtmlSerializer> serializerProvider = new DefaultSerializerProvider();
  // Used by errorDom to create the stand-in document returned on parse errors.
  protected final DOMImplementation documentFactory;

  /**
   * Stores the DOM implementation for later use by this parser.
   *
   * @param documentFactory DOM implementation kept in {@link #documentFactory};
   *        {@code errorDom} uses it to create the error document
   */
  protected GadgetHtmlParser(DOMImplementation documentFactory) {
    this.documentFactory = documentFactory;
  }

  /**
   * Creates the document and fragment caches from the supplied provider, using
   * the {@link #PARSED_DOCUMENTS} and {@link #PARSED_FRAGMENTS} cache names.
   *
   * @param cacheProvider source of the two caches
   */
  @Inject
  public void setCacheProvider(CacheProvider cacheProvider) {
    this.documentCache = cacheProvider.createCache(PARSED_DOCUMENTS);
    this.fragmentCache = cacheProvider.createCache(PARSED_FRAGMENTS);
  }

  /**
   * Replaces the provider used to obtain the serializer that is attached to
   * every document produced by {@code parseDom}.
   *
   * @param serProvider provider to use from now on
   */
  @Inject
  public void setSerializerProvider(Provider<HtmlSerializer> serProvider) {
    serializerProvider = serProvider;
  }

  /**
   * Checks whether the markup starts with a document preamble, i.e. whether a
   * {@code <!DOCTYPE} or {@code <html} token occurs within its first 100 characters.
   * The comparison is case-insensitive and does not depend on the JVM's default locale.
   *
   * @param content markup to inspect; {@code null} is treated as having no preamble
   * @return true if we detect a preamble of doctype or html
   */
  protected static boolean attemptFullDocParseFirst(String content) {
    if (content == null) {
      // Nothing to inspect; report "no preamble" instead of failing with an NPE.
      return false;
    }
    // Only the leading 100 characters are examined so large inputs stay cheap to test.
    String prefix = content.substring(0, Math.min(100, content.length()));
    // Fixed locale: case mapping must not vary with the platform default.
    String normalized = prefix.toUpperCase(Locale.ENGLISH);
    return normalized.contains("<!DOCTYPE") || normalized.contains("<HTML");
  }

  /**
   * Parses the given markup into a {@link Document} and normalizes its structure so
   * that the document element ends up with exactly one head node followed by one
   * body node.
   *
   * When caching is enabled (see {@link #shouldCache()}) the normalized document is
   * stored under a checksum of the source, and callers receive a deep clone of the
   * cached instance. If {@code parseDomImpl} throws a {@link DOMException}, the
   * document built by {@link #errorDom(DOMException)} is returned instead; that
   * document is neither normalized nor cached.
   *
   * @param source markup to parse
   * @return the parsed and normalized document, or the error document on a DOMException
   * @throws GadgetException with code INTERNAL_SERVER_ERROR when {@code parseDomImpl}
   *         throws a NullPointerException; may also propagate from {@code parseDomImpl}
   *         or from the script re-processing step, both of which declare it
   */
  public Document parseDom(String source) throws GadgetException {
    Document document = null;
    String key = null;
    // Avoid checksum overhead if we aren't caching.
    boolean shouldCache = shouldCache();
    if (shouldCache) {
      // TODO - Consider using the source if it's under a certain size
      // NOTE(review): getBytes() encodes with the platform default charset, so the
      // cache key depends on that setting - confirm this is acceptable.
      key = HashUtil.checksum(source.getBytes());
      document = documentCache.getElement(key);
    }

    if (document == null) {
      // Cache miss, or caching disabled: parse and normalize from scratch.
      try {
        document = parseDomImpl(source);
      } catch (DOMException e) {
        // DOMException is a RuntimeException. Hand back a document describing the
        // error; this path skips both normalization and caching.
        document = errorDom(e);
        HtmlSerialization.attach(document, serializerProvider.get(), source);
        return document;
      } catch (NullPointerException e) {
        throw new GadgetException(GadgetException.Code.INTERNAL_SERVER_ERROR,
                                  "Caught exception in parseDomImpl", e);
      }

      HtmlSerialization.attach(document, serializerProvider.get(), source);

      Node html = document.getDocumentElement();

      Node head = null;
      Node body = null;
      // Stray (non-head, non-body) children, bucketed by where they were seen.
      LinkedList<Node> beforeHead = Lists.newLinkedList();
      LinkedList<Node> beforeBody = Lists.newLinkedList();

      // Detach every child of the document element and classify it. When the loop
      // finishes the document element has no children left.
      while (html.hasChildNodes()) {
        Node child = html.removeChild(html.getFirstChild());
        if (child.getNodeType() == Node.ELEMENT_NODE &&
            "head".equalsIgnoreCase(child.getNodeName())) {
          if (head == null) {
            head = child;
          } else {
            // Concatenate <head> elements together.
            transferChildren(head, child);
          }
        } else if (child.getNodeType() == Node.ELEMENT_NODE &&
                   "body".equalsIgnoreCase(child.getNodeName())) {
          if (body == null) {
            body = child;
          } else {
            // Concatenate <body> elements together.
            transferChildren(body, child);
          }
        } else if (head == null) {
          // No <head> seen yet. Note this branch also wins when a <body> has
          // already been seen but a <head> has not.
          beforeHead.add(child);
        } else if (body == null) {
          beforeBody.add(child);
        } else {
          // Both <head> and <body> are present. Append to tail of <body>.
          body.appendChild(child);
        }
      }

      // Ensure head tag exists
      if (head == null) {
        // Without a <head>, every stray node was collected in beforeHead (and
        // beforeBody is empty). Those nodes belong in <body>, so switch the lists.
        LinkedList<Node> temp = beforeBody;
        beforeBody = beforeHead;
        beforeHead = temp;

        // Add as first element (the document element is empty at this point).
        head = document.createElement("head");
        html.insertBefore(head, html.getFirstChild());
      } else {
        // Re-append head node.
        html.appendChild(head);
      }

      // Ensure body tag exists.
      if (body == null) {
        // Add immediately after head.
        body = document.createElement("body");
        html.insertBefore(body, head.getNextSibling());
      } else {
        // Re-append body node.
        html.appendChild(body);
      }

      // Leftovers: nodes before the first <head> node found and the first <body> node found.
      // Prepend beforeHead to the front of <head>, and beforeBody to beginning of <body>,
      // in the order they were found in the document.
      prependToNode(head, beforeHead);
      prependToNode(body, beforeBody);

      // One exception. <style>/<link rel="stylesheet"> nodes from <body> end up at the end of <head>,
      // since doing so is HTML compliant and can never break rendering due to ordering concerns.
      // Only direct children of <body> are considered. They are collected first and
      // moved afterwards so the NodeList is not modified while it is being indexed.
      LinkedList<Node> styleNodes = Lists.newLinkedList();
      NodeList bodyKids = body.getChildNodes();
      for (int i = 0; i < bodyKids.getLength(); ++i) {
        Node bodyKid = bodyKids.item(i);
        if (bodyKid.getNodeType() == Node.ELEMENT_NODE &&
            isStyleElement((Element)bodyKid)) {
          styleNodes.add(bodyKid);
        }
      }

      for (Node styleNode : styleNodes) {
        head.appendChild(body.removeChild(styleNode));
      }

      // Finally, reprocess all script nodes for OpenSocial purposes, as these
      // may be interpreted (rightly, from the perspective of HTML) as containing text only.
      reprocessScriptForOpenSocial(html);

      if (shouldCache) {
        documentCache.addElement(key, document);
      }
    }

    if (shouldCache) {
      // Return a deep copy so the instance held by the cache is never handed out.
      Document copy = (Document)document.cloneNode(true);
      HtmlSerialization.copySerializer(document, copy);
      return copy;
    }
    return document;
  }

  /**
   * Moves every child of {@code from} to the end of {@code to}, keeping their order.
   *
   * @param to node that receives the children
   * @param from node that is emptied
   */
  protected void transferChildren(Node to, Node from) {
    for (Node kid = from.getFirstChild(); kid != null; kid = from.getFirstChild()) {
      Node detached = from.removeChild(kid);
      to.appendChild(detached);
    }
  }

  /**
   * Inserts the listed nodes at the front of {@code to}, preserving list order.
   * The list is consumed: it is empty when this method returns.
   *
   * @param to node that receives the new leading children
   * @param from nodes to insert, in the order they should appear
   */
  protected void prependToNode(Node to, LinkedList<Node> from) {
    // Walk the list back to front; each node becomes the new first child, so the
    // original front-to-back order is restored once the list is drained.
    while (from.size() > 0) {
      Node tail = from.removeLast();
      Node currentFirst = to.getFirstChild();
      to.insertBefore(tail, currentFirst);
    }
  }

  /**
   * Tells whether an element carries styling: either a {@code style} element, or a
   * {@code link} element whose {@code rel} is "stylesheet" or whose {@code type}
   * contains "css". Tag and attribute comparisons ignore case.
   *
   * @param elem element to classify
   * @return true when the element is considered a style element
   */
  private boolean isStyleElement(Element elem) {
    String tagName = elem.getNodeName();
    if ("style".equalsIgnoreCase(tagName)) {
      return true;
    }
    if (!"link".equalsIgnoreCase(tagName)) {
      return false;
    }
    if ("stylesheet".equalsIgnoreCase(elem.getAttribute("rel"))) {
      return true;
    }
    String linkType = elem.getAttribute("type").toLowerCase();
    return linkType.contains("css");
  }

  /**
   * Parses a piece of markup and appends copies of the resulting nodes to the
   * supplied node.
   *
   * When caching is enabled the parsed fragment is stored under a checksum of the
   * source and reused on later calls. If the underlying parse throws a
   * {@link DOMException}, a text node describing the error is appended to
   * {@code result} instead and nothing is cached.
   *
   * @param source markup to be parsed
   * @param result Node to append results to
   * @throws GadgetException propagated from the fragment parser or from script re-processing
   */
  public void parseFragment(String source, Node result) throws GadgetException {
    final boolean cacheEnabled = shouldCache();
    String cacheKey = null;
    if (cacheEnabled) {
      cacheKey = HashUtil.checksum(source.getBytes());
      DocumentFragment hit = fragmentCache.getElement(cacheKey);
      if (hit != null) {
        // Served from cache: copy its nodes into the destination and finish.
        copyFragment(hit, result);
        return;
      }
    }

    DocumentFragment parsed;
    try {
      parsed = parseFragmentImpl(source);
    } catch (DOMException e) {
      // DOMException is unchecked; report it inline rather than propagating.
      appendParseException(result, e);
      return;
    }

    reprocessScriptForOpenSocial(parsed);
    if (cacheEnabled) {
      fragmentCache.addElement(cacheKey, parsed);
    }
    copyFragment(parsed, result);
  }

  /**
   * Appends deep copies of the fragment's children to {@code dest}. The fragment
   * itself is left untouched, so it can safely be kept in the fragment cache.
   *
   * @param source fragment whose children are copied
   * @param dest node that receives the copies
   */
  private void copyFragment(DocumentFragment source, Node dest) {
    // getOwnerDocument() returns null when the node is itself a Document; in that
    // case the destination is the document that has to import the nodes.
    Document destDoc = dest.getNodeType() == Node.DOCUMENT_NODE
        ? (Document) dest
        : dest.getOwnerDocument();
    NodeList nodes = source.getChildNodes();
    for (int i = 0; i < nodes.getLength(); i++) {
      // importNode clones into destDoc, so the source list does not shrink while indexed.
      Node clone = destDoc.importNode(nodes.item(i), true);
      dest.appendChild(clone);
    }
  }

  /**
   * Builds a minimal html/head/body document whose body holds only the text of the
   * given parse error.
   *
   * The error is echoed back so the developer who supplied the markup sees what
   * went wrong, which is more useful than an opaque error raised further
   * downstream. The method is protected so subclasses can override this behavior.
   *
   * @param e parse failure to report
   * @return a new document describing the failure
   */
  protected Document errorDom(DOMException e) {
    Document errorDoc = documentFactory.createDocument(null, null, null);

    // Body: just the error text.
    Node bodyNode = errorDoc.createElement("body");
    appendParseException(bodyNode, e);

    // Assemble html -> (head, body) and attach it as the document element.
    Node htmlNode = errorDoc.createElement("html");
    htmlNode.appendChild(errorDoc.createElement("head"));
    htmlNode.appendChild(bodyNode);
    errorDoc.appendChild(htmlNode);
    return errorDoc;
  }

  /**
   * Appends a text node of the form "HTML_PARSE_ERROR code: exception" to the given node.
   *
   * @param node node that receives the error text
   * @param e parse failure whose string form is included
   */
  private void appendParseException(Node node, DOMException e) {
    Document owner = node.getOwnerDocument();
    String errorText = GadgetException.Code.HTML_PARSE_ERROR.toString() + ": " + e.toString();
    node.appendChild(owner.createTextNode(errorText));
  }

  /**
   * Reports whether parse results should be cached.
   *
   * @return true when a document cache has been set and its capacity is not zero
   */
  protected boolean shouldCache() {
    if (documentCache == null) {
      return false;
    }
    return documentCache.getCapacity() != 0;
  }

  /**
   * Walks the tree under {@code root} breadth-first and, for every script element
   * whose {@code type} attribute is a key of
   * {@code SocialDataTags.SCRIPT_TYPE_TO_OSML_TAG}, replaces its text content with
   * the elements obtained by parsing that text as a fragment.
   *
   * A script is only re-parsed when all of its children are text nodes; otherwise
   * it is left unchanged. Non-element nodes produced by the re-parse are discarded.
   *
   * @param root node whose subtree (including the node itself) is inspected
   * @throws GadgetException propagated from {@code parseFragmentImpl}
   */
  private void reprocessScriptForOpenSocial(Node root) throws GadgetException {
    LinkedList<Node> nodeQueue = Lists.newLinkedList();
    nodeQueue.add(root);
    while (!nodeQueue.isEmpty()) {
      Node next = nodeQueue.removeFirst();
      if (next.getNodeType() == Node.ELEMENT_NODE &&
          "script".equalsIgnoreCase(next.getNodeName())) {
        Attr typeAttr = (Attr)next.getAttributes().getNamedItem("type");
        if (typeAttr != null &&
            SocialDataTags.SCRIPT_TYPE_TO_OSML_TAG.get(typeAttr.getValue()) != null) {
          // The underlying parser impl may have already parsed these.
          // Only re-parse with the coalesced text children if that's all there are.
          boolean parseOs = true;
          StringBuilder sb = new StringBuilder();
          NodeList scriptKids = next.getChildNodes();
          for (int i = 0; parseOs && i < scriptKids.getLength(); ++i) {
            Node scriptKid = scriptKids.item(i);
            if (scriptKid.getNodeType() != Node.TEXT_NODE) {
              parseOs = false;
            } else {
              sb.append(scriptKid.getTextContent());
            }
          }
          if (parseOs) {
            // Parse before touching the script node, so a parser failure does not
            // leave the script stripped of its original content.
            DocumentFragment osFragment = parseFragmentImpl(sb.toString());

            // Clean out the script node.
            while (next.hasChildNodes()) {
              next.removeChild(next.getFirstChild());
            }

            Document ownerDoc = next.getOwnerDocument();
            while (osFragment.hasChildNodes()) {
              Node osKid = osFragment.removeChild(osFragment.getFirstChild());
              if (osKid.getNodeType() != Node.ELEMENT_NODE) {
                // Only elements are kept; skip the rest without adopting them.
                continue;
              }
              // adoptNode may return null when the operation fails (for example
              // for a node from a different DOM implementation); copy it instead.
              Node adopted = ownerDoc.adoptNode(osKid);
              if (adopted == null) {
                adopted = ownerDoc.importNode(osKid, true);
              }
              next.appendChild(adopted);
            }
          }
        }
      }

      // Enqueue children for inspection. This includes any elements just added
      // to a re-parsed script node.
      NodeList children = next.getChildNodes();
      for (int i = 0; i < children.getLength(); ++i) {
        nodeQueue.add(children.item(i));
      }
    }
  }

  /**
   * Parses a complete piece of HTML into a {@link Document}. Called by
   * {@link #parseDom(String)}, which then normalizes the head/body structure of
   * the result. A {@link org.w3c.dom.DOMException} thrown from here makes
   * {@code parseDom} return an error document, and a NullPointerException thrown
   * from here is rethrown by {@code parseDom} as a GadgetException.
   *
   * TODO: remove the need for parseDomImpl as a parsing method. Gadget HTML is
   * tag soup handled in custom fashion, or is a legitimate fragment. In either case,
   * we can simply use the fragment parsing implementation and patch up in higher-level calls.
   *
   * @param source a piece of HTML
   * @return a Document parsed from the HTML
   * @throws GadgetException under implementation-specific conditions
   */
  protected abstract Document parseDomImpl(String source)
      throws GadgetException;

  /**
   * Parses a snippet of markup into a {@link DocumentFragment}. Used by
   * {@link #parseFragment(String, Node)} and when re-parsing the text content of
   * OpenSocial script elements.
   *
   * @param source a snippet of HTML markup
   * @return a DocumentFragment containing the parsed elements
   * @throws GadgetException under implementation-specific conditions
   */
  protected abstract DocumentFragment parseFragmentImpl(String source)
      throws GadgetException;

  /**
   * Provider used until {@code setSerializerProvider} supplies another one; every
   * call hands out a new {@link DefaultHtmlSerializer}.
   */
  private static class DefaultSerializerProvider implements Provider<HtmlSerializer> {
    public HtmlSerializer get() {
      HtmlSerializer freshSerializer = new DefaultHtmlSerializer();
      return freshSerializer;
    }
  }
}