/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package org.apache.shindig.gadgets.parse;
import org.apache.shindig.common.cache.Cache;
import org.apache.shindig.common.cache.CacheProvider;
import org.apache.shindig.common.util.HashUtil;
import org.apache.shindig.common.xml.DomUtil;
import org.apache.shindig.gadgets.GadgetException;
import org.apache.shindig.gadgets.parse.nekohtml.NekoSimplifiedHtmlParser;
import com.google.inject.ImplementedBy;
import com.google.inject.Inject;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import java.nio.charset.Charset;
/**
 * Parser for arbitrary HTML content.
 *
 * <p>Subclasses supply the actual parsing in {@link #parseDomImpl(String)};
 * this base class adds optional caching of parsed documents and guarantees
 * that the returned document has both a {@code head} and a {@code body}
 * child under its document element.
 */
@ImplementedBy(NekoSimplifiedHtmlParser.class)
public abstract class GadgetHtmlParser {

  /** Name of the cache requested from the {@link CacheProvider}. */
  public static final String PARSED_DOCUMENTS = "parsedDocuments";

  /**
   * Fixed charset used to turn the source into bytes for the cache key.
   * Using the platform default would make the key depend on JVM
   * configuration, and a lossy default (e.g. US-ASCII) maps every
   * unmappable character to '?', so distinct sources could share a key.
   */
  private static final Charset CHECKSUM_CHARSET = Charset.forName("UTF-8");

  /** Cache of parsed documents keyed by source checksum; null until injected. */
  private Cache<String, Document> documentCache;

  /**
   * Creates the parsed-document cache from the given provider.
   *
   * @param cacheProvider provider asked for the {@link #PARSED_DOCUMENTS} cache
   */
  @Inject
  public void setCacheProvider(CacheProvider cacheProvider) {
    documentCache = cacheProvider.createCache(PARSED_DOCUMENTS);
  }

  /**
   * Checks whether the content looks like a complete HTML document.
   * Only the first 100 characters are inspected, case-insensitively.
   *
   * @param content markup to inspect
   * @return true if we detect a preamble of doctype or html
   */
  protected static boolean attemptFullDocParseFirst(String content) {
    String normalized = content.substring(0, Math.min(100, content.length())).toUpperCase();
    return normalized.contains("<!DOCTYPE") || normalized.contains("<HTML");
  }

  /**
   * Parses the source into a document that always has {@code head} and
   * {@code body} elements under its document element.
   *
   * <p>When caching is enabled the parsed document is stored under a
   * checksum of the source, and a deep copy is returned instead of the
   * cached instance (presumably so callers can modify the result without
   * affecting the cached document).
   *
   * @param source markup to parse
   * @return the parsed document, or a deep copy of the cached one
   * @throws GadgetException if {@link #parseDomImpl(String)} fails
   */
  public final Document parseDom(String source) throws GadgetException {
    Document document = null;
    String key = null;
    // Avoid checksum overhead if we aren't caching
    boolean shouldCache = shouldCache();
    if (shouldCache) {
      // TODO - Consider using the source if it's under a certain size
      key = HashUtil.rawChecksum(source.getBytes(CHECKSUM_CHARSET));
      document = documentCache.getElement(key);
    }
    if (document == null) {
      document = parseDomImpl(source);
      ensureHeadAndBody(document);
      if (shouldCache) {
        documentCache.addElement(key, document);
      }
    }
    if (shouldCache) {
      // Return a deep copy rather than the cached instance itself
      Document copy = (Document) document.cloneNode(true);
      // NOTE(review): looks like this carries the serializer associated with
      // the cached document over to the copy - confirm in HtmlSerializer
      HtmlSerializer.copySerializer(document, copy);
      return copy;
    }
    return document;
  }

  /**
   * Adds a {@code head} and/or {@code body} element to the document element
   * when the parsed document lacks one.
   */
  private static void ensureHeadAndBody(Document document) {
    Node root = document.getDocumentElement();
    // Ensure head tag exists
    if (DomUtil.getFirstNamedChildNode(root, "head") == null) {
      // Add as first element
      root.insertBefore(document.createElement("head"), root.getFirstChild());
    }
    // If body not found the document was entirely empty. Create the
    // element anyway
    if (DomUtil.getFirstNamedChildNode(root, "body") == null) {
      root.appendChild(document.createElement("body"));
    }
  }

  /**
   * @return true when a cache has been injected and its reported capacity
   *     is non-zero
   */
  private boolean shouldCache() {
    return documentCache != null && documentCache.getCapacity() != 0;
  }

  /**
   * Performs the actual parse of the given markup.
   *
   * @param source markup to parse
   * @return a parsed document
   * @throws GadgetException
   */
  protected abstract Document parseDomImpl(String source) throws GadgetException;

  /**
   * Normalize head and body tags in the passed fragment before including it
   * in the document.
   *
   * <ul>
   * <li>If the fragment has an HTML child, that node becomes the document's
   *     child and nothing else is done.</li>
   * <li>Otherwise an {@code html} element is created. If the fragment has a
   *     body (with or without a head), the fragment is placed directly under
   *     {@code html}.</li>
   * <li>If there is a head but no body, the head is moved under {@code html}
   *     and the remainder of the fragment goes into a new {@code body}.</li>
   * <li>If there is neither, the whole fragment goes into a new
   *     {@code body}.</li>
   * </ul>
   *
   * @param document document that receives the normalized content
   * @param fragment parsed fragment to normalize
   */
  protected void normalizeFragment(Document document, DocumentFragment fragment) {
    Node htmlNode = DomUtil.getFirstNamedChildNode(fragment, "HTML");
    if (htmlNode != null) {
      document.appendChild(htmlNode);
      return;
    }

    Node bodyNode = DomUtil.getFirstNamedChildNode(fragment, "body");
    Node headNode = DomUtil.getFirstNamedChildNode(fragment, "head");
    // Every remaining case needs an html root element
    Node root = document.appendChild(document.createElement("html"));

    if (bodyNode != null) {
      // The fragment already carries a body (and maybe a head): use it as is
      root.appendChild(fragment);
      return;
    }

    if (headNode != null) {
      // Head without body: hoist the head, the rest is wrapped in a body below
      fragment.removeChild(headNode);
      root.appendChild(headNode);
    }
    // No body in the fragment so put what remains of it into a new body
    Node body = root.appendChild(document.createElement("body"));
    body.appendChild(fragment);
  }
}