/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 * Started from
 * https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/htmlextractor/src/
 * main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
 */
package com.digitalpebble.stormcrawler.parse;

import java.util.HashMap;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.html.dom.HTMLDocumentImpl;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;

/**
 * Static helpers that copy a Jsoup node tree into a W3C DOM tree.
 *
 * TODO use org.jsoup.helper.W3CDom instead?
 *
 * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
 */
public final class JSoupDOMBuilder {

    /**
     * Restrict instantiation
     */
    private JSoupDOMBuilder() {
    }

    /**
     * Converts the supplied Jsoup document into a newly created W3C DOM
     * document.
     *
     * @param jsoupDocument
     *            The Jsoup document to convert.
     * @return A W3C Document.
     * @throws RuntimeException
     *             wrapping the {@link ParserConfigurationException} if no
     *             {@link DocumentBuilder} can be obtained.
     */
    public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {
        Document document;
        try {
            /* Obtain the document builder for the configured XML parser. */
            DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory
                    .newInstance();
            DocumentBuilder docBuilder = docBuilderFactory
                    .newDocumentBuilder();

            /* Create a document to contain the content. */
            document = docBuilder.newDocument();
            createDOM(jsoupDocument, document, document,
                    new HashMap<String, String>());
        } catch (ParserConfigurationException pce) {
            throw new RuntimeException(pce);
        }
        return document;
    }

    /**
     * Converts the supplied Jsoup document into a {@link DocumentFragment}
     * owned by a fresh {@link HTMLDocumentImpl} whose error checking has been
     * switched off.
     *
     * @param jsoupDocument
     *            The Jsoup document to convert.
     * @return A fragment holding the converted content.
     */
    public static DocumentFragment jsoup2HTML(
            org.jsoup.nodes.Document jsoupDocument) {
        HTMLDocumentImpl htmlDoc = new HTMLDocumentImpl();
        htmlDoc.setErrorChecking(false);
        DocumentFragment fragment = htmlDoc.createDocumentFragment();
        createDOM(jsoupDocument, fragment, htmlDoc,
                new HashMap<String, String>());
        return fragment;
    }

    /**
     * The internal helper that copies content from the specified Jsoup
     * {@code Node} into a W3C {@link Node}, recursing into child nodes.
     * <p>
     * Elements are always copied. Text, comment and data nodes are copied
     * only when {@code out} is not itself a {@link Document}. Any other Jsoup
     * node type is ignored.
     *
     * @param node
     *            The Jsoup node containing the content to copy to the specified
     *            W3C {@link Node}.
     * @param out
     *            The W3C {@link Node} that receives the DOM content.
     * @param doc
     *            The W3C {@link Document} used as factory for the nodes
     *            appended to {@code out}.
     * @param ns
     *            Namespace prefixes seen so far, mapped to their URI. The map
     *            is updated in place whenever an {@code xmlns:prefix} attribute
     *            is encountered.
     */
    public static void createDOM(org.jsoup.nodes.Node node, Node out,
            Document doc, Map<String, String> ns) {

        // The Document test must come first: a Jsoup Document is also an
        // Element, and we only want to descend into it, not copy it.
        if (node instanceof org.jsoup.nodes.Document) {
            org.jsoup.nodes.Document d = (org.jsoup.nodes.Document) node;
            for (org.jsoup.nodes.Node n : d.childNodes()) {
                createDOM(n, out, doc, ns);
            }

        } else if (node instanceof org.jsoup.nodes.Element) {
            org.jsoup.nodes.Element e = (org.jsoup.nodes.Element) node;
            org.w3c.dom.Element _e = doc.createElement(e.tagName());
            out.appendChild(_e);

            copyAttributes(e, _e, ns);

            for (org.jsoup.nodes.Node n : e.childNodes()) {
                createDOM(n, _e, doc, ns);
            }

        } else if (node instanceof org.jsoup.nodes.TextNode) {
            org.jsoup.nodes.TextNode t = (org.jsoup.nodes.TextNode) node;
            if (!(out instanceof Document)) {
                out.appendChild(doc.createTextNode(t.text()));
            }

        } else if (node instanceof org.jsoup.nodes.Comment) {
            if (!(out instanceof Document)) {
                org.jsoup.nodes.Comment comment = (org.jsoup.nodes.Comment) node;
                out.appendChild(doc.createComment(comment.getData()));
            }

        } else if (node instanceof org.jsoup.nodes.DataNode) {
            if (!(out instanceof Document)) {
                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) node;
                String whole = sourceData.getWholeData();
                out.appendChild(doc.createTextNode(whole));
            }
        }
    }

    /**
     * Copies the attributes of a Jsoup element onto a W3C element.
     * <p>
     * A bare {@code xmlns} attribute is omitted. {@code xmlns:prefix}
     * attributes are recorded in {@code ns} and copied. For any other
     * prefixed attribute whose prefix is neither {@code xml} nor present in
     * {@code ns}, the colons in the name are replaced by underscores.
     * Attributes whose name the target rejects as containing illegal
     * characters are skipped so that one malformed attribute in the markup
     * does not abort the whole conversion.
     *
     * @param source
     *            The Jsoup element to read the attributes from.
     * @param target
     *            The W3C element receiving the attributes.
     * @param ns
     *            Namespace prefixes seen so far; updated in place.
     */
    private static void copyAttributes(org.jsoup.nodes.Element source,
            org.w3c.dom.Element target, Map<String, String> ns) {
        for (org.jsoup.nodes.Attribute a : source.attributes()) {
            String attName = a.getKey();
            // omit xhtml namespace
            if (attName.equals("xmlns")) {
                continue;
            }
            String attPrefix = getNSPrefix(attName);
            if (attPrefix != null) {
                if (attPrefix.equals("xmlns")) {
                    ns.put(getLocalName(attName), a.getValue());
                } else if (!attPrefix.equals("xml")) {
                    String namespace = ns.get(attPrefix);
                    if (namespace == null) {
                        // fix attribute names looking like qnames
                        attName = attName.replace(':', '_');
                    }
                }
            }
            try {
                target.setAttribute(attName, a.getValue());
            } catch (org.w3c.dom.DOMException ignored) {
                // Tag-soup HTML can yield attribute names that are not legal
                // XML names; drop just that attribute. Anything other than
                // an illegal-character complaint is unexpected, so rethrow.
                if (ignored.code != org.w3c.dom.DOMException.INVALID_CHARACTER_ERR) {
                    throw ignored;
                }
            }
        }
    }

    // some hacks for handling namespace in jsoup2DOM conversion

    /**
     * Returns the part of {@code name} preceding its first colon.
     *
     * @param name
     *            An attribute name, possibly {@code null}.
     * @return The prefix, or {@code null} if {@code name} is {@code null},
     *         contains no colon, or starts with one.
     */
    private static String getNSPrefix(String name) {
        if (name != null) {
            int pos = name.indexOf(':');
            if (pos > 0) {
                return name.substring(0, pos);
            }
        }
        return null;
    }

    /**
     * Returns the part of {@code name} following its last colon.
     *
     * @param name
     *            An attribute name, possibly {@code null}.
     * @return The local name, or {@code name} itself if it is {@code null},
     *         contains no colon, or its only colon is the first character.
     */
    private static String getLocalName(String name) {
        if (name != null) {
            int pos = name.lastIndexOf(':');
            if (pos > 0) {
                return name.substring(pos + 1);
            }
        }
        return name;
    }
}