/* * Copyright (C) 2012 Sebastian Straub <sebastian-straub@gmx.net> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.nx42.wotcrawler.xml; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.net.URL; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.SimpleXmlSerializer; import org.htmlcleaner.TagNode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Parses HTML files * * @author Sebastian Straub <sebastian-straub@gmx.net> */ public class Parser { private static final Logger log = LoggerFactory.getLogger(Parser.class); // -------------------- html parsing -------------------- /** * Parses a HTML document, transforms it into valid XML using the * htmlcleaner-library and returns it as org.w3c.dom.Document * @param file the html file to parse * @return org.w3c.dom.Document representation of the cleaned HTML file * @throws IOException cannot access the file * @throws ParserConfigurationException parser configuration invalid * @throws SAXException error while parsing (usually invalid xml) */ public static Document parseHTML(File file) throws IOException, ParserConfigurationException, SAXException { HtmlCleaner cleaner = new HtmlCleaner(); TagNode tagNode = cleaner.clean(file); String cleanHTML = new SimpleXmlSerializer(cleaner.getProperties()).getAsString(tagNode); return buildDOM(cleanHTML); } /** * Parses a HTML document, transforms it into valid XML using the * htmlcleaner-library and returns it as org.w3c.dom.Document * @param url the url where the document can be retrieved * @return org.w3c.dom.Document representation of the cleaned HTML file * @throws IOException cannot access the file * @throws ParserConfigurationException parser configuration invalid * @throws SAXException error while parsing (usually invalid xml) */ public static Document parseHTML(URL url) throws IOException, ParserConfigurationException, SAXException { HtmlCleaner cleaner = new HtmlCleaner(); TagNode tagNode = cleaner.clean(url); String cleanHTML = new SimpleXmlSerializer(cleaner.getProperties()).getAsString(tagNode); return buildDOM(cleanHTML); } // -------------------- DOM operations -------------------- /** * Creates a org.w3c.dom.Document from a given XML String * @param cleanedHTML a valid xml document as string * @return org.w3c.dom.Document representation of the string * @throws ParserConfigurationException parser configuration invalid * @throws SAXException error while parsing (usually invalid xml) * @throws IOException this should not occur, necessary because the String is * read as InputSource */ protected static Document buildDOM(String cleanedHTML) throws ParserConfigurationException, SAXException, IOException { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); dbf.setNamespaceAware(true); DocumentBuilder builder = dbf.newDocumentBuilder(); return builder.parse(new InputSource(new StringReader(cleanedHTML))); } /** * Creates a simple String representation of a given DOM tree * @param node this node and all it's descendants will be printed * @return current DOM as XML String */ public static String domToString(Node node) { try { TransformerFactory transFactory = TransformerFactory.newInstance(); Transformer transformer = transFactory.newTransformer(); StringWriter buffer = new StringWriter(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.transform(new DOMSource(node), new StreamResult(buffer)); return buffer.toString(); } catch (TransformerException ex) { log.error("Error while serializing DOM tree", ex); } return "xml transformation failed!"; } }