package org.smartly.packages.htmlparser.impl; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; import org.jsoup.select.Elements; import org.smartly.Smartly; import org.smartly.commons.logging.Level; import org.smartly.commons.logging.Logger; import org.smartly.commons.logging.util.LoggingUtils; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.net.URL; /** * Depends from jsoup HTML parser. */ public class HtmlParser { private static final String CHARSET = Smartly.getCharset(); private static final int TIMEOUT = 5000; private final Document _document; public HtmlParser(final URL url) { _document = getDocument(url, TIMEOUT); } public HtmlParser(final Document document) { _document = document; } public HtmlParser(final String html) { _document = getDocument(html, ""); } public HtmlParser(final String html, final String baseURI) { _document = getDocument(html, baseURI); } public HtmlParser(final InputStream is) { _document = getDocument(is, ""); } // -------------------------------------------------------------------- // p u b l i c // -------------------------------------------------------------------- public Document getDocument() { return _document; } public Element getBody() { return null != _document ? _document.body() : null; } public Element getHead() { return null != _document ? _document.head() : null; } public Elements select(final String cssQuery) { return null != _document ? _document.select(cssQuery) : new Elements(0); } public Element selectFirst(final String cssQuery) { if (null != _document) { final Elements elements = _document.select(cssQuery); return null != elements && !elements.isEmpty() ? elements.first() : null; } return null; } public String selectAsString(final String cssQuery) { final Elements result = this.select(cssQuery); return result.outerHtml(); } public String remove(final String[] cssQueries) { for (final String cssQuery : cssQueries) { this.removeElements(cssQuery); } return null != _document ? _document.outerHtml() : ""; } public String remove(final String cssQuery) { this.removeElements(cssQuery); return null != _document ? _document.outerHtml() : ""; } public Document cleanBasic() { final Cleaner cleaner = new Cleaner(Whitelist.basic()); return cleaner.clean(_document); } public String cleanBasicAsString() { final Cleaner cleaner = new Cleaner(Whitelist.basic()); return cleaner.clean(_document).outerHtml(); } public Document cleanRelaxed() { final Cleaner cleaner = new Cleaner(Whitelist.relaxed()); return cleaner.clean(_document); } public String cleanRelaxedAsString() { final Cleaner cleaner = new Cleaner(Whitelist.relaxed()); return cleaner.clean(_document).outerHtml(); } // -------------------------------------------------------------------- // p r i v a t e // -------------------------------------------------------------------- private Logger getLogger() { return LoggingUtils.getLogger(this); } private Elements removeElements(final String cssQuery) { final Elements elements = _document.select(cssQuery); final Elements removed = elements.remove(); return removed; } // -------------------------------------------------------------------- // S T A T I C // -------------------------------------------------------------------- private static Document getDocument(final URL uri, final int timeout) { try { return Jsoup.parse(uri, timeout); } catch (Throwable t) { LoggingUtils.getLogger(HtmlParser.class).log(Level.SEVERE, null, t); } return null; } private static Document getDocument(final String html, final String baseUri) { try { return Jsoup.parse( new ByteArrayInputStream(html.getBytes()), CHARSET, baseUri ); } catch (Throwable t) { LoggingUtils.getLogger(HtmlParser.class).log(Level.SEVERE, null, t); } return null; } private static Document getDocument(final InputStream is, final String baseUri) { try { return Jsoup.parse( is, CHARSET, baseUri ); } catch (Throwable t) { LoggingUtils.getLogger(HtmlParser.class).log(Level.SEVERE, null, t); } return null; } public static Elements select(final String html, final String cssQuery) { final HtmlParser parser = new HtmlParser(html); return parser.select(cssQuery); } public static String remove(final String html, final String cssQuery) { final HtmlParser parser = new HtmlParser(html); return parser.remove(cssQuery); } public static String remove(final String html, final String[] cssQuery) { final HtmlParser parser = new HtmlParser(html); return parser.remove(cssQuery); } }