package org.smartly.packages.htmlparser.impl.vtools;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.smartly.Smartly;
import org.smartly.commons.lang.CharEncoding;
import org.smartly.commons.network.URLUtils;
import org.smartly.commons.util.PathUtils;
import org.smartly.commons.util.StringUtils;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * DOM parser.
 * <p>
 * Downloads the page addressed by the "url" request parameter, parses it with
 * jsoup and exposes helpers to query the resulting DOM and to rewrite it
 * (remove elements, make image paths absolute, route links through the
 * servlet this tool was created for).
 */
public class DocumentTool {

    public static final String NAME = "document";

    private static final String PARAM_URL = "url";
    private static final String PARAM_CHARSET = "charset";

    // Lazily initialised default charset, shared by all instances.
    private static String __CHARSET;

    private final Map<String, String> _requestParams;
    private final String _servletPath;
    private final String _charset;
    private final URL _url;
    private final String _protocol;
    private final String _host; // http://www.google.it
    private final int _port;
    private final String _path;
    private final org.jsoup.nodes.Document _document;

    private String _title;
    private String __domain; // lazily built by getDomain()

    /**
     * Fetches and parses the document addressed by the "url" parameter.
     *
     * @param servletPath path of the servlet that rewritten ("mobilized") links point to
     * @param params      request parameters; "url" is required, "charset" is optional
     *                    (the configured default is used when it is missing or unsupported).
     *                    The map is copied: this tool never modifies the caller's instance.
     * @throws IllegalArgumentException if the "url" parameter is missing or is not a valid URL
     * @throws Exception                if the page cannot be downloaded or parsed
     */
    public DocumentTool(final String servletPath, final Map<String, String> params) throws Exception {
        // Private copy: mobilizeUrl() overwrites the "url"/"charset" entries for every
        // link it rewrites, and that must not leak into the caller's map.
        _requestParams = new LinkedHashMap<String, String>(params);
        _servletPath = servletPath;

        final String charset = params.get(PARAM_CHARSET);
        _charset = CharEncoding.isSupported(charset) ? charset : getDefaultCharset();

        final String rawUrl = params.get(PARAM_URL);
        _url = getUrl(rawUrl);
        if (null == _url) {
            // fail with a meaningful message instead of a NullPointerException below
            throw new IllegalArgumentException(
                    "Missing or malformed '" + PARAM_URL + "' parameter: " + rawUrl);
        }
        _protocol = _url.getProtocol();
        _host = _url.getHost();
        _port = _url.getPort();
        _path = _url.getPath();

        // creates document
        _document = loadDocument(_url, _charset);
        _title = _document.title();
    }

    /**
     * @return the name of this tool ({@link #NAME})
     */
    public String getName() {
        return NAME;
    }

    /**
     * @return the outer HTML of the parsed document
     */
    @Override
    public String toString() {
        return null != _document ? _document.outerHtml() : super.toString();
    }

    //-- DOM --//

    /**
     * @return the charset name used to parse the document
     */
    public String getCharset() {
        return _charset;
    }

    /**
     * @return the document title, or an empty string if there is none
     */
    public String getTitle() {
        return null != _title ? _title : "";
    }

    /**
     * @return the underlying jsoup document
     */
    public org.jsoup.nodes.Document getDocument() {
        return _document;
    }

    /**
     * @return the document's body element
     */
    public Element getBody() {
        return null != _document ? _document.body() : null;
    }

    /**
     * @return the document's head element
     */
    public Element getHead() {
        return null != _document ? _document.head() : null;
    }

    /**
     * Runs a selector query against the whole document.
     *
     * @param selector jsoup selector expression
     * @return the matching elements (possibly empty)
     */
    public Elements select(final String selector) {
        return null != _document ? _document.select(selector) : new Elements(0);
    }

    /**
     * Runs a selector query against an element.
     *
     * @param element  root of the search; when null the whole document is searched
     * @param selector jsoup selector expression
     * @return the matching elements (possibly empty)
     */
    public Elements select(final Element element, final String selector) {
        return null != element ? element.select(selector) : select(selector);
    }

    //-- Path --//

    /**
     * @return the URL of the parsed document
     */
    public String getUrl() {
        return _url.toString();
    }

    /**
     * Returns "protocol://host[:port]" for the parsed document. The port is
     * appended only when the URL declares one explicitly and it differs from
     * the default port of the URL's protocol.
     *
     * @return the domain part of the document URL, without trailing slash
     */
    public String getDomain() {
        if (null == __domain) {
            final StringBuilder result = new StringBuilder();
            result.append(StringUtils.hasText(_protocol) ? _protocol : "http");
            result.append("://").append(_host);
            // Compare with the protocol's own default port rather than a hard-coded 80,
            // so "https://host:443" stays clean and "https://host:80" keeps its port.
            if (_port > 0 && _port != _url.getDefaultPort()) {
                result.append(':').append(_port);
            }
            __domain = result.toString();
        }
        return __domain;
    }

    /**
     * @return the path component of the document URL
     */
    public String getPath() {
        return _path;
    }

    /**
     * Tells whether an absolute URL belongs to the same domain as the document.
     *
     * @param path absolute URL to test
     * @return true if {@code path} is {@link #getDomain()} itself or continues it
     *         with a path, query or fragment; false otherwise (including null)
     */
    public boolean isInternal(final String path) {
        if (null == path) {
            return false;
        }
        final String domain = this.getDomain();
        if (!path.startsWith(domain)) {
            return false;
        }
        if (path.length() == domain.length()) {
            return true;
        }
        // The character after the domain must end the authority part, otherwise
        // "http://site.com.evil.org" would be taken as part of "http://site.com".
        final char next = path.charAt(domain.length());
        return next == '/' || next == '?' || next == '#';
    }

    //-- Transforms --//

    /**
     * Removes from the document every element matching the selector.
     *
     * @param selector jsoup selector expression
     */
    public void remove(final String selector) {
        this.removeElements(null, selector);
    }

    /**
     * Removes every element matching the selector below the given element.
     *
     * @param element  root of the search; when null the whole document is searched
     * @param selector jsoup selector expression
     */
    public void remove(final Element element, final String selector) {
        this.removeElements(element, selector);
    }

    /**
     * Removes inline style blocks and stylesheet links from the whole document.
     */
    public void removeStyles() {
        this.removeStyles(null);
    }

    /**
     * Removes "style" elements and "link[rel=stylesheet]" elements.
     *
     * @param element root of the search; when null the whole document is searched
     */
    public void removeStyles(final Element element) {
        this.removeElements(element, "style");
        this.removeElements(element, "link[rel=stylesheet]");
    }

    /**
     * Check all relative image urls (i.e. "./images/image.png")
     * and change them in absolute urls (i.e. "http://www.mysite.com/images/image.png").
     * Only "img" elements that declare a "src" attribute are touched.
     */
    public void absolutizeImagePaths() {
        for (final Element image : this.select("img")) {
            if (image.hasAttr("src")) {
                image.attr("src", this.resolveUrl(image.attr("src")));
            }
        }
    }

    /**
     * Rewrites links so that they pass through the servlet, leaving links to
     * other domains untouched.
     */
    public void mobilizeLinks() {
        this.mobilizeLinks(true);
    }

    /**
     * Rewrites the "href" of every "a" element so that it points to the servlet,
     * carrying the original target as the "url" parameter.
     *
     * @param excludeExternalLinks when true, links to other domains are left untouched
     */
    public void mobilizeLinks(final boolean excludeExternalLinks) {
        for (final Element link : this.select("a")) {
            if (link.hasAttr("href")) {
                link.attr("href", this.mobilizeUrl(link.attr("href"), excludeExternalLinks));
            }
        }
    }

    // ------------------------------------------------------------------------
    //                      p r i v a t e
    // ------------------------------------------------------------------------

    /**
     * Removes all elements matching the selector under the given element
     * (or under the whole document when the element is null).
     */
    private void removeElements(final Element element, final String selector) {
        final Elements elements = this.select(element, selector);
        if (null != elements && elements.size() > 0) {
            elements.remove();
        }
    }

    /**
     * Turns a relative path into an absolute URL.
     * <ul>
     * <li>"//host/..." (scheme-relative) gets the document's protocol prepended;</li>
     * <li>paths starting with "." or "/" are appended to {@link #getDomain()} and
     * passed through PathUtils.resolve (presumably normalising "." and ".."
     * segments - confirm against PathUtils);</li>
     * <li>anything else (absolute URLs, "#anchor", "mailto:", empty or null) is
     * returned unchanged.</li>
     * </ul>
     * NOTE(review): relative paths are resolved against the domain root, not
     * against the directory of the current document.
     */
    private String resolveUrl(final String path) {
        if (!StringUtils.hasText(path)) {
            return path;
        }
        if (path.startsWith("//")) {
            // scheme-relative URL: it already names its own host
            return (StringUtils.hasText(_protocol) ? _protocol : "http") + ":" + path;
        }
        if (path.startsWith(".") || path.startsWith("/")) {
            final String compound = PathUtils.concat(this.getDomain(), path);
            return PathUtils.resolve(compound);
        }
        return path;
    }

    /**
     * Builds the servlet URL that proxies the given link target.
     *
     * @param path            original href value
     * @param excludeExternal when true, targets outside the document's domain
     *                        are returned unchanged
     * @return the rewritten link, or {@code path} itself when it must not be rewritten
     */
    private String mobilizeUrl(final String path, final boolean excludeExternal) {
        if (!StringUtils.hasText(path)) {
            return path;
        }
        // Resolve before testing: a relative link is internal by definition, but
        // only its absolute form starts with the domain.
        final String url = this.resolveUrl(path);
        if (excludeExternal && !this.isInternal(url)) {
            return path;
        }
        // _requestParams is this instance's own copy, safe to overwrite per link
        _requestParams.put(PARAM_URL, url);
        _requestParams.put(PARAM_CHARSET, this.getCharset());
        return PathUtils.addURIParameters(_servletPath, _requestParams, true);
    }

    // --------------------------------------------------------------------
    //               S T A T I C
    // --------------------------------------------------------------------

    /**
     * Downloads and parses the page, always releasing the network stream.
     */
    private static org.jsoup.nodes.Document loadDocument(final URL url, final String charset) throws Exception {
        // 5000 is presumably a timeout in milliseconds - confirm against URLUtils
        final InputStream is = URLUtils.getInputStream(url, 5000, URLUtils.TYPE_HTML);
        try {
            return Jsoup.parse(is, charset, url.toString());
        } finally {
            closeQuietly(is);
        }
    }

    /**
     * Closes the stream, ignoring a null argument and any failure on close.
     */
    private static void closeQuietly(final InputStream is) {
        if (null != is) {
            try {
                is.close();
            } catch (IOException ignored) {
                // nothing useful can be done; must not mask an exception from parsing
            }
        }
    }

    /**
     * Returns the charset from the "mobilizer.charset" configuration entry when it
     * has text and is supported, otherwise the CharEncoding default. The value is
     * computed once and cached in a static field.
     */
    private static String getDefaultCharset() {
        if (null == __CHARSET) {
            final String charset = Smartly.getConfiguration().getString("mobilizer.charset");
            __CHARSET = StringUtils.hasText(charset) && CharEncoding.isSupported(charset)
                    ? charset
                    : CharEncoding.getDefault();
        }
        return __CHARSET;
    }

    /**
     * Parses a URL string. A string that is not a valid URL but starts with
     * "www" is retried with "http://" prepended.
     *
     * @return the parsed URL, or null if the string cannot be turned into one
     */
    private static URL getUrl(final String url) {
        try {
            return new URL(url);
        } catch (MalformedURLException ignored) {
            // also raised for a null spec; fall through to the "www" shortcut
            if (StringUtils.hasText(url) && url.startsWith("www")) {
                return getUrl("http://".concat(url));
            }
        }
        // return default
        return null;
    }
}