package io.monokkel.core;

import io.monokkel.core.api.ResponseParser;
import io.monokkel.domain.PageData;
import io.monokkel.exceptions.ParseException;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.DOMBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import java.util.*;
import java.util.stream.Collectors;

import static java.lang.String.format;

/**
 * Parses an HTML response into a {@link PageData}: extracts the page title,
 * every link whose href matches one of the configured regular expressions,
 * an optional "main content" text, and a (possibly nested) map of fields
 * produced by evaluating configured XPath expressions against the document.
 */
public class HtmlTransformer implements ResponseParser {

    private static final Logger log = LoggerFactory.getLogger(HtmlTransformer.class);

    /** Regexes matched against anchor hrefs to decide which links to collect. */
    private final List<String> urlRegularExpressions;
    /** Regex matched against the value of {@link #attributeToLocateContent}; may be null (no content extraction). */
    private final String contentRetrievalExpression;
    /** Attribute name used to locate the content element; may be null (no content extraction). */
    private final String attributeToLocateContent;
    /** Field name -> XPath expression (String) or nested sub-map for nested output objects. */
    private final HashMap<String, Object> transformationField;

    /**
     * Full configuration.
     *
     * @param urlRegularExpressions      regexes selecting which {@code <a href>} links to collect
     * @param attributeToLocateContent   attribute name used to locate the content element (nullable)
     * @param contentRetrievalExpression regex matched against that attribute's value (nullable)
     * @param transformationField        nested map of field name -> XPath expression
     */
    public HtmlTransformer(final List<String> urlRegularExpressions,
                           final String attributeToLocateContent,
                           final String contentRetrievalExpression,
                           final HashMap<String, Object> transformationField) {
        this.urlRegularExpressions = urlRegularExpressions;
        this.attributeToLocateContent = attributeToLocateContent;
        this.contentRetrievalExpression = contentRetrievalExpression;
        this.transformationField = transformationField;
    }

    /**
     * Convenience constructor for configurations without content extraction.
     * Delegates to the full constructor instead of duplicating assignments.
     */
    public HtmlTransformer(final List<String> urlRegularExpressions,
                           final HashMap<String, Object> transformationField) {
        this(urlRegularExpressions, null, null, transformationField);
    }

    /**
     * Parse the response sent and extract title and urls sent in the configuration.
     *
     * @param url       Input url
     * @param response  The HTML response
     * @param timeStamp The timestamp when the page was visited
     * @return Parsed data
     * @throws ParseException if anything in the extraction pipeline fails;
     *                        the original cause is preserved
     */
    public PageData parse(final String url, final String response, final Long timeStamp) throws ParseException {
        try {
            final Document document = Jsoup.parse(response, url);
            final Set<String> urlSet = retrieveAllUrls(document);
            final String content = getContent(document);
            log.debug("Found {} urls in the document {}", urlSet.size(), url);
            final String title = extractTitle(document);
            final Map<String, Object> transformed = iterateThroughTransformationMap(document, transformationField);
            return new PageData(urlSet, url, timeStamp, response, title, content, transformed);
        } catch (Exception e) {
            log.warn("Failed to parse url {}", url, e);
            throw new ParseException(format("Failed to parse url %s", url), e);
        }
    }

    /**
     * A response is parseable when any Content-Type header mentions text/html.
     * anyMatch short-circuits, unlike the original filter().count() > 0.
     */
    @Override
    public Boolean shouldParse(final String url, final String response, final List<String> typesFromTheResponseHeader) {
        return typesFromTheResponseHeader.stream().anyMatch(typeHeader -> typeHeader.contains("text/html"));
    }

    /**
     * Evaluates every entry of the transformation map against the document.
     * String values are treated as XPath expressions; map values recurse into
     * nested output objects.
     */
    private Map<String, Object> iterateThroughTransformationMap(final Document document,
                                                                final Map<String, Object> fields) {
        return fields.entrySet().stream()
                .collect(Collectors.toMap(Map.Entry::getKey,
                        entry -> parseDocumentWithXpath(document, entry)));
    }

    /**
     * Resolves one transformation entry: recurse for nested maps, evaluate
     * XPath for strings, otherwise warn and produce an empty string.
     */
    @SuppressWarnings("unchecked")
    private Object parseDocumentWithXpath(final Document document, final Map.Entry<String, Object> entry) {
        final Object value = entry.getValue();
        if (value instanceof Map) {
            // Accept any Map (not just LinkedHashMap) as a nested transformation object.
            return iterateThroughTransformationMap(document, (Map<String, Object>) value);
        }
        if (value instanceof String) {
            return buildNodeValues(compileAndEvaluateXPath(document, (String) value));
        }
        log.warn("Failed to parse transformation document");
        return "";
    }

    /**
     * Concatenates the node values of a node list. This function is rather simple.
     * It does not serialize the node names etc.
     * <p>
     * TODO: Build it more sophisticated method that serializes node names into XML strings
     *
     * @param nodes a populated node list
     * @return a string representation of the nodes
     */
    private String buildNodeValues(final NodeList nodes) {
        final StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < nodes.getLength(); i++) {
            final String nodeValue = nodes.item(i).getNodeValue();
            // Element nodes have a null node value; skip them instead of
            // appending the literal string "null".
            if (nodeValue != null) {
                stringBuilder.append(nodeValue);
            }
        }
        return stringBuilder.toString();
    }

    /**
     * Converts the jsoup document to a W3C DOM and evaluates the given XPath
     * expression against it, returning the matching node set.
     *
     * @throws RuntimeException wrapping the {@link XPathExpressionException}
     *                          for an invalid expression (caught by {@link #parse})
     */
    private NodeList compileAndEvaluateXPath(final Document document, final String expression) {
        try {
            // NOTE: the previous DocumentBuilderFactory setup here was dead code —
            // the DOM is produced by DOMBuilder, not by a DocumentBuilder.
            final org.w3c.dom.Document doc = DOMBuilder.jsoup2DOM(document);
            final XPath xpath = XPathFactory.newInstance().newXPath();
            final XPathExpression expr = xpath.compile(expression);
            return (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Extracts the "main content" text from elements whose configured
     * attribute value matches the configured regex. Returns null when no
     * content extraction is configured.
     */
    private String getContent(final Document document) {
        if (contentRetrievalExpression == null) {
            return null;
        }
        final Elements matching =
                document.getElementsByAttributeValueMatching(attributeToLocateContent, contentRetrievalExpression);
        return matching.text();
    }

    /**
     * Returns the text of the document's title element, or an empty string
     * if extraction fails — a missing title should not halt the whole parse.
     */
    private String extractTitle(final Document document) {
        try {
            return document.select("title").text();
        } catch (Exception e) {
            log.error("Failed to parse", e);
        }
        // Lets not bother to halt it here.. just return empty title
        return "";
    }

    /** Collects the union of all links matching any configured regex. */
    private Set<String> retrieveAllUrls(final Document document) {
        final Set<String> urlSet = new HashSet<>();
        for (final String urlRegularExpression : urlRegularExpressions) {
            urlSet.addAll(getUrlsMatchingInDocument(document, urlRegularExpression));
        }
        return urlSet;
    }

    /**
     * Selects anchors whose href matches the regex and returns their absolute
     * hrefs. Sequential stream: the element list is small, so parallelism only
     * added overhead.
     */
    private Set<String> getUrlsMatchingInDocument(final Document document, final String urlRegularExpression) {
        final String matchQuery = format("a[href~=%s]", urlRegularExpression);
        final Elements urlElements = document.select(matchQuery);
        return urlElements.stream()
                .map(element -> element.attr("abs:href"))
                .collect(Collectors.toSet());
    }
}