package io.monokkel.core;
import io.monokkel.core.api.ResponseParser;
import io.monokkel.domain.PageData;
import io.monokkel.exceptions.ParseException;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.DOMBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.*;
import java.util.*;
import java.util.stream.Collectors;
import static java.lang.String.format;
/**
 * Parses HTML responses: extracts the page title, all URLs matching the configured
 * regular expressions, an optional content section located via an attribute/regex pair,
 * and an arbitrary nested map of XPath-driven transformations.
 *
 * <p>Thread-safety: instances hold only effectively-immutable configuration, but the
 * supplied {@code transformationField} map is not defensively copied — do not mutate it
 * after construction.
 */
public class HtmlTransformer implements ResponseParser {

    private static final Logger log = LoggerFactory.getLogger(HtmlTransformer.class);

    /** Regular expressions selecting which anchor hrefs to harvest from a page. */
    private final List<String> urlRegularExpressions;
    /** Regex matched against the value of {@link #attributeToLocateContent}; null disables content extraction. */
    private final String contentRetrievalExpression;
    /** Attribute name used to locate the content element (e.g. "class" or "id"); null disables content extraction. */
    private final String attributeToLocateContent;
    /** Nested key -&gt; XPath-expression (or sub-map) configuration driving {@link #iterateThroughTransformationMap}. */
    private final HashMap<String, Object> transformationField;

    /**
     * Full configuration constructor.
     *
     * @param urlRegularExpressions     regexes selecting which hrefs to collect
     * @param attributeToLocateContent  attribute name used to find the content element
     * @param contentRetrievalExpression regex matched against that attribute's value
     * @param transformationField       nested map of field name to XPath expression (or sub-map)
     */
    public HtmlTransformer(final List<String> urlRegularExpressions,
                           final String attributeToLocateContent,
                           final String contentRetrievalExpression,
                           final HashMap<String, Object> transformationField) {
        this.urlRegularExpressions = urlRegularExpressions;
        this.attributeToLocateContent = attributeToLocateContent;
        this.contentRetrievalExpression = contentRetrievalExpression;
        this.transformationField = transformationField;
    }

    /**
     * Convenience constructor without content extraction; delegates to the full
     * constructor so field assignment lives in exactly one place.
     */
    public HtmlTransformer(final List<String> urlRegularExpressions,
                           final HashMap<String, Object> transformationField) {
        this(urlRegularExpressions, null, null, transformationField);
    }

    /**
     * Parse the response and extract the title, matching URLs, optional content and
     * the configured XPath transformations.
     *
     * @param url       the URL the response was fetched from (used as jsoup base URI)
     * @param response  the raw HTML response body
     * @param timeStamp the timestamp when the page was visited
     * @return the parsed page data
     * @throws ParseException if any part of the extraction fails; the cause is preserved
     */
    @Override
    public PageData parse(final String url, final String response, final Long timeStamp) throws ParseException {
        try {
            final Document document = Jsoup.parse(response, url);
            final Set<String> urlSet = retrieveAllUrls(document);
            log.debug("Found {} urls in the document {}", urlSet.size(), url);
            final String content = getContent(document);
            final String title = extractTitle(document);
            final Map<String, Object> transformed = iterateThroughTransformationMap(document, transformationField);
            return new PageData(urlSet, url, timeStamp, response, title, content, transformed);
        } catch (Exception e) {
            log.warn("Failed to parse url {}", url, e);
            throw new ParseException(format("Failed to parse url %s", url), e);
        }
    }

    /**
     * A response is parseable when any Content-Type header value mentions HTML.
     */
    @Override
    public Boolean shouldParse(final String url, final String response, final List<String> typesFromTheResponseHeader) {
        return typesFromTheResponseHeader.stream().anyMatch(typeHeader -> typeHeader.contains("text/html"));
    }

    /**
     * Evaluates every entry of the (possibly nested) transformation configuration
     * against the document, producing a map of field name to extracted value.
     */
    private Map<String, Object> iterateThroughTransformationMap(final Document document,
                                                                final Map<String, Object> fields) {
        return fields.entrySet().stream()
                .collect(Collectors.toMap(Map.Entry::getKey, entry -> parseDocumentWithXpath(document, entry)));
    }

    /**
     * Resolves a single transformation entry: a nested map recurses, a String is
     * treated as an XPath expression, anything else yields an empty string.
     */
    @SuppressWarnings("unchecked")
    private Object parseDocumentWithXpath(final Document document, final Map.Entry<String, Object> entry) {
        final Object value = entry.getValue();
        if (value instanceof LinkedHashMap) {
            // Nested configuration: evaluate the sub-map recursively.
            return iterateThroughTransformationMap(document, (LinkedHashMap<String, Object>) value);
        } else if (value instanceof String) {
            final NodeList nodes = compileAndEvaluateXPath(document, (String) value);
            return buildNodeValues(nodes).toString();
        } else {
            log.warn("Failed to parse transformation document");
            return "";
        }
    }

    /**
     * This function is rather simple. It does not serialize the node names etc
     *
     * TODO: Build it more sophisticated method that serializes node names into XML strings
     *
     * @param nodes a populated node list
     * @return a string representation of the nodes
     */
    private StringBuilder buildNodeValues(final NodeList nodes) {
        final StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < nodes.getLength(); i++) {
            final String nodeValue = nodes.item(i).getNodeValue();
            // getNodeValue() is null for element nodes; skip those instead of
            // letting StringBuilder.append(null) write the literal "null".
            if (nodeValue != null) {
                stringBuilder.append(nodeValue);
            }
        }
        return stringBuilder;
    }

    /**
     * Compiles the given XPath expression and evaluates it against a W3C DOM
     * conversion of the jsoup document.
     *
     * @param document   the jsoup document to query
     * @param expression the XPath expression to compile
     * @return the matching node set
     * @throws RuntimeException if the expression fails to compile or evaluate
     */
    private NodeList compileAndEvaluateXPath(final Document document, final String expression) {
        try {
            final org.w3c.dom.Document doc = DOMBuilder.jsoup2DOM(document);
            final XPath xpath = XPathFactory.newInstance().newXPath();
            final XPathExpression expr = xpath.compile(expression);
            return (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            // Include the expression so a misconfigured transformation is easy to spot.
            throw new RuntimeException(format("Failed to evaluate XPath expression %s", expression), e);
        }
    }

    /**
     * Extracts the configured content section, or null when content extraction
     * is not configured (the two-argument constructor was used).
     */
    private String getContent(final Document document) {
        // Both halves of the attribute/regex pair are needed; jsoup rejects a
        // null/empty attribute key, so guard both fields.
        if (contentRetrievalExpression == null || attributeToLocateContent == null) {
            return null;
        }
        final Elements matching =
                document.getElementsByAttributeValueMatching(attributeToLocateContent, contentRetrievalExpression);
        return matching.text();
    }

    /**
     * Returns the document's title text, or an empty string on failure —
     * a missing title is not worth aborting the whole parse.
     */
    private String extractTitle(final Document document) {
        try {
            return document.select("title").text();
        } catch (Exception e) {
            log.error("Failed to parse", e);
        }
        // Lets not bother to halt it here.. just return empty title
        return "";
    }

    /**
     * Collects every absolute URL in the document whose href matches any of the
     * configured regular expressions.
     */
    private Set<String> retrieveAllUrls(final Document document) {
        final Set<String> urlSet = new HashSet<>();
        for (final String urlRegularExpression : urlRegularExpressions) {
            urlSet.addAll(getUrlsMatchingInDocument(document, urlRegularExpression));
        }
        return urlSet;
    }

    /**
     * Selects anchors whose href matches the regex and resolves each to an
     * absolute URL via jsoup's {@code abs:href}.
     */
    private Set<String> getUrlsMatchingInDocument(final Document document, final String urlRegularExpression) {
        final String matchQuery = format("a[href~=%s]", urlRegularExpression);
        final Elements urlElements = document.select(matchQuery);
        return urlElements.stream().map(e -> e.attr("abs:href")).collect(Collectors.toSet());
    }
}