//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentextractors.helpers; import java.util.List; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import com.google.common.base.Strings; import uk.gov.dstl.baleen.contentmappers.helpers.AnnotationCollector; import uk.gov.dstl.baleen.contentmappers.helpers.ContentMapper; import uk.gov.dstl.baleen.contentmappers.helpers.JCasBuilder; /** * Converts the HTML Document to a JCas document. * * This involves extraction of text from the HTML and then creation of annotations. * * The creation of annotations is controlled by the {@link ContentMapper}s. */ public class DocumentToJCasConverter { private final List<ContentMapper> mappers; /** * Constructor */ public DocumentToJCasConverter(final List<ContentMapper> mappers) { this.mappers = mappers; } /** * Convert the document into the jCas. * * @param document the document * @param jCas the j cas */ public void apply(final Document document, final JCas jCas) { final JCasBuilder builder = new JCasBuilder(jCas); // First walk the head, but don't save the text walk(builder, document.head(), 1, false); // Then walk the body and do save the text walk(builder, document.body(), 1, true); builder.build(); } /** * Walk the HTML document node by node, creating annotations and text. * * @param builder the builder * @param root the root * @param depth the depth */ private void walk(final JCasBuilder builder, final Node root, final int depth, final boolean captureText) { if (root == null) { return; } final int begin = builder.getCurrentOffset(); if (captureText) { // Generate the text and the annotations final String text = mapToText(root); if (!Strings.isNullOrEmpty(text)) { builder.addText(text); } } List<Annotation> annotations = null; if (root instanceof Element) { annotations = mapElementToAnnotations(builder.getJCas(), (Element) root); } // BUG: With multiple mappers depth here is wrong! It puts all mappers at the same depth... // (though in fairness they are all the same begin-end and same element too) // Walk the children for (final Node node : root.childNodes()) { walk(builder, node, depth + 1, captureText); } // Add annotations to the JCas final int end = builder.getCurrentOffset(); if (annotations != null && !annotations.isEmpty()) { builder.addAnnotations(annotations, begin, end, depth); } } /** * Map a node to text. * * @param node the node * @return the string */ private String mapToText(final Node node) { if (node instanceof TextNode) { final TextNode t = (TextNode) node; return t.getWholeText(); } else { return null; } } /** * Map a HTML element to annotations. * * @param jCas the j cas * @param element the element * @return the list */ private List<Annotation> mapElementToAnnotations(final JCas jCas, final Element element) { final AnnotationCollector collector = new AnnotationCollector(); for (final ContentMapper mapper : mappers) { mapper.map(jCas, element, collector); } return collector.getAnnotations(); } }