//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmanipulators; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator; /** * Convert comment areas as asides. * * This manipulator looks for "COMMENT:" through to "COMMENT ENDS" and then wraps all the tags * between with aside. * * That approach might have issues if you have complex HTML between the tags... for example if you * had something like a merged rows in a table where the COMMENT spans multiple rows, but in * practise comments are usually single or multiple paragraph stretches. * * To avoid some of these we look for COMMENT / COMMENT END within paragraph siblings and we wrap * each sibling individually. A later cleaner could merge adjacent comment annotations. */ public class CommentArea implements ContentManipulator { private static final String COMMENT_START = "COMMENT:"; private static final String COMMENT_END = "COMMENT ENDS"; private static final String ASIDE = "<aside />"; @Override public void manipulate(Document document) { document.select("p:contains(" + COMMENT_END + ")").forEach(last -> { // We have the comment ends... but which sibling should we start from... // Cases are: 1. this element is also is the start block // 2. a previous sibling has the comment start // 3. Can't find in this group of siblings (so ignore) // Case 1: Single 'element comment' if (last.ownText().contains(COMMENT_START)) { last.wrap(ASIDE); } else { // Look for sibling before us.. int index = last.elementSiblingIndex(); Elements allSiblings = last.siblingElements(); Elements pSiblings = allSiblings.select("p"); Element startSibling = null; for (int i = index - 1; i >= 0; i--) { Element e = pSiblings.get(i); if (e.ownText().contains(COMMENT_START)) { startSibling = e; break; } } if (startSibling != null) { // NOTE: Difficult to know what to do here (wrap inner, wrap outter, create an // encompassing tag) // We'll wrap around the outer for the moment, which will generate multiple comments for // multiple paragraphs (but not break any HTML structure) for (int i = startSibling.elementSiblingIndex(); i < index; i++) { allSiblings.get(i).wrap(ASIDE); } last.wrap(ASIDE); } } }); } }