//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentmanipulators;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator;
import uk.gov.dstl.baleen.contentmanipulators.helpers.MarkupUtils;
/**
* Creates HTML nodes which capture the paragraph classification markings.
*
* If a paragraph has the form "(CLASSIFICATION) The rest of the paragraph." then this manipulator
* removes the "(CLASSIFICATION)" prefix and records the classification on the paragraph tag under
* data- attributes. This cleans up the text and allows a classification annotation to be added later.
*
* This is a basic example, and may not work in all cases. It could be more robust.
*
* NOTE this will only output classification tags if used in conjunction with the
* DataAttributeMapper.
*/
public class ParagraphMarkedClassification implements ContentManipulator {

  /** Name of the regex capture group holding the text between the leading parentheses. */
  private static final String CLASSIFICATION_GROUP = "classification";

  /**
   * Matches paragraph text that begins (after optional whitespace) with a parenthesised prefix.
   * The reluctant group captures everything up to the first closing parenthesis.
   */
  private static final Pattern PARAGRAPH_MARKING =
      Pattern.compile("^\\s*\\((?<" + CLASSIFICATION_GROUP + ">.*?)\\).*");

  /**
   * Runs the paragraph marking detection over every {@code p} element selected from the document.
   *
   * @param document the document whose paragraphs are inspected and updated in place
   */
  @Override
  public void manipulate(Document document) {
    document.select("p").forEach(this::processParagraph);
  }

  /**
   * Detects a leading "(CLASSIFICATION)" marking on a paragraph, records it on the element via
   * {@link MarkupUtils}, and strips the marking from the paragraph's text.
   *
   * <p>Only the direct child text nodes of the paragraph are searched when stripping, so a marking
   * wrapped in a child element (or split across several nodes) is recorded but left in the text.
   *
   * @param p the paragraph element to inspect
   */
  private void processParagraph(Element p) {
    Matcher matcher = PARAGRAPH_MARKING.matcher(p.text());
    if (!matcher.find()) {
      // No leading parenthesised marking, nothing to record or remove.
      return;
    }

    String classification = matcher.group(CLASSIFICATION_GROUP);

    MarkupUtils.additionallyAnnotateAsType(p, "uk.gov.dstl.baleen.types.metadata.ProtectiveMarking");

    // TODO: We override this for simplicity but we could select the best classification etc
    // (or output everything later and let a cleaner decide)
    MarkupUtils.setAttribute(p, "classification", classification.trim());

    // TODO: Ideally delete the classification text from the front of the paragraph.
    // That needs a util as we need to eat up the children of p until we've got to the end.
    // That's quite complex, you'd need to split down the text nodes across multiple children.
    // As an interim we remove the marking from the first text node which contains it.

    // Built from the untrimmed capture so that it matches the paragraph text exactly.
    String marking = "(" + classification + ")";

    // Stop at the first matching node and remove a single occurrence only, so that later
    // references to the same marking within the body of the paragraph are preserved.
    p.textNodes().stream()
        .filter(node -> node.text().contains(marking))
        .findFirst()
        .ifPresent(node -> node.text(removeFirst(node.text(), marking)));
  }

  /**
   * Removes the first literal occurrence of {@code marking} from {@code text}.
   *
   * @param text the text to search
   * @param marking the literal string to remove (not treated as a regular expression)
   * @return the text without the first occurrence, or the text unchanged if it is not present
   */
  private static String removeFirst(String text, String marking) {
    int start = text.indexOf(marking);
    if (start < 0) {
      return text;
    }
    return text.substring(0, start) + text.substring(start + marking.length());
  }
}