ParagraphMarkedClassification.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentmanipulators;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator;
import uk.gov.dstl.baleen.contentmanipulators.helpers.MarkupUtils;

/**
 * Creates HTML nodes which capture the paragraph classification markings.
 * 
 * If this manipulator sees a paragraph of the form "(CLASSIFICATION) The rest of the paragraph.",
 * it removes the "(CLASSIFICATION)" prefix and records the classification on the paragraph tag
 * under data- attributes. This cleans up the text and allows a classification annotation to be
 * added later.
 * 
 * This is a basic example, and may not work in all cases. It could be more robust.
 * 
 * NOTE this will only output classification tags if used in conjunction with the
 * DataAttributeMapper.
 */
public class ParagraphMarkedClassification implements ContentManipulator {

	/** Name of the regex group capturing the text between the leading parentheses. */
	private static final String CLASSIFICATION_GROUP = "classification";

	/**
	 * Matches optional leading whitespace followed by a parenthesised marking at the very start of
	 * the paragraph text. The group is reluctant, so it ends at the first closing parenthesis.
	 */
	private static final Pattern PARAGRAPH_MARKING =
			Pattern.compile("^\\s*\\((?<" + CLASSIFICATION_GROUP + ">.*?)\\)");

	/** Type name passed to {@link MarkupUtils#additionallyAnnotateAsType} for marked paragraphs. */
	private static final String PROTECTIVE_MARKING_TYPE =
			"uk.gov.dstl.baleen.types.metadata.ProtectiveMarking";

	/** Attribute key passed to {@link MarkupUtils#setAttribute} to hold the marking text. */
	private static final String CLASSIFICATION_ATTRIBUTE = "classification";

	/**
	 * Processes every {@code p} element selected from the document.
	 *
	 * @param document the document whose paragraphs are inspected and possibly modified in place
	 */
	@Override
	public void manipulate(Document document) {
		document.select("p").forEach(this::processParagraph);
	}

	/**
	 * If the paragraph text starts with a parenthesised marking, annotates the paragraph, records
	 * the trimmed marking as an attribute, and removes the marking from the paragraph's text.
	 *
	 * Paragraphs without a leading marking, or whose marking is blank (for example "()"), are
	 * left untouched.
	 *
	 * @param p the paragraph element to inspect
	 */
	private void processParagraph(Element p) {
		Matcher matcher = PARAGRAPH_MARKING.matcher(p.text());
		if (!matcher.find()) {
			return;
		}

		String classification = matcher.group(CLASSIFICATION_GROUP);
		String trimmed = classification.trim();
		if (trimmed.isEmpty()) {
			// "()" or "( )" carries no classification; do not tag the paragraph with an empty value.
			return;
		}

		MarkupUtils.additionallyAnnotateAsType(p, PROTECTIVE_MARKING_TYPE);
		// TODO: We override this for simplicity but we could select the best classification etc
		// (or output everything later and let a cleaner decide)
		MarkupUtils.setAttribute(p, CLASSIFICATION_ATTRIBUTE, trimmed);

		// TODO: Ideally delete the classification text from the front.
		// That needs a util as we need to eat up the children of p until we've got to the end.
		// That's quite complex, you'd need to split down the text nodes across multiple children.
		// As an interim we remove only the first occurrence, in the first direct text node that
		// contains it, so identical text later in the paragraph is preserved.
		removeFirstMarking(p, "(" + classification + ')');
	}

	/**
	 * Removes the first occurrence of {@code marking} from the first direct text node of
	 * {@code p} that contains it. Does nothing if no direct text node contains the marking
	 * (for example when the marking sits inside a child element).
	 *
	 * @param p the paragraph whose direct text nodes are searched
	 * @param marking the literal text to remove, including its parentheses
	 */
	private static void removeFirstMarking(Element p, String marking) {
		for (org.jsoup.nodes.TextNode node : p.textNodes()) {
			String nodeText = node.text();
			int start = nodeText.indexOf(marking);
			if (start >= 0) {
				node.text(nodeText.substring(0, start) + nodeText.substring(start + marking.length()));
				return;
			}
		}
	}
}