//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.templates;

import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.templates.TemplateRecordDefinition;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Annotates record definitions found in documents using a regular expression.
 *
 * <p>
 * RecordDefinitions are regions surrounded by {@code <<record:NAME>>} and
 * {@code <<record:NAME>>} marker text, where NAME is a user defined record
 * type name and must be consistent in the begin and end marker text. Each
 * RecordDefinition should cover one or more TemplateFieldDefinition annotations
 * to be useful downstream.
 * </p>
 *
 * <p>
 * Markers for begin and end may be used to make it clearer in the document but
 * are not required, e.g. {@code <<record:NAME:begin>>} and
 * {@code <<record:NAME:end>>}, or {@code <<record:NAME begin>>} and
 * {@code <<record:NAME end>>}.
 * </p>
 *
 * <p>
 * A repeating record is indicated with the attribute <code>repeat</code> in the
 * begin record marker, e.g. {@code <<record:NAME repeat>>} or
 * {@code <<record:NAME repeat="true" >>}.
 * </p>
 *
 * <p>
 * This annotator should be used in conjunction with
 * {@link TemplateFieldDefinitionAnnotator}.
 * </p>
 */
public class TemplateRecordDefinitionAnnotator extends BaleenAnnotator {

  /**
   * Regular expression used to match records.
   *
   * <p>
   * Capturing groups: (1) the record name, (2) the optional remainder of the
   * begin marker (a leading ':' or whitespace followed by any text), (3) the
   * content between the two markers, (4) the optional remainder of the end
   * marker. The end marker must repeat the name captured in group 1.
   * </p>
   */
  private static final String RECORD_TOKEN_REGEX =
      "<<record:([A-Za-z0-9]+)([:\\s].+?)?(?=>>)>>(.*?)<<record:\\1([:\\s].+?)?(?=>>)>>";

  /** The name of the attribute that marks a record as repeating. */
  private static final String REPEAT_ATTRIBUTE = "repeat";

  /**
   * The compiled regular expression - compiled with the DOTALL option
   * (effectively '(?s)' in the regex) to enable matches over multiple lines.
   */
  private static final Pattern RECORD_TOKEN_PATTERN =
      Pattern.compile(RECORD_TOKEN_REGEX, Pattern.DOTALL);

  /**
   * Scans the document text for record markers and creates one
   * {@link TemplateRecordDefinition} annotation per match.
   *
   * @param jCas
   *            the JCas whose document text is scanned
   * @throws AnalysisEngineProcessException
   *             declared by the overridden method; not thrown directly here
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    String documentText = jCas.getDocumentText();
    if (documentText == null) {
      // Pattern.matcher(null) would throw a NullPointerException; with no
      // text there is nothing to annotate.
      return;
    }

    Matcher matcher = RECORD_TOKEN_PATTERN.matcher(documentText);
    while (matcher.find()) {
      createRecordDefinitionAnnotation(jCas, matcher);
    }
  }

  /**
   * Creates a record definition annotation and adds it to the JCas indexes.
   *
   * <p>
   * The annotation spans only the content between the begin and end markers
   * (group 3), not the markers themselves.
   * </p>
   *
   * @param jCas
   *            the JCas
   * @param matcher
   *            the matcher that triggered the creation, positioned on a match
   *            of {@link #RECORD_TOKEN_PATTERN}; group 1 is the record name,
   *            group 2 the optional remainder of the begin marker and group 3
   *            the content within the record
   */
  private void createRecordDefinitionAnnotation(JCas jCas, Matcher matcher) {
    String name = matcher.group(1);
    // Group 2 is optional and is null for a plain <<record:NAME>> marker;
    // normalise it so the literal text "null" never ends up in the tag.
    String beginMarkerRemainder = Strings.nullToEmpty(matcher.group(2));

    TemplateRecordDefinition recordDefinition = new TemplateRecordDefinition(jCas);
    recordDefinition.setName(name);
    recordDefinition.setBegin(matcher.start(3));
    recordDefinition.setEnd(matcher.end(3));
    recordDefinition.setConfidence(1.0);
    addAttributes(recordDefinition, "<record:" + name + beginMarkerRemainder + " />");
    addToJCasIndex(recordDefinition);
  }

  /**
   * Add the attributes to the given record definition.
   *
   * <p>
   * Uses Jsoup to parse the tag as if html. Only the <code>repeat</code>
   * attribute is read: when present with no value it is treated as
   * <code>true</code>, otherwise its value is parsed as a boolean. When the
   * attribute is absent the record definition is left untouched.
   * </p>
   *
   * @param recordDefinition
   *            the record definition
   * @param beginText
   *            the begin tag of the record definition, rewritten as a single
   *            html-like tag
   */
  private void addAttributes(TemplateRecordDefinition recordDefinition, String beginText) {
    Document doc = Jsoup.parseBodyFragment(beginText);
    Element recordElement = doc.body().child(0);
    Attributes attributes = recordElement.attributes();

    if (attributes.hasKey(REPEAT_ATTRIBUTE)) {
      String repeatValue = attributes.get(REPEAT_ATTRIBUTE);
      // A bare "repeat" (no value) means true; otherwise honour the value.
      recordDefinition.setRepeat(
          Strings.isNullOrEmpty(repeatValue) || Boolean.parseBoolean(repeatValue));
    }
  }

  /**
   * Describes this annotator's inputs and outputs for pipeline ordering: it
   * requires no annotation types and produces {@link TemplateRecordDefinition}.
   *
   * @return the action declaring an empty input set and
   *         {@link TemplateRecordDefinition} as the sole output
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        Collections.emptySet(), ImmutableSet.of(TemplateRecordDefinition.class));
  }
}