//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.templates; import java.util.Collections; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.uima.jcas.JCas; import org.jsoup.Jsoup; import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.google.common.base.Strings; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractRegexAnnotator; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.templates.TemplateFieldDefinition; /** * Annotates template fields found in documents using a regular expression. * <p> * Template fields are text surrounded by ASCII double angle brackets, eg * <<field:fieldname>> for the field "fieldname". * </p> */ public class TemplateFieldDefinitionAnnotator extends AbstractRegexAnnotator<TemplateFieldDefinition> { /** The Constant TEMPLATE_TOKEN_REGEX. */ private static final String TEMPLATE_TOKEN_REGEX = "<<field:([A-Za-z0-9]+)(\\s.+?)?(?=>>)>>"; /** The Constant TEMPLATE_TOKEN_PATTERN. */ private static final Pattern TEMPLATE_TOKEN_PATTERN = Pattern.compile(TEMPLATE_TOKEN_REGEX); /** The Constant DEFAULT_VALUE_ATTRIBUTE */ private static final String DEFAULT_VALUE_ATTRIBUTE = "defaultValue"; /** The Constant REGEX_ATTRIBUTE */ private static final String REGEX_ATTRIBUTE = "regex"; /** The Constant REPEAT_ATTRIBUTE */ private static final String REPEAT_ATTRIBUTE = "repeat"; /** The Constant REQUIRED_ATTRIBUTE */ private static final String REQUIRED_ATTRIBUTE = "required"; /** * Instantiates a new template field definition annotator which will * assigning confidence 1.0 to all matched field definitions. */ public TemplateFieldDefinitionAnnotator() { super(TEMPLATE_TOKEN_PATTERN, 1.0); } @Override protected TemplateFieldDefinition create(JCas jCas, Matcher matcher) { TemplateFieldDefinition field = new TemplateFieldDefinition(jCas); field.setName(matcher.group(1)); if (matcher.group(2) != null) { addFieldAttributes(field, matcher.group()); } return field; } /** * Add the attributes of the field definition * * @param field * the filed definition object under construction * @param coveredText * the covered text defining the field definition */ private void addFieldAttributes(TemplateFieldDefinition field, String coveredText) { Document doc = Jsoup.parseBodyFragment(coveredText.substring(1, coveredText.length() - 2) + " />"); Element fieldElement = doc.body().child(0); Attributes attributes = fieldElement.attributes(); if (attributes.hasKey(DEFAULT_VALUE_ATTRIBUTE)) { field.setDefaultValue(attributes.get(DEFAULT_VALUE_ATTRIBUTE)); } if (attributes.hasKey(REGEX_ATTRIBUTE)) { String regex = attributes.get(REGEX_ATTRIBUTE); checkRegexCompiles(regex); field.setRegex(regex); } if (attributes.hasKey(REPEAT_ATTRIBUTE)) { String required = attributes.get(REPEAT_ATTRIBUTE); field.setRepeat(Strings.isNullOrEmpty(required) ? true : Boolean.valueOf(required)); } if (attributes.hasKey(REQUIRED_ATTRIBUTE)) { String required = attributes.get(REQUIRED_ATTRIBUTE); field.setRequired(Strings.isNullOrEmpty(required) ? true : Boolean.valueOf(required)); } } /** * Check that the regex compiles * * @param regex * @throws PatternSyntaxException * If the expression's syntax is invalid */ private void checkRegexCompiles(String regex) { Pattern.compile(regex); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(TemplateFieldDefinition.class)); } }