//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.templates;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.annotators.templates.TemplateRecordConfiguration.Kind;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.templates.TemplateField;
import uk.gov.dstl.baleen.types.templates.TemplateRecord;
import uk.gov.dstl.baleen.uima.utils.SelectorPath;
import uk.gov.dstl.baleen.uima.utils.StructureHierarchy;
import uk.gov.dstl.baleen.uima.utils.StructureUtil;

/**
 * Using previously created record definitions, creates annotations for records
 * and the fields contained within them.
 *
 * <p>
 * Each YAML configuration file contains multiple definitions in an array/list,
 * with each definition being an object with following fields:
 * </p>
 * <dl>
 * <dt>fields</dt>
 * <dd>A list of field definitions. Fields must have a <code>name</code> and
 * <code>path</code>, and can optionally have a regular expression
 * (<code>regex</code>), a <code>defaultValue</code> and declare if they are
 * <code>required</code>. A TemplateField annotation is created for each matched
 * path and restrictions.</dd>
 *
 * <dt>kind</dt>
 * <dd>Whether the field selectors above should be used to create a
 * <code>NAMED</code> record, in which case a name field will also be supplied,
 * or these are not part of an explicit record, and thus gathered into a
 * <code>DEFAULT</code> record, so they are still annotated as
 * TemplateFields.</dd>
 *
 * <dt>name</dt>
 * <dd>Only present on <code>NAMED</code> RecordDefinitions, and is populated
 * with the name of the record.</dd>
 *
 * <dt>order</dt>
 * <dd>Definitions from the same source are processed in ascending
 * <code>order</code>.</dd>
 * </dl>
 *
 * An example YAML configuration could be:
 *
 * <pre>
---
- name: "NamedRecord"
  kind: "NAMED"
  order: 1
  fields:
  - name: "Description"
    path: "Paragraph:nth-of-type(8)"
  - name: "FullName"
    path: "Table:nth-of-type(2) > TableBody > TableRow:nth-of-type(2) >\
      \ TableCell:nth-of-type(2) > Paragraph"
    required: "true"
  precedingPath: "Paragraph:nth-of-type(6)"
  followingPath: "Paragraph:nth-of-type(10)"
- name: "row"
  kind: "NAMED"
  order: 2
  fields:
  - name: "title"
    path: "Document > Table:nth-of-type(2) > TableBody > TableRow:nth-of-type(1) > TableCell:nth-of-type(1)"
    required: false
  - name: "FirstName"
    path: "Document > Table:nth-of-type(2) > TableBody > TableRow:nth-of-type(1) > TableCell:nth-of-type(2)"
    required: false
  - name: "Surname"
    path: "Document > Table:nth-of-type(2) > TableBody > TableRow:nth-of-type(1) > TableCell:nth-of-type(3)"
    required: false
  - name: "DoB"
    path: "Document > Table:nth-of-type(2) > TableBody > TableRow:nth-of-type(1) > TableCell:nth-of-type(4)"
    required: false
  precedingPath: "Document > Heading:nth-of-type(4)"
  followingPath: "Document > Paragraph:nth-of-type(4)"
  repeat: true
  coveredPaths:
  - "Document > Table:nth-of-type(2)"
  minimalRepeat: "Document > Table:nth-of-type(2) > TableBody > TableRow:nth-of-type(1)"
- kind: "DEFAULT"
  order: 3
  fields:
  - name: "DocumentTitle"
    path: "Heading:nth-of-type(2)"
  - name: "DocumentDate"
    path: "Paragraph:nth-of-type(3)"
    regex: "\d{1,2}\/\d{1,2}\/\d{4}"
 * </pre>
 * <p>
 * Configurations are typically created by running a pipeline with the
 * RecordDefinitionConfigurationCreatingConsumer, which uses annotations created
 * by RecordDefinitionAnnotation and TemplateFieldDefinitionAnnotator running
 * over template documents.
 * </p>
 */
public class TemplateAnnotator extends AbstractTemplateAnnotator {

  /**
   * A list of structural types which will be considered during record path
   * analysis.
   *
   * @baleen.config Paragraph,TableCell,ListItem,Aside, ...
   */
  public static final String PARAM_TYPE_NAMES = "types";

  /** The type names. */
  @ConfigurationParameter(name = PARAM_TYPE_NAMES, mandatory = false)
  private String[] typeNames;

  /** The structural classes resolved from {@link #typeNames} during initialisation. */
  private Set<Class<? extends Structure>> structuralClasses;

  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);
    structuralClasses = StructureUtil.getStructureClasses(typeNames);
  }

  /**
   * Processes every configured source in turn, handing each source's record
   * definitions to
   * {@link #doProcessRecordDefinitions(JCas, String, List)}.
   *
   * @param jCas
   *            JCas object to process
   */
  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    for (Entry<String, Collection<TemplateRecordConfiguration>> entry : recordDefinitions.asMap().entrySet()) {
      // Hand over a typed copy: the definitions are sorted in place downstream,
      // and the collection returned by asMap() is neither guaranteed to be a
      // List nor something we want to reorder on every document.
      List<TemplateRecordConfiguration> definitions = new ArrayList<>(entry.getValue());
      doProcessRecordDefinitions(jCas, entry.getKey(), definitions);
    }
  }

  /**
   * Process for the given source and record definitions. The passed JCas
   * object contains information about the document and any existing
   * annotations.
   *
   * <p>
   * The supplied list is sorted in place by
   * {@link TemplateRecordConfiguration#getOrder()} before processing.
   * </p>
   *
   * @param jCas
   *            JCas object to process
   * @param source
   *            the source of this record definition
   * @param recordDefinitions
   *            the recordDefinitions
   */
  protected void doProcessRecordDefinitions(final JCas jCas, String source,
      List<TemplateRecordConfiguration> recordDefinitions) {
    RecordStructureManager manager = new RecordStructureManager(StructureHierarchy.build(jCas, structuralClasses));

    Collections.sort(recordDefinitions, Comparator.comparing(TemplateRecordConfiguration::getOrder));

    for (TemplateRecordConfiguration recordDefinition : recordDefinitions) {
      doProcessRecordDefinition(jCas, manager, source, recordDefinition);
    }
  }

  /**
   * Process the given record definition. {@code NAMED} definitions produce a
   * record annotation plus its fields; any other kind only produces field
   * annotations, bounded by the end of the document text.
   *
   * @param jCas
   *            JCas object to process
   * @param manager
   *            the record structure manager
   * @param source
   *            the source of the record
   * @param recordDefinition
   *            the record definition
   */
  private void doProcessRecordDefinition(JCas jCas, RecordStructureManager manager, String source,
      TemplateRecordConfiguration recordDefinition) {
    if (recordDefinition.getKind() == Kind.NAMED) {
      createRecord(jCas, manager, source, recordDefinition);
    } else {
      createTemplateFields(jCas, manager, source, recordDefinition.getFields(),
          jCas.getDocumentText().length());
    }
  }

  /**
   * Creates the record based on the paths in the record definition.
   *
   * If errors occur during selection these are logged.
   *
   * @param jCas
   *            JCas object to process
   * @param manager
   *            the record structure manager
   * @param source
   *            the source
   * @param recordDefinition
   *            the record definition
   */
  private void createRecord(JCas jCas, RecordStructureManager manager, String source,
      TemplateRecordConfiguration recordDefinition) {
    Optional<Structure> preceding = Optional.empty();
    try {
      preceding = manager.select(recordDefinition.getPrecedingPath());
    } catch (InvalidParameterException e) {
      getMonitor().warn("Failed to select structure preceeding record " + recordDefinition.getName(), e);
    }

    if (recordDefinition.isRepeat()) {
      createRepeatingRecords(jCas, manager, source, recordDefinition, preceding);
    } else {
      createSingleRecord(jCas, manager, source, recordDefinition, preceding);
    }
  }

  /**
   * Creates one record (and its fields) per repetition found by the manager,
   * stopping at the first repetition that cannot be found.
   *
   * @param jCas
   *            JCas object to process
   * @param manager
   *            the record structure manager
   * @param source
   *            the source
   * @param recordDefinition
   *            the repeating record definition
   * @param preceding
   *            the structure preceding the first repetition, if any
   */
  private void createRepeatingRecords(JCas jCas, RecordStructureManager manager, String source,
      TemplateRecordConfiguration recordDefinition, Optional<Structure> preceding) {
    RepeatSearch repeatSearch;
    try {
      repeatSearch = manager.createRepeatSearch(recordDefinition);
    } catch (InvalidParameterException e) {
      getMonitor().warn("Error while generating repeating unit for record " + recordDefinition.getName(), e);
      return;
    }

    // Each found repetition becomes the "preceding" structure of the next one.
    Optional<Structure> previous = preceding;
    Optional<Structure> last = manager.repeatRecord(previous, repeatSearch, true);
    while (last.isPresent()) {
      int recordEnd = last.get().getEnd();
      createRecordAnnotation(jCas, source, recordDefinition.getName(), getPreceedingEnd(previous), recordEnd);
      createTemplateFields(jCas, manager, source, recordDefinition.getFields(), recordEnd);

      previous = last;
      last = manager.repeatRecord(previous, repeatSearch, false);
    }
  }

  /**
   * Creates a single, non-repeating record spanning from the end of the
   * preceding structure to the beginning of the following structure.
   *
   * @param jCas
   *            JCas object to process
   * @param manager
   *            the record structure manager
   * @param source
   *            the source
   * @param recordDefinition
   *            the record definition
   * @param preceding
   *            the structure preceding the record, if any
   */
  private void createSingleRecord(JCas jCas, RecordStructureManager manager, String source,
      TemplateRecordConfiguration recordDefinition, Optional<Structure> preceding) {
    Optional<Structure> following = Optional.empty();
    try {
      following = manager.select(recordDefinition.getFollowingPath());
    } catch (InvalidParameterException e) {
      getMonitor().warn("Failed to select structure following record " + recordDefinition.getName(), e);
    }

    int end = getFollowingBegining(jCas, following);
    createRecordAnnotation(jCas, source, recordDefinition.getName(), getPreceedingEnd(preceding), end);
    createTemplateFields(jCas, manager, source, recordDefinition.getFields(), end);
  }

  /**
   * Get the record begin offset based on the preceding structure.
   *
   * @param preceding
   *            optional of the preceding
   * @return the end of the preceding or the start of the document
   */
  private int getPreceedingEnd(Optional<Structure> preceding) {
    return preceding.map(Structure::getEnd).orElse(0);
  }

  /**
   * Get the record end offset based on the following structure.
   *
   * @param jCas
   *            the jcas
   * @param following
   *            optional of the following
   * @return the begin of the following or the end of the document
   */
  private int getFollowingBegining(JCas jCas, Optional<Structure> following) {
    // orElseGet keeps the document text lookup lazy, as in the explicit if/else form.
    return following.map(Structure::getBegin).orElseGet(() -> jCas.getDocumentText().length());
  }

  /**
   * Creates the template fields based on the field configurations.
   *
   * @param jCas
   *            JCas object to process
   * @param manager
   *            the record structure manager
   * @param source
   *            the source
   * @param fields
   *            the fields
   * @param end
   *            the end to stop at
   */
  private void createTemplateFields(JCas jCas, RecordStructureManager manager, String source,
      List<TemplateFieldConfiguration> fields, int end) {
    for (TemplateFieldConfiguration field : fields) {
      createTemplateField(jCas, manager, source, field, end);
    }
  }

  /**
   * Create the template field from the configuration. For repeating fields,
   * further matches are annotated until the manager finds no more before the
   * end limit.
   *
   * @param jCas
   *            the jCas
   * @param manager
   *            the record structure manager
   * @param source
   *            the source
   * @param field
   *            the field configuration
   * @param end
   *            the end to stop at
   */
  private void createTemplateField(JCas jCas, RecordStructureManager manager, String source,
      TemplateFieldConfiguration field, int end) {
    String fieldName = field.getName();
    String fieldPath = field.getPath();
    try {
      SelectorPath path = SelectorPath.parse(fieldPath);
      Optional<Structure> fieldStructure = manager.select(path);
      if (fieldStructure.isPresent()) {
        createFieldAnnotation(jCas, source, field, fieldStructure.get());
      } else {
        manager.recordMissing(path);
        getMonitor().warn("Expected single structure element for field {} with path {} - ignoring", fieldName,
            path);
      }

      if (field.isRepeat()) {
        // Repeats are limited to whichever ends first: the caller's limit or
        // the parent structure of the field.
        SelectorPath parentPath = path.toDepth(path.getDepth() - 1);
        Optional<Structure> parent = manager.select(parentPath);
        int fieldEnd = Math.min(end, parent.map(Structure::getEnd).orElse(Integer.MAX_VALUE));

        while (fieldStructure.isPresent()) {
          fieldStructure = manager.repeatField(fieldStructure, path, fieldEnd);
          if (fieldStructure.isPresent()) {
            createFieldAnnotation(jCas, source, field, fieldStructure.get());
          }
        }
      }
    } catch (InvalidParameterException e) {
      getMonitor().warn("Failed to match structure for field " + fieldName, e);
    }
  }

  /**
   * Create field annotation for the given field definition and matched
   * structural element.
   *
   * <p>
   * Empty structures are annotated with the default value, unless the field
   * is required and has no default. When a regex is configured only the first
   * match is annotated; if nothing matches, the default value (when present)
   * is annotated as a zero-length span at the structure's begin. An invalid
   * regex is logged and the field is skipped.
   * </p>
   *
   * @param jCas
   *            JCas object to process
   * @param source
   *            the source template definition file name
   * @param field
   *            the field
   * @param structure
   *            the structure
   */
  private void createFieldAnnotation(JCas jCas, String source, TemplateFieldConfiguration field,
      Structure structure) {
    String defaultValue = field.getDefaultValue();
    String coveredText = structure.getCoveredText();

    if (coveredText.isEmpty()) {
      if (field.isRequired() && defaultValue == null) {
        getMonitor().info("Required field missing {} in {}", field.getName(), source);
      } else {
        createFieldAnnotation(jCas, source, field.getName(), structure.getBegin(), structure.getEnd(),
            defaultValue);
      }
      return;
    }

    String regex = field.getRegex();
    if (regex == null) {
      createFieldAnnotation(jCas, source, field.getName(), structure.getBegin(), structure.getEnd(),
          coveredText);
      return;
    }

    Pattern pattern;
    try {
      pattern = Pattern.compile(regex);
    } catch (PatternSyntaxException e) {
      // The regex comes from configuration; a bad one should not abort
      // processing of the whole document.
      getMonitor().warn("Invalid regular expression for field " + field.getName(), e);
      return;
    }

    Matcher matcher = pattern.matcher(coveredText);
    if (matcher.find()) {
      createFieldAnnotation(jCas, source, field.getName(), structure.getBegin() + matcher.start(),
          structure.getBegin() + matcher.end(), matcher.group());
    } else if (defaultValue != null) {
      getMonitor().info("Failed to match pattern {} in {} - using default value {}", regex, coveredText,
          defaultValue);
      createFieldAnnotation(jCas, source, field.getName(), structure.getBegin(), structure.getBegin(),
          defaultValue);
    } else {
      getMonitor().warn("Failed to match pattern {} in {} - ignoring", regex, coveredText);
    }
  }

  /**
   * Creates the field annotation and adds it to the JCas index.
   *
   * @param jCas
   *            the JCas
   * @param source
   *            the source
   * @param name
   *            the name
   * @param begin
   *            the begin
   * @param end
   *            the end
   * @param value
   *            the value
   * @return the created field
   */
  protected TemplateField createFieldAnnotation(JCas jCas, String source, String name, int begin, int end,
      String value) {
    TemplateField field = new TemplateField(jCas);
    field.setBegin(begin);
    field.setEnd(end);
    field.setName(name);
    field.setSource(source);
    field.setValue(value);
    addToJCasIndex(field);
    return field;
  }

  /**
   * Creates the record annotation and adds it to the JCas index.
   *
   * @param jCas
   *            the JCas
   * @param source
   *            the source
   * @param name
   *            the name
   * @param begin
   *            the begin
   * @param end
   *            the end
   * @return the created Record
   */
  protected TemplateRecord createRecordAnnotation(JCas jCas, String source, String name, int begin, int end) {
    TemplateRecord record = new TemplateRecord(jCas);
    record.setBegin(begin);
    record.setSource(source);
    record.setEnd(end);
    record.setName(name);
    addToJCasIndex(record);
    return record;
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(new HashSet<>(structuralClasses),
        ImmutableSet.of(TemplateField.class, TemplateRecord.class));
  }
}