//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers.template; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import uk.gov.dstl.baleen.annotators.templates.TemplateAnnotator; import uk.gov.dstl.baleen.consumers.utils.SourceUtils; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.templates.TemplateField; import uk.gov.dstl.baleen.types.templates.TemplateRecord; import uk.gov.dstl.baleen.uima.BaleenConsumer; /** * Abstract Template Record Consumer that converts TemplateRecord annotations * and the TemplateField annotations covered by them, to a more convenient * record-centric form. * * <p> * Each entry in the file is an "object" with a <code>kind</code> field of * <code>NAMED</code> or <code>DEFAULT</code> and a <code>fields</code> field * consisting of a dictionary / map of name and value pairs from the * TemplateField annotations. In the case of <code>NAMED</code> records, there * will be an additional <code>name</code> field. * </p> * <p> * Subclasses of this consumer should be used with {@link TemplateAnnotator}. * </p> */ public abstract class AbstractTemplateRecordConsumer extends BaleenConsumer { @Override protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException { Multimap<String, ExtractedRecord> records = HashMultimap.create(); Collection<TemplateRecord> recordAnnotations = JCasUtil.select(jCas, TemplateRecord.class); HashSet<TemplateField> allFields = new HashSet<>(JCasUtil.select(jCas, TemplateField.class)); for (TemplateRecord recordAnnotation : recordAnnotations) { Collection<TemplateField> fieldAnnotations = JCasUtil.selectCovered(TemplateField.class, recordAnnotation); List<TemplateField> recordFieldAnnotations = fieldAnnotations.stream() .filter(f -> recordAnnotation.getSource().equals(f.getSource())).collect(Collectors.toList()); allFields.removeAll(recordFieldAnnotations); Collection<ExtractedField> fieldValues = makeFieldValues(recordFieldAnnotations); records.put(recordAnnotation.getSource(), new ExtractedRecord(recordAnnotation.getName(), fieldValues)); } Multimap<String, TemplateField> remainingFields = HashMultimap.create(); for (TemplateField templateField : allFields) { remainingFields.put(templateField.getSource(), templateField); } for (String source : remainingFields.keySet()) { records.put(source, new ExtractedRecord(makeFieldValues(remainingFields.get(source)))); } String documentSourceName = SourceUtils.getDocumentSourceBaseName(jCas, getSupport()); writeRecords(jCas, documentSourceName, records.asMap()); } /** * Makes the field name/value pairs from a collection of field annotations. * * @param fieldAnnotations * the field annotations * @return the field value name/value pairs */ private static Collection<ExtractedField> makeFieldValues(Collection<TemplateField> fieldAnnotations) { Collection<ExtractedField> fieldValues = new ArrayList<>(); for (TemplateField templateField : fieldAnnotations) { fieldValues.add(new ExtractedField(templateField.getName(), templateField.getValue())); } return fieldValues; } protected abstract void writeRecords(JCas jCas, String documentSourceName, Map<String, Collection<ExtractedRecord>> records) throws AnalysisEngineProcessException; @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(TemplateRecord.class, TemplateField.class), Collections.emptySet()); } }