//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.templates;
import java.util.Collection;
import java.util.Collections;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.templates.TemplateField;
import uk.gov.dstl.baleen.types.templates.TemplateRecord;
/**
* Using previously created record and template definitions and annotations,
* remove records that are not valid.
*
* <p>
* Each YAML configuration file for records can contain multiple definitions for
* records. Template fields can be considered required to make a record valid.
* </p>
*
* <p>
* This annotator (or cleaner) removes records which do not contain all required
* fields.
* </p>
*
* <p>
* This can be configured to only remove invalid records of a specified source
* and/or from a list of specified records.
* </p>
*
*
* Example configuration:
*
* <pre>
...
annotators:
- class templates.TemplateAnnotator:
...
- class templates.TemplateValidator:
source: athleteReportDefinitions
records:
- athleteDetails
- athletePerformance
*
* </pre>
*
*/
public class TemplateValidator extends AbstractTemplateAnnotator {
/**
* A specific source file that the records should be from.
*
* If not specified all sources are used
*
* @baleen.config myRecords.yaml
*/
public static final String PARAM_SOURCE = "source";
/**
* A specific list of records to be validated.
*
* If not specified all records are validated
*
* @baleen.config myRecords.yaml
*/
public static final String PARAM_RECORDS = "records";
/** The source names. */
@ConfigurationParameter(name = PARAM_SOURCE, mandatory = false)
private String source;
/** The records names. */
@ConfigurationParameter(name = PARAM_RECORDS, mandatory = false)
private String[] records;
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
Collection<TemplateRecord> recordAnnotations = JCasUtil.select(jCas, TemplateRecord.class);
for (Entry<String, TemplateRecordConfiguration> entry : recordDefinitions.entries()) {
if (shouldProcessDefinition(entry.getKey(), entry.getValue())) {
doProcessRecordDefinition(entry.getKey(), entry.getValue(), recordAnnotations);
}
}
}
/**
* Check if this definition should be processed based on configuration
*
* @param source
* the source
* @param recordDefinition
* the record definition
* @return
*/
private boolean shouldProcessDefinition(String recordSource, TemplateRecordConfiguration recordDefinition) {
if (StringUtils.isNotBlank(source) && !source.equals(recordSource)) {
return false;
}
if (ArrayUtils.isNotEmpty(records) && !ArrayUtils.contains(records, recordDefinition.getName())) {
return false;
}
return true;
}
/**
*
* Removes invalid records.
*
* @param source
* the source
* @param recordDefinition
* the RecordDefinitionConfiguration
* @param recordAnnotations
* the record annotations
*/
protected void doProcessRecordDefinition(String source, TemplateRecordConfiguration recordDefinition,
Collection<TemplateRecord> recordAnnotations) {
getRecordsForRecordDefinition(recordAnnotations, source, recordDefinition.getName()).forEach(r -> {
Collection<TemplateField> fieldAnnotations = getTemplateFieldsForRecord(source, r);
Set<String> fieldsPresent = getNamesOfFieldsPresent(fieldAnnotations);
Optional<String> missingRequired = streamNamesOfRequiredFields(recordDefinition)
.filter(required -> !fieldsPresent.contains(required)).findFirst();
if (missingRequired.isPresent()) {
getMonitor().info("Removing invalid record {} - {} from as missing require field {}", source,
recordDefinition.getName(), missingRequired.get());
removeFromJCasIndex(r);
removeFromJCasIndex(fieldAnnotations);
}
});
}
/**
* Stream the names of the required fields for the given record definition.
*
* @param recordDefinition
* the record definition
* @return stream of the names of the require fields
*/
private Stream<String> streamNamesOfRequiredFields(TemplateRecordConfiguration recordDefinition) {
return recordDefinition.getFields().stream().filter(TemplateFieldConfiguration::isRequired)
.map(TemplateFieldConfiguration::getName);
}
/**
* Get the names of the fields given
*
* @param fieldAnnotations
* the field annotations
* @return the names of the given fields
*/
private Set<String> getNamesOfFieldsPresent(Collection<TemplateField> fieldAnnotations) {
return fieldAnnotations.stream().map(TemplateField::getName).collect(Collectors.toSet());
}
/**
* Get the template fields for the given source and record.
*
* @param source
* the source
* @param record
* the record
* @return
*/
private Collection<TemplateField> getTemplateFieldsForRecord(String source, TemplateRecord record) {
return JCasUtil.selectCovered(TemplateField.class, record).stream().filter(t -> source.equals(t.getSource()))
.collect(Collectors.toList());
}
/**
* Get the records for the given source and record definition name
*
* @param records
* all the records
* @param source
* the source
* @param name
* the name of the record
* @return a stream of the records
*/
private Stream<TemplateRecord> getRecordsForRecordDefinition(Collection<TemplateRecord> records, String source, String name) {
return records.stream().filter(r -> source.equals(r.getSource()) && name.equals(r.getName()));
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(TemplateField.class, TemplateRecord.class), Collections.emptySet());
}
}