//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers.template;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.FilenameUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
import uk.gov.dstl.baleen.annotators.templates.TemplateAnnotator;
import uk.gov.dstl.baleen.annotators.templates.TemplateFieldConfiguration;
import uk.gov.dstl.baleen.annotators.templates.TemplateFieldDefinitionAnnotator;
import uk.gov.dstl.baleen.annotators.templates.TemplateRecordConfiguration;
import uk.gov.dstl.baleen.annotators.templates.TemplateRecordDefinitionAnnotator;
import uk.gov.dstl.baleen.consumers.utils.SourceUtils;
import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.templates.TemplateFieldDefinition;
import uk.gov.dstl.baleen.types.templates.TemplateRecordDefinition;
import uk.gov.dstl.baleen.uima.BaleenConsumer;
import uk.gov.dstl.baleen.uima.utils.CoveringStructureHierarchy;
import uk.gov.dstl.baleen.uima.utils.StructureUtil;
import uk.gov.dstl.baleen.uima.utils.select.ItemHierarchy;
/**
* Writes RecordDefinitions, and the TemplateFieldDefinitions that they cover,
* to YAML files for subsequent use in {@link TemplateAnnotator}.
* <p>
* See {@link TemplateAnnotator} for a description of the format.
* </p>
*
* <p>
* This consumer should be used with {@link TemplateRecordDefinitionAnnotator}
* and {@link TemplateFieldDefinitionAnnotator}.
* </p>
*/
public class TemplateRecordConfigurationCreatingConsumer extends BaleenConsumer {

  /**
   * A list of structural types which will be considered during record path
   * analysis.
   *
   * @baleen.config Paragraph,TableCell,ListItem,Aside, ...
   */
  public static final String PARAM_TYPE_NAMES = "types";

  /** The configured structural type names (optional). */
  @ConfigurationParameter(name = PARAM_TYPE_NAMES, mandatory = false)
  private String[] typeNames;

  /** The structural classes resolved from {@link #typeNames} during initialisation. */
  private Set<Class<? extends Structure>> structuralClasses;

  /**
   * The directory the YAML configuration files are written to.
   *
   * @baleen.config recordDefinitions
   */
  public static final String PARAM_OUTPUT_DIRECTORY = "outputDirectory";

  /** The output directory. */
  @ConfigurationParameter(name = PARAM_OUTPUT_DIRECTORY, defaultValue = "recordDefinitions")
  private String outputDirectory = "recordDefinitions";

  /** The YAML object mapper used to serialise the configurations. */
  private final ObjectMapper objectMapper;

  /**
   * Instantiates a new record definition configuration creating consumer,
   * with a YAML-backed object mapper that omits null-valued properties.
   */
  public TemplateRecordConfigurationCreatingConsumer() {
    objectMapper = new ObjectMapper(new YAMLFactory());
    objectMapper.setSerializationInclusion(Include.NON_NULL);
  }

  @Override
  public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);
    structuralClasses = StructureUtil.getStructureClasses(typeNames);
  }

  /**
   * Builds a {@link TemplateRecordConfiguration} for every record definition
   * in the document (plus one field-only configuration for every field
   * definition not assigned to a processed record), sorts them by their order
   * and writes them as YAML to a file named after the document source.
   *
   * @param jCas
   *            the jCas to process
   * @throws AnalysisEngineProcessException
   *             if two record definitions share a name, or if the output
   *             file cannot be written
   */
  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    CoveringStructureHierarchy structureHierarchy = CoveringStructureHierarchy.build(jCas, structuralClasses);
    Collection<TemplateRecordDefinition> recordDefinitions = JCasUtil.select(jCas, TemplateRecordDefinition.class);

    // Starts as every field in the document; fields claimed by a record are removed below.
    Collection<TemplateFieldDefinition> unassignedFields = new HashSet<>(
        JCasUtil.select(jCas, TemplateFieldDefinition.class));

    Map<String, TemplateRecordConfiguration> recordConfigurations = new HashMap<>();
    for (TemplateRecordDefinition recordDefinition : recordDefinitions) {
      String name = recordDefinition.getName();
      if (recordConfigurations.containsKey(name)) {
        throw new AnalysisEngineProcessException(
            new IllegalArgumentException("Record name not unique: " + name));
      }

      Optional<Structure> startStructure = deepestStructureAt(jCas, recordDefinition.getBegin());
      Optional<Structure> endStructure = deepestStructureAt(jCas, recordDefinition.getEnd());
      if (!startStructure.isPresent() || !endStructure.isPresent()) {
        getMonitor().warn("Could not find start or end structure elements for record definition {} - giving up",
            name);
        // The record's fields stay in unassignedFields and are emitted as stand-alone fields.
        continue;
      }

      List<TemplateFieldDefinition> definitions = JCasUtil.selectCovered(TemplateFieldDefinition.class,
          recordDefinition);
      unassignedFields.removeAll(definitions);

      recordConfigurations.put(name, makeRecordConfiguration(jCas, structureHierarchy, recordDefinition,
          startStructure.get(), endStructure.get(), definitions));
    }

    List<TemplateRecordConfiguration> configurations = new ArrayList<>(recordConfigurations.values());
    for (TemplateFieldDefinition field : unassignedFields) {
      configurations.add(new TemplateRecordConfiguration(
          makeFields(structureHierarchy, ImmutableList.of(field)), field.getBegin()));
    }

    String documentSourceName = SourceUtils.getDocumentSourceBaseName(jCas, getSupport());
    try (Writer w = createOutputWriter(documentSourceName)) {
      Collections.sort(configurations, Comparator.comparing(TemplateRecordConfiguration::getOrder));
      objectMapper.writeValue(w, configurations);
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * Find the deepest structure annotation covering the given document offset.
   *
   * @param jCas
   *            the jCas
   * @param offset
   *            the document offset
   * @return the covering structure with the greatest depth, or empty if no
   *         structure covers the offset
   */
  private Optional<Structure> deepestStructureAt(JCas jCas, int offset) {
    return JCasUtil.selectCovering(jCas, Structure.class, offset, offset).stream()
        .max(Comparator.comparingInt(Structure::getDepth));
  }

  /**
   * Build the configuration for a single record definition.
   *
   * @param jCas
   *            the jCas
   * @param structureHierarchy
   *            the structure hierarchy
   * @param recordDefinition
   *            the record definition
   * @param startStructure
   *            the deepest structure covering the start of the record
   * @param endStructure
   *            the deepest structure covering the end of the record
   * @param definitions
   *            the field definitions covered by the record
   * @return the record configuration
   */
  private TemplateRecordConfiguration makeRecordConfiguration(JCas jCas,
      CoveringStructureHierarchy structureHierarchy, TemplateRecordDefinition recordDefinition,
      Structure startStructure, Structure endStructure, List<TemplateFieldDefinition> definitions) {
    String name = recordDefinition.getName();
    List<TemplateFieldConfiguration> fields = makeFields(structureHierarchy, definitions);

    Optional<Structure> preceding = structureHierarchy.getPrevious(startStructure);
    Optional<Structure> following = structureHierarchy.getNext(endStructure);

    // A missing neighbour is represented by an empty path.
    String precedingPath = preceding.map(s -> structureHierarchy.getSelectorPath(s).toString()).orElse("");
    String followingPath = following.map(s -> structureHierarchy.getSelectorPath(s).toString()).orElse("");

    if (!recordDefinition.getRepeat()) {
      return new TemplateRecordConfiguration(name, precedingPath, followingPath, fields,
          recordDefinition.getBegin());
    }

    // Covered structures (and the repeating unit) are only needed for repeating records.
    List<Structure> coveredStructures = JCasUtil.selectBetween(Structure.class, getPreceding(jCas, preceding),
        getFollowing(jCas, following));
    int depth = Math.max(preceding.map(Structure::getDepth).orElse(0),
        following.map(Structure::getDepth).orElse(0));
    List<String> coveredPaths = generateCoveredPaths(structureHierarchy, coveredStructures, depth);

    // The minimal repeat is only set when the record's covering structure lies
    // between the preceding and following structures; otherwise it is left null.
    String minimalRepeat = structureHierarchy.getCoveringStructure(recordDefinition)
        .filter(coveredStructures::contains)
        .map(s -> structureHierarchy.getSelectorPath(s).toString())
        .orElse(null);

    return new TemplateRecordConfiguration(name, precedingPath, coveredPaths, minimalRepeat, followingPath,
        fields, recordDefinition.getBegin());
  }

  /**
   * Get the preceding annotation from the optional or return a substitute
   * annotation at the start of the document.
   *
   * @param jCas
   *            the jCas
   * @param structure
   *            the optional structure
   * @return the preceding annotation
   */
  private Structure getPreceding(JCas jCas, Optional<Structure> structure) {
    return structure.orElseGet(() -> new Structure(jCas, 0, 0));
  }

  /**
   * Get the following annotation from the optional or return a substitute
   * annotation at the end of the document.
   *
   * @param jCas
   *            the jCas
   * @param structure
   *            the optional structure
   * @return the following annotation
   */
  private Structure getFollowing(JCas jCas, Optional<Structure> structure) {
    return structure.orElseGet(() -> {
      int length = jCas.getDocumentText().length();
      return new Structure(jCas, length, length);
    });
  }

  /**
   * Generate the covered paths, reducing to the lowest depth.
   * <p>
   * Each structure's selector path is truncated to the given depth and
   * duplicates are dropped (keeping first-seen order). A path is then skipped
   * when it starts with the most recently emitted path, so only the leading
   * path of each such run is kept.
   * </p>
   *
   * @param structureHierarchy
   *            the structure hierarchy
   * @param coveredStructures
   *            the covered structures
   * @param depth
   *            the maximum depth
   * @return list of paths for the covered structures; empty if there are no
   *         covered structures
   */
  private List<String> generateCoveredPaths(ItemHierarchy<Structure> structureHierarchy,
      List<Structure> coveredStructures, int depth) {
    LinkedHashSet<String> paths = coveredStructures.stream()
        .map(s -> structureHierarchy.getSelectorPath(s).toDepth(depth).toString())
        .collect(Collectors.toCollection(LinkedHashSet::new));

    Builder<String> builder = ImmutableList.<String>builder();
    // parent is null until the first path is seen, so an empty input yields an
    // empty list rather than failing on iterator().next().
    String parent = null;
    for (String path : paths) {
      if (parent == null || !path.startsWith(parent)) {
        builder.add(path);
        parent = path;
      }
    }
    return builder.build();
  }

  /**
   * Make fields from definitions and look up the location in the structure.
   *
   * @param structureHierarchy
   *            the structure hierarchy
   * @param definitions
   *            the field definitions
   * @return the list of configurations, in the iteration order of the
   *         definitions
   */
  private List<TemplateFieldConfiguration> makeFields(final CoveringStructureHierarchy structureHierarchy,
      Collection<TemplateFieldDefinition> definitions) {
    List<TemplateFieldConfiguration> fields = new ArrayList<>();
    for (TemplateFieldDefinition templateFieldDefinition : definitions) {
      String fieldPath = structureHierarchy.generatePath(templateFieldDefinition).toString();
      fields.add(makeField(templateFieldDefinition, fieldPath));
    }
    return fields;
  }

  /**
   * Make field from definition and path, copying the required, repeat and
   * regex settings from the definition.
   *
   * @param templateFieldDefinition
   *            the field definition
   * @param fieldPath
   *            the path of the field in the structure
   * @return the configuration
   */
  private TemplateFieldConfiguration makeField(TemplateFieldDefinition templateFieldDefinition, String fieldPath) {
    TemplateFieldConfiguration field = new TemplateFieldConfiguration(templateFieldDefinition.getName(), fieldPath);
    field.setRequired(templateFieldDefinition.getRequired());
    field.setRepeat(templateFieldDefinition.getRepeat());
    field.setRegex(templateFieldDefinition.getRegex());
    return field;
  }

  /**
   * Creates the output writer for the configuration yaml files, creating the
   * output directory if needed. An existing file of the same name is
   * overwritten (with a warning).
   *
   * @param documentSourceName
   *            the document source name
   * @return the writer
   * @throws IOException
   *             Signals that an I/O exception has occurred.
   */
  private Writer createOutputWriter(final String documentSourceName) throws IOException {
    Path directoryPath = Paths.get(outputDirectory);
    if (!Files.exists(directoryPath)) {
      Files.createDirectories(directoryPath);
    }
    String baseName = FilenameUtils.getBaseName(documentSourceName);
    Path outputFilePath = directoryPath.resolve(baseName + ".yaml");
    if (Files.exists(outputFilePath)) {
      getMonitor().warn("Overwriting existing output properties file {}", outputFilePath);
    }
    return Files.newBufferedWriter(outputFilePath, StandardCharsets.UTF_8);
  }
}