//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.core.pipelines;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.DumperOptions;
import org.yaml.snakeyaml.DumperOptions.FlowStyle;
import org.yaml.snakeyaml.Yaml;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineActionStore;
import uk.gov.dstl.baleen.core.pipelines.orderers.IPipelineOrderer;
import uk.gov.dstl.baleen.core.utils.YamlConfiguration;
/**
 * A UIMA-based pipeline that takes a collection reader, annotators and consumers,
 * and orders them using an {@link IPipelineOrderer} before running the pipeline. The
 * annotators and consumers are ordered separately from each other.
 *
 * The pipeline is persistent, and will continuously look for new documents until the
 * pipeline is destroyed.
 *
 * The pipeline can be paused and unpaused. Whilst paused, the pipeline will not look for
 * new documents (but will finish processing the current document).
 */
public class BaleenPipeline implements Runnable {
    private static final Logger LOGGER = LoggerFactory.getLogger(BaleenPipeline.class);

    private final CollectionReader collectionReader;
    private final List<AnalysisEngine> annotators;
    private final List<AnalysisEngine> consumers;

    // Both flags are volatile so that pause()/unpause()/destroy() calls made from
    // another thread are seen by the loop in run()
    private volatile boolean paused = false;
    private volatile boolean destroy = false;

    private final String name;
    private final String originalYaml;

    /**
     * Constructor
     *
     * @param name
     * 		Pipeline name
     * @param originalYaml
     * 		The original YAML string that was used to build the pipeline
     * @param orderer
     * 		The IPipelineOrderer to use to order the pipeline
     * @param collectionReader
     * 		The collection reader
     * @param annotators
     * 		The annotators to be ordered and used
     * @param consumers
     * 		The consumers to be ordered and used
     */
    public BaleenPipeline(String name, String originalYaml, IPipelineOrderer orderer, CollectionReader collectionReader, List<AnalysisEngine> annotators, List<AnalysisEngine> consumers){
        this.name = name;
        this.originalYaml = originalYaml;
        this.collectionReader = collectionReader;

        // Annotators and consumers are ordered independently of each other
        this.annotators = orderer.orderPipeline(annotators);
        this.consumers = orderer.orderPipeline(consumers);
    }

    /**
     * Get the name of the pipeline
     *
     * @return
     * 		Pipeline name
     */
    public String getName(){
        return name;
    }

    /**
     * Get the original YAML used to build the pipeline
     *
     * @return
     * 		Original YAML, or an empty string if no YAML was supplied
     */
    public String originalYaml(){
        return originalYaml == null ? "" : originalYaml;
    }

    /**
     * Get an ordered version of the YAML that matches the actual pipeline order
     *
     * @return
     * 		Ordered YAML, or an empty string if no YAML was supplied or the
     * 		original YAML does not have a mapping at its root
     */
    public String orderedYaml(){
        if(originalYaml == null){
            return "";
        }

        DumperOptions options = new DumperOptions();
        options.setDefaultFlowStyle(FlowStyle.BLOCK);
        options.setPrettyFlow(true);

        Yaml y = new Yaml(options);

        //Load original configuration. An empty document loads as null and a
        //list/scalar document is not a Map; neither can be re-ordered, so check
        //the type explicitly rather than failing later with a NullPointerException
        Object loaded = y.load(YamlConfiguration.cleanTabs(originalYaml));
        if(!(loaded instanceof Map)){
            LOGGER.error("Unable to build ordered YAML string");
            return "";
        }

        @SuppressWarnings("unchecked")
        Map<String, Object> confMap = (Map<String, Object>) loaded;

        //Replace annotators and consumers with ordered versions
        confMap.put("annotators", originalConfigs(annotators));
        confMap.put("consumers", originalConfigs(consumers));

        //Return YAML
        return y.dump(confMap);
    }

    /**
     * Run the pipeline: repeatedly pull documents from the collection reader and pass
     * each one through every annotator and then every consumer, until
     * {@link #destroy()} is called. The collection reader and all analysis engines
     * are destroyed before this method returns.
     */
    @Override
    public void run() {
        //Create a JCas object to be used (and reused)
        JCas jCas;
        try{
            jCas = JCasFactory.createJCas();
        }catch(UIMAException e){
            LOGGER.error("Unable to create JCas object - {} will not run", getType(), e);
            return;
        }

        LOGGER.info("Starting {} {}", getType(), name);

        try{
            //While we're not destroying the pipeline, run a continuous loop
            //NOTE(review): this loop spins without waiting whilst the pipeline is
            //paused or the reader has no documents - consider adding a back-off
            while(!destroy){
                try{
                    //If we're not paused and there are documents to process, then process them
                    while(!destroy && !paused && collectionReader.hasNext()){
                        processNextDocument(jCas);
                    }
                }catch(CollectionException | IOException e){
                    LOGGER.error("Error from collection reader", e);
                }
            }

            LOGGER.debug("Finished processing loop for {} {}", getType(), name);
        }finally{
            //Release components even if an unexpected runtime exception ends the loop
            destroyComponents();
        }
    }

    /**
     * Pause the pipeline
     */
    public void pause(){
        LOGGER.info("Paused {} {}", getType(), name);
        paused = true;
    }

    /**
     * Unpause the pipeline
     */
    public void unpause(){
        LOGGER.info("Unpaused {} {}", getType(), name);
        paused = false;
    }

    /**
     * Returns whether the pipeline is currently paused or not
     *
     * @return
     * 		True if the pipeline is paused, false otherwise
     */
    public boolean isPaused(){
        return paused;
    }

    /**
     * Destroy the pipeline after the current document has finished processing
     */
    public void destroy(){
        LOGGER.info("Destroying {} {} after current document", getType(), name);
        destroy = true;
    }

    /**
     * Return the collection reader used by this pipeline
     *
     * @return
     * 		The collection reader
     */
    public CollectionReader collectionReader(){
        return collectionReader;
    }

    /**
     * Return the annotators used by this pipeline
     *
     * @return
     * 		The annotators, in the order produced by the pipeline orderer
     */
    public List<AnalysisEngine> annotators(){
        return annotators;
    }

    /**
     * Return the consumers used by this pipeline
     *
     * @return
     * 		The consumers, in the order produced by the pipeline orderer
     */
    public List<AnalysisEngine> consumers(){
        return consumers;
    }

    /**
     * Provide the type of pipeline for use in logging
     *
     * @return
     * 		The string "pipeline"
     */
    protected String getType(){
        return "pipeline";
    }

    /**
     * Collect the original configuration of each engine, preserving list order
     */
    private static List<Object> originalConfigs(List<AnalysisEngine> engines){
        List<Object> configs = new ArrayList<>();
        for(AnalysisEngine engine : engines){
            configs.add(engine.getConfigParameterValue(PipelineBuilder.ORIGINAL_CONFIG));
        }
        return configs;
    }

    /**
     * Read the next document into the JCas and run it through the annotators and
     * then the consumers. The JCas is always reset afterwards, so that a failure in
     * the collection reader cannot leave stale content behind for the next document.
     */
    private void processNextDocument(JCas jCas) throws CollectionException, IOException {
        LOGGER.debug("Beginning processing of document on {} {}", getType(), name);

        try{
            //Get next document from Collection Reader
            collectionReader.getNext(jCas.getCas());

            //Process JCas with each annotator in turn
            for(AnalysisEngine ae : annotators){
                processAnalysisEngine(jCas, ae, "annotator");
            }

            //Process JCas with each consumer in turn
            for(AnalysisEngine ae : consumers){
                processAnalysisEngine(jCas, ae, "consumer");
            }
        }finally{
            //Prepare the JCas for the next document
            jCas.reset();
        }
    }

    /**
     * Run a single analysis engine over the JCas, logging (rather than propagating)
     * any processing error so that the remaining engines still run
     */
    private void processAnalysisEngine(JCas jCas, AnalysisEngine ae, String type){
        try{
            ae.process(jCas);
        }catch(AnalysisEngineProcessException e){
            LOGGER.error("Processing error from {} {}", type, ae.getAnalysisEngineMetaData().getName(), e);
        }
    }

    /**
     * Destroy the collection reader and all analysis engines
     */
    private void destroyComponents(){
        LOGGER.debug("Destroying {} {}", getType(), name);

        collectionReader.destroy();
        destroyEngines(annotators);
        destroyEngines(consumers);
    }

    /**
     * Remove each engine from the action store and then destroy it
     */
    private static void destroyEngines(List<AnalysisEngine> engines){
        for(AnalysisEngine ae : engines){
            AnalysisEngineActionStore.getInstance().remove((String)ae.getConfigParameterValue(PipelineBuilder.ANNOTATOR_UUID));
            ae.destroy();
        }
    }
}