//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.core.pipelines; import java.lang.reflect.Field; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.UUID; import org.apache.uima.UIMAException; import org.apache.uima.UIMAFramework; import org.apache.uima.analysis_component.AnalysisComponent; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.factory.ExternalResourceFactory; import org.apache.uima.fit.internal.ResourceManagerFactory; import org.apache.uima.resource.ExternalResourceDescription; import org.apache.uima.resource.Resource; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.yaml.snakeyaml.Yaml; import uk.gov.dstl.baleen.core.history.BaleenHistory; import uk.gov.dstl.baleen.core.history.logging.LoggingBaleenHistory; import uk.gov.dstl.baleen.core.pipelines.orderers.IPipelineOrderer; import uk.gov.dstl.baleen.core.pipelines.orderers.NoOpOrderer; import uk.gov.dstl.baleen.core.utils.BaleenDefaults; import uk.gov.dstl.baleen.core.utils.BuilderUtils; import uk.gov.dstl.baleen.core.utils.YamlConfiguration; import uk.gov.dstl.baleen.exceptions.BaleenException; import uk.gov.dstl.baleen.exceptions.InvalidParameterException; import uk.gov.dstl.baleen.exceptions.MissingParameterException; /** * This class provides functionality to convert a Baleen YAML configuration file into a * {@link BaleenPipeline} that can be executed by Baleen. * <p> * The YAML configuration file should contain a single <i>collectionreader</i> 'object' and a list * of <i>annotators</i> and <i>consumers</i> objects. Each analysis engine should have a * <i>class</i> property, which refers to the class of the annotator. If the class cannot be found * as specified, then the default Baleen package for that type is searched instead (e.g. * uk.gov.dstl.baleen.annotators). If an collection reader, annotator or consumer has no properties * then the class property prefix is optional (i.e. the list item can consist solely of the * annotator class). * <p> * An `orderer` can also be specified to control the ordering of the pipeline. If not specified, the * default pipeline orderer will be used instead. * <p> * Any additional properties on the analysis engine are passed as Params to the analysis engine. * Additionally, any top level objects that aren't expected are assumed to be global parameters that * are passed to all analysis engines. Where locally specified parameters have the same name as * global ones, the local versions take precedent. * <p> * For example: * * <pre> * shape: * color: red * size: large * * history: * class: uk.gov.dstl.baleen.core.history.memory.InMemoryBaleenHistory * mergeDistinctEntities: true * * orderer: NoOpPipelineOrderer * * collectionreader: * class: DummyReader * inputdirectory: \data\input * * annotators: * - DummyAnnotator1 * - class: DummyAnnotatorWithParams * min: 20 * max: 200 * - DummyAnnotator2 * * consumers: * - class: DummyConsumer * shape.color: green * </pre> * * Here, the pipeline would run as follows with provided parameters listed in brackets: * <ul> * <li>DummyReader (shape.color: red, shape.size: large, inputdirectory: \data\input)</li> * <li>DummyAnnotator1 (shape.color: red, shape.size: large)</li> * <li>DummyAnnotatorWithParams (shape.color: red, shape.size: large, min: 20, max: 20)</li> * <li>DummyAnnotator2 (shape.color: red, shape.size: large)</li> * <li>DummyConsumer (shape.color: green, shape.size: large)</li> * </ul> * <p> * Resources are automatically detected (assuming the analysis engine has used the @ExternalResource * annotation) and created. Resources should use global parameters (e.g. shape.color in the above * example) to initialise themselves, as these are the only ones that will be passed to them. * <p> * If not otherwise specified, <em>history.class</em> will default to <code>uk.gov.dstl.baleen.core.history.logging.LoggingBaleenHistory</code>, * and <em>orderer</em> will default to <code>uk.gov.dstl.baleen.core.pipelines.orderers.DependencyGraphPipelineOrderer</code>. * * @baleen.javadoc */ public class PipelineBuilder { /** * Key for the configuration parameter holding the pipeline name */ public static final String PIPELINE_NAME = "__pipelineName"; /** * Key for the configuration parameter holding the annotator UUID */ public static final String ANNOTATOR_UUID = "__uuid"; /** * Key for the resource holding the history object */ public static final String BALEEN_HISTORY = "__baleenHistory"; /** * Metadata key for storing the original YAML configuration */ public static final String ORIGINAL_CONFIG = "__originalConfig"; private static final List<String> ignoreParams = new ArrayList<>(Arrays.asList("class")); private static final Logger LOGGER = LoggerFactory.getLogger(PipelineBuilder.class); protected final String name; protected final String yaml; protected Map<String, Object> globalConfig; protected Map<String, Object> collectionReaderConfig; protected List<Object> annotatorsConfig; protected List<Object> consumersConfig; protected String pipelineOrderer; private ResourceManager resourceManager; private Map<String, ExternalResourceDescription> resourceDescriptors; /** * Construct a PipelineBuilder from the name and YAML * * @param name * Pipeline name * @param yaml * Pipeline YAML */ public PipelineBuilder(String name, String yaml){ this.name = name; this.yaml = yaml; } /** * Create a new BaleenOrderingPipeline from the name and * YAML configuration provided to the constructor */ public BaleenPipeline createNewPipeline() throws BaleenException{ LOGGER.info("Creating pipeline {}", name); //Read in configuration from YAML readConfiguration(); //Initialise resource manager resourceDescriptors = new HashMap<>(); try{ resourceManager = ResourceManagerFactory.newResourceManager(); }catch(UIMAException ue){ throw new BaleenException("Could not create Resource Manager", ue); } //Create components LOGGER.debug("Configuring pipeline orderer"); IPipelineOrderer orderer = createPipelineOrderer(); LOGGER.debug("Configuring history"); ExternalResourceDescription erdHistory = configureHistory(); resourceDescriptors.put(BALEEN_HISTORY, erdHistory); LOGGER.debug("Creating collection reader"); CollectionReader collectionReader = createCollectionReader(); List<AnalysisEngine> annotators; if(annotatorsConfig != null && !annotatorsConfig.isEmpty()){ LOGGER.debug("Creating annotators"); annotators = createAnnotators(); }else{ annotators = Collections.emptyList(); } List<AnalysisEngine> consumers; if(consumersConfig != null && !consumersConfig.isEmpty()){ LOGGER.debug("Creating consumers"); consumers = createConsumers(); }else{ consumers = Collections.emptyList(); } return toPipeline(name, yaml, orderer, collectionReader, annotators, consumers); } /** * Take a number of parameters and return a pipeline (or sub-class) * * @param name * Pipeline name * @param yaml * Original YAML * @param orderer * Pipeline orderer to use * @param collectionReader * Collection reader to use * @param annotators * List of annotators (can be empty) * @param consumers * List of consumers (can be empty) * * @return * Configured BaleenPipeline */ protected BaleenPipeline toPipeline(String name, String yaml, IPipelineOrderer orderer, CollectionReader collectionReader, List<AnalysisEngine> annotators, List<AnalysisEngine> consumers){ return new BaleenPipeline(name, yaml, orderer, collectionReader, annotators, consumers); } /** * Read configuration into the class variables */ @SuppressWarnings("unchecked") protected void readConfiguration(){ LOGGER.debug("Reading configuration"); Yaml y = new Yaml(); String cleanYaml = YamlConfiguration.cleanTabs(yaml); globalConfig = (Map<String, Object>) y.load(cleanYaml); pipelineOrderer = BuilderUtils.getClassNameFromConfig(globalConfig.remove("orderer")); Object s = globalConfig.remove("collectionreader"); if(s instanceof String){ collectionReaderConfig = new HashMap<>(); collectionReaderConfig.put("class", s); }else{ collectionReaderConfig = (Map<String, Object>) s; } annotatorsConfig = (List<Object>) globalConfig.remove("annotators"); consumersConfig = (List<Object>) globalConfig.remove("consumers"); globalConfig = BuilderUtils.flattenConfig(null, globalConfig); globalConfig.put(PIPELINE_NAME, name); } /** * Create a new pipeline orderer */ private IPipelineOrderer createPipelineOrderer(){ if(pipelineOrderer == null){ LOGGER.info("Pipeline orderer not specified - default will be used"); return getDefaultPipelineOrderer(); } Class<?> c; try{ c = BuilderUtils.getClassFromString(pipelineOrderer, getDefaultOrdererPackage()); }catch(InvalidParameterException ipe){ LOGGER.warn("Couldn't find specified orderer - default will be used", ipe); return getDefaultPipelineOrderer(); } if(!(IPipelineOrderer.class.isAssignableFrom(c))){ LOGGER.warn("Specified orderer does not implement IPipelineOrderer interface - default will be used"); return getDefaultPipelineOrderer(); } try { return (IPipelineOrderer) c.newInstance(); } catch (Exception e) { LOGGER.error("Specified orderer does not implement IPipelineOrderer interface - default will be used", e); return getDefaultPipelineOrderer(); } } private IPipelineOrderer getDefaultPipelineOrderer(){ try{ return (IPipelineOrderer) Class.forName(BaleenDefaults.DEFAULT_ORDERER).newInstance(); }catch(Exception e){ LOGGER.error("Unable to create default pipeline orderer, pipeline will run in order specified in configuration", e); return new NoOpOrderer(); } } /** * Configure a new history resource object */ @SuppressWarnings("unchecked") private ExternalResourceDescription configureHistory() { String historyClass = (String) globalConfig.get("history.class"); Class<? extends BaleenHistory> clazz = null; if (historyClass != null) { try { clazz = (Class<? extends BaleenHistory>) Class.forName(historyClass); } catch (ClassNotFoundException | ClassCastException e) { LOGGER.warn("Unable to find perferred history implementation {}", historyClass, e); } } else { LOGGER.warn("No history implementation specified"); } if (clazz == null) { clazz = LoggingBaleenHistory.class; LOGGER.info("Using the default history implementation {}", clazz.getCanonicalName()); } Object[] params = BuilderUtils.extractParams(globalConfig, ignoreParams, getOrCreateResources(clazz)); Object[] stringParams = BuilderUtils.convertToStringArray(params); return ExternalResourceFactory.createExternalResourceDescription(BALEEN_HISTORY, clazz, stringParams); } /** * Create a new Collection Reader */ private CollectionReader createCollectionReader() throws BaleenException{ String className = BuilderUtils.getClassNameFromConfig(collectionReaderConfig); Map<String, Object> params = BuilderUtils.flattenConfig(null, BuilderUtils.getParamsFromConfig(collectionReaderConfig)); if (className == null || className.isEmpty()) { throw new InvalidParameterException("Collection Reader class not specified"); } Map<String, Object> nonNullParams = params; if (nonNullParams == null) { nonNullParams = Collections.emptyMap(); } try { Class<? extends CollectionReader> clazz = BuilderUtils.getClassFromString(className, getDefaultReaderPackage()); Map<String, ExternalResourceDescription> crResources = getOrCreateResources(clazz); Object[] paramArr = BuilderUtils.mergeAndExtractParams(globalConfig, nonNullParams, ignoreParams, crResources); return UIMAFramework.produceCollectionReader(CollectionReaderFactory.createReaderDescription(clazz, paramArr), resourceManager, null); } catch (ResourceInitializationException e) { throw new BaleenException("Couldn't initialize collection reader", e); } } /** * Create a new analysis engine */ private AnalysisEngine createAnalysisEngine(String className, String defaultPackage, Map<String, Object> annotatorConfig, Object originalConfig) throws BaleenException{ if (className == null || className.isEmpty()) { throw new MissingParameterException("No class name provided for annotator, or unable to parse list item - analysis engine will be skipped"); } try { Class<? extends AnalysisComponent> clazz = BuilderUtils.getClassFromString(className, defaultPackage); Map<String, ExternalResourceDescription> aResources = getOrCreateResources(clazz); annotatorConfig.put(ANNOTATOR_UUID, UUID.randomUUID().toString()); Object[] aParams = BuilderUtils.mergeAndExtractParams(globalConfig, annotatorConfig, ignoreParams, aResources); AnalysisEngine ae = createEngine(clazz, resourceManager, aParams); ae.setConfigParameterValue(ORIGINAL_CONFIG, originalConfig); return ae; } catch (BaleenException | ResourceInitializationException e) { throw new BaleenException("Failed to build annotator description - analysis engine will be skipped", e); } } /** * Create new annotators */ private List<AnalysisEngine> createAnnotators() throws BaleenException { List<AnalysisEngine> analysisEngines = new ArrayList<>(); for (Object objAnnotator : annotatorsConfig) { String className = BuilderUtils.getClassNameFromConfig(objAnnotator); Map<String, Object> params = BuilderUtils.flattenConfig(null, BuilderUtils.getParamsFromConfig(objAnnotator)); try{ analysisEngines.add(createAnalysisEngine(className, getDefaultAnnotatorPackage(), params, objAnnotator)); }catch(BaleenException be){ LOGGER.error("Annotator {} could not be created and has been skipped", className, be); } } return analysisEngines; } /** * Create new consumers */ private List<AnalysisEngine> createConsumers() throws BaleenException { List<AnalysisEngine> analysisEngines = new ArrayList<>(); for (Object objConsumer : consumersConfig) { String className = BuilderUtils.getClassNameFromConfig(objConsumer); Map<String, Object> params = BuilderUtils.flattenConfig(null, BuilderUtils.getParamsFromConfig(objConsumer)); try{ analysisEngines.add(createAnalysisEngine(className, getDefaultConsumerPackage(), params, objConsumer)); }catch(BaleenException be){ LOGGER.error("Consumer {} could not be created and has been skipped", className, be); } } return analysisEngines; } /** * Get a resource if it already exists, otherwise create a new one */ @SuppressWarnings("unchecked") private Map<String, ExternalResourceDescription> getOrCreateResources(Class<?> clazz) { Map<String, ExternalResourceDescription> ret = new HashMap<>(); List<Field> fields = new ArrayList<>(); Class<?> c = clazz; while (c != null && c != Object.class) { fields.addAll(Arrays.asList(c.getDeclaredFields())); c = c.getSuperclass(); } for (Field f : fields) { if (f.isAnnotationPresent(ExternalResource.class) && Resource.class.isAssignableFrom(f.getType())) { ExternalResource annotation = f.getAnnotation(ExternalResource.class); String key = annotation.key(); ExternalResourceDescription erd; if (resourceDescriptors.containsKey(key)) { erd = resourceDescriptors.get(key); } else { Map<String, ExternalResourceDescription> erds = getOrCreateResources(f.getType()); Object[] params = BuilderUtils.extractParams(globalConfig, ignoreParams, erds); // Since createExternalResourceDescription actually casts Objects to Strings we need to convert Object[] stringParams = BuilderUtils.convertToStringArray(params); erd = ExternalResourceFactory.createExternalResourceDescription(key, (Class<? extends Resource>) f.getType(), stringParams); resourceDescriptors.put(key, erd); } ret.put(key, erd); } } return ret; } /** * Create a new analysis engine */ private AnalysisEngine createEngine(Class<? extends AnalysisComponent> componentClass, ResourceManager resourceManager, Object... configurationData) throws ResourceInitializationException{ return UIMAFramework.produceAnalysisEngine(AnalysisEngineFactory.createEngineDescription(componentClass, configurationData), resourceManager, null); } /** * Return the package to use as the default location for Orderers. * * This is done as a method rather than accessing the constant * directly so that sub-classes can override it */ protected String getDefaultOrdererPackage(){ return BaleenDefaults.DEFAULT_ORDERER_PACKAGE; } /** * Return the package to use as the default location for Collection Readers. * * This is done as a method rather than accessing the constant * directly so that sub-classes can override it */ protected String getDefaultReaderPackage(){ return BaleenDefaults.DEFAULT_READER_PACKAGE; } /** * Return the package to use as the default location for Annotators. * * This is done as a method rather than accessing the constant * directly so that sub-classes can override it */ protected String getDefaultAnnotatorPackage(){ return BaleenDefaults.DEFAULT_ANNOTATOR_PACKAGE; } /** * Return the package to use as the default location for Consumers. * * This is done as a method rather than accessing the constant * directly so that sub-classes can override it */ protected String getDefaultConsumerPackage(){ return BaleenDefaults.DEFAULT_CONSUMERS_PACKAGE; } }