//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.uima; import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.apache.uima.UimaContext; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import uk.gov.dstl.baleen.core.history.BaleenHistory; import uk.gov.dstl.baleen.core.metrics.MetricsFactory; import uk.gov.dstl.baleen.core.pipelines.PipelineBuilder; import uk.gov.dstl.baleen.core.utils.BuilderUtils; import uk.gov.dstl.baleen.exceptions.InvalidParameterException; import uk.gov.dstl.baleen.uima.utils.UimaUtils; /** * This class provides basic functionality for a collection reader, such as * metrics and logging, so that we don't need to put it into every annotator * manually. All collection readers in Baleen should inherit from this class and * use any utility methods it provides as required to ensure we standardise * logging and metrics as much as possible. * * @baleen.javadoc */ public abstract class BaleenCollectionReader extends JCasCollectionReader_ImplBase { private UimaMonitor monitor; private UimaSupport support; /** * Baleen History resource * * @baleen.resource uk.gov.dstl.baleen.core.history.BaleenHistory */ public static final String KEY_HISTORY = PipelineBuilder.BALEEN_HISTORY; @ExternalResource(key = KEY_HISTORY, mandatory = false) BaleenHistory history; @Override public final void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // This will do initialization of resources, // but won't be included in the metrics String pipelineName = UimaUtils.getPipelineName(context); monitor = new UimaMonitor(pipelineName, this.getClass()); support = new UimaSupport(pipelineName, this.getClass(), history, monitor, UimaUtils.isMergeDistinctEntities(context)); monitor.startFunction("initialize"); doInitialize(context); monitor.finishFunction("initialize"); } /** * Called when the collection reader is being initialized. Any required * resources, for example, should be opened at this point. * * @param context * The UimaContext for the collection reader */ protected abstract void doInitialize(UimaContext context) throws ResourceInitializationException; @Override public final void getNext(JCas jCas) throws IOException, CollectionException { monitor.startFunction("getNext"); MetricsFactory.getInstance().getPipelineMetrics(monitor.getPipelineName()).startDocumentProcess(); doGetNext(jCas); monitor.finishFunction("getNext"); monitor.persistCounts(); } /** * Called when UIMA wants the next document. The passed CAS object should be * populated with the document content, and any initial annotations. * * @param jCas * The JCas object to populate */ protected abstract void doGetNext(JCas jCas) throws IOException, CollectionException; @Override public final void close() throws IOException { monitor.startFunction("close"); doClose(); monitor.finishFunction("close"); } /** * Called when the collection reader has finished and is closing down. Any * open resources, for example, should be closed at this point. */ protected abstract void doClose() throws IOException; @Override public final Progress[] getProgress() { monitor.startFunction("getProgress"); Progress[] ret = doGetProgress(); monitor.finishFunction("getProgress"); return ret; } @Override public void destroy() { super.destroy(); try { close(); } catch (IOException e) { getMonitor().warn("Close on destroy", e); } } /** * Called when UIMA wants to know how far we've got with processing the * current collection of documents. Most collection readers shouldn't need * (or won't be able) to implement this as how can we give the progress if * we're persistently looking for new data? * * @return An array of progress objects indicating how far we've currently * got. */ public Progress[] doGetProgress() { return new Progress[0]; } /** * Override of the UIMA hasNext() method with logic to continuously check * for new documents until one is found. This prevents the collection reader * from exiting (unless asked to), and so creates a persistent collection * reader and pipeline. */ @Override public final boolean hasNext() throws IOException, CollectionException { monitor.startFunctionTrace("hasNext"); boolean next = doHasNext(); monitor.finishFunctionTrace("hasNext"); return next; } /** * Called when UIMA is asking whether there is another document to process. * Implementations should return whether there is currently a document * available, and not do any waiting for a new document as this is handled * by BaleenCollectionReader. * * @return True if there is another document, false otherwise */ public abstract boolean doHasNext() throws IOException, CollectionException; /** * Takes a string of the class name and return a Class * * @param className * The name of the class, which must implement IContentExtractor * @return The class specified */ public static IContentExtractor getContentExtractor(String className) throws InvalidParameterException { try { return (IContentExtractor) BuilderUtils .getClassFromString(className, "uk.gov.dstl.baleen.contentextractors").newInstance(); } catch (Exception e1) { throw new InvalidParameterException("Could not find or instantiate content extractor " + className, e1); } } protected UimaMonitor getMonitor() { return monitor; } protected UimaSupport getSupport() { return support; } /** Create a configuration map from a context. * @param context the context * @return non-empty map of config param name to config param value */ protected static Map<String, Object> getConfigParameters(UimaContext context){ Map<String, Object> ret = new HashMap<>(); for(String name : context.getConfigParameterNames()){ ret.put(name, context.getConfigParameterValue(name)); } return ret; } }