//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.stats;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ConfigUtils;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Set the document type parameter on a document using an OpenNLP DocCat model
*
* <p>
* An OpenNLP DocCat (Document Categorisation) model, specified by the user, is
* used to categorise the document. A minimum confidence threshold can be
* specified, and categorisations under that threshold will not be used.
* </p>
*
*
* @baleen.javadoc
*/
public class DocumentType extends BaleenAnnotator {
/**
* The minimum confidence to have in a categorisation before it is used
*
* @baleen.config 0.7
*/
public static final String PARAM_CONFIDENCE_THRESHOLD = "confidenceThreshold";
@ConfigurationParameter(name = PARAM_CONFIDENCE_THRESHOLD, defaultValue = "0.7")
private String thresholdString;
//Parse the threshold config parameter into this variable to avoid issues with parameter types
private Float threshold;
/**
* The model to use for document categorisation
*
* @baleen.config doctype.bin
*/
public static final String PARAM_MODEL = "model";
@ConfigurationParameter(name = PARAM_MODEL, defaultValue = "doctype.bin")
private String modelFile = null;
private DocumentCategorizerME doccat = null;
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
threshold = ConfigUtils.stringToFloat(thresholdString, 0.7f);
DoccatModel model;
try (InputStream modelIn = new FileInputStream(modelFile)) {
model = new DoccatModel(modelIn);
doccat = new DocumentCategorizerME(model);
} catch (IOException e) {
getMonitor().error("Couldn't load OpenNLP DocCat model '{}' - annotator unable to initialise", modelFile);
throw new ResourceInitializationException(e);
}
}
@Override
public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
DocumentAnnotation da = getDocumentAnnotation(jCas);
if (Strings.isNullOrEmpty(da.getDocType())) {
double[] outcomes = doccat.categorize(SimpleTokenizer.INSTANCE.tokenize(jCas.getDocumentText()));
String cat = doccat.getBestCategory(outcomes);
double max = -Double.MAX_VALUE;
for (double d : outcomes) {
if (d > max) {
max = d;
}
}
if (threshold != null && max > threshold) {
da.setDocType(cat);
}
} else {
getMonitor().warn("A DocType annotation already exists. A second annotation will not be added.");
}
}
@Override
public void doDestroy() {
doccat = null;
threshold = null;
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(DocumentAnnotation.class));
}
}