//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.stats;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;
/**
* Annotate entities using a user specified OpenNLP NER model
*
* <p>An OpenNLP model specified by the user is loaded into a NameFinderME object.
* This is then run over the document content and entities are extracted.
* These entities are then assigned to a user defined type that inherits from the Entity type.
* Common properties, such as the value and confidence, are set, but type specific properties are not set (e.g. currency for Money).
* If no type is specified, a generic Entity is created.</p>
*
*
* @baleen.javadoc
*/
public class OpenNLP extends BaleenAnnotator {
public static final String MODEL_KEY = "namedEntityModel";
private NameFinderME nameFinder;
private Class<? extends Entity> et = null;
/**
* The model to use for entity extraction
*
* @baleen.config model.bin
*/
public static final String PARAM_MODEL = "model";
@ConfigurationParameter(name = PARAM_MODEL, defaultValue="model.bin")
private String model;
/**
* The entity type to annotate matches with
*
* @baleen.config uk.gov.dstl.baleen.types.semantic.Entity
*/
public static final String PARAM_TYPE = "type";
@ConfigurationParameter(name = PARAM_TYPE, defaultValue="uk.gov.dstl.baleen.types.semantic.Entity")
private String type;
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
try{
et = TypeUtils.getEntityClass(type, JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance()));
}catch(UIMAException | BaleenException e){
throw new ResourceInitializationException(e);
}
try(
InputStream modelIn = new FileInputStream(new File(model));
){
nameFinder = new NameFinderME(new TokenNameFinderModel(modelIn));
} catch (IOException e) {
throw new ResourceInitializationException(e);
}
}
@Override
public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
Collection<Sentence> sentences = JCasUtil.select(aJCas, Sentence.class);
for ( Sentence sentence: sentences) {
List<WordToken> sentenceTokens = JCasUtil.selectCovered(WordToken.class, sentence);
List<String> sentenceTokenContent = new ArrayList<String>();
for(WordToken token : sentenceTokens){
sentenceTokenContent.add(token.getCoveredText());
}
Span[] names = nameFinder.find((String[]) Arrays.copyOf(sentenceTokenContent.toArray(), sentenceTokenContent.size(), String[].class));
double[] probs = nameFinder.probs();
for(int i = 0; i < names.length; i++){
Span name = names[i];
Entity entity;
try {
entity = et.getConstructor(JCas.class).newInstance(aJCas);
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
entity.setConfidence((float)probs[i]);
entity.setBegin(sentenceTokens.get(name.getStart()).getBegin());
entity.setEnd(sentenceTokens.get(name.getEnd() - 1).getEnd());
entity.setValue(entity.getCoveredText());
addToJCasIndex(entity);
}
}
nameFinder.clearAdaptiveData();
}
@Override
public void doDestroy(){
nameFinder = null;
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(Sentence.class, WordToken.class), ImmutableSet.of(et));
}
}