//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ConfigUtils;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Remove entities which have a low confidence
*
* <p>All entities are looped through, and should an entity be found that has a confidence below a user specified threshold.
* The user can choose to ignore 0 confidence entities in this process, which are likely to be entities that don't have a confidence set.</p>
*
*
* @baleen.javadoc
*/
public class RemoveLowConfidenceEntities extends BaleenAnnotator {
/**
* The confidence threshold, below which entities will be removed
*
* @baleen.config 0.0
*/
public static final String PARAM_THRESHOLD = "confidenceThreshold";
@ConfigurationParameter(name = PARAM_THRESHOLD,defaultValue="0.0")
private String confidenceThresholdString;
//Parse the confidenceThreshold config parameter into this variable to avoid issues with parameter types
private Float confidenceThreshold;
/**
* Should entities with 0 confidence, usually indicative that it hasn't been set, be ignored?
*
* @baleen.config true
*/
public static final String PARAM_IGNORE_ZERO = "ignoreZeroConfidence";
@ConfigurationParameter(name = PARAM_IGNORE_ZERO,defaultValue="true")
private Boolean ignoreZeroConfidence;
/**
* Initialise the annotator
*/
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
confidenceThreshold = ConfigUtils.stringToFloat(confidenceThresholdString, 0.0f);
}
@Override
public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
List<Entity> toRemove = new ArrayList<Entity>();
FSIterator<Annotation> iter = aJCas.getAnnotationIndex(Entity.type).iterator();
while(iter.hasNext()){
Entity e = (Entity) iter.next();
if(e.getConfidence() < confidenceThreshold && (!ignoreZeroConfidence || e.getConfidence() > 0.0)){
toRemove.add(e);
getMonitor().debug("Low confidence entity found (ID: {}) - this entity will be removed", e.getInternalId());
}
}
removeFromJCasIndex(toRemove);
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(Entity.class), Collections.emptySet());
}
}