//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;
/**
 * Remove entities that appear on a blacklist supplied by the user
 *
 * <p>Every annotation of the configured entity type is inspected, and any
 * annotation whose covered text equals one of the blacklisted values is removed.
 * The comparison can be case sensitive or case insensitive (the default), and
 * can be restricted to a specific entity type or applied to all entity types.</p>
 *
 * <p>Case-insensitive comparison lower-cases both the blacklist and the covered
 * text with {@link Locale#ROOT}, so the outcome does not depend on the default
 * locale of the JVM.</p>
 *
 * @baleen.javadoc
 */
public class Blacklist extends BaleenAnnotator {
  /**
   * A list of blacklisted entity values to remove from the CAS
   *
   * @baleen.config
   */
  public static final String PARAM_BLACKLIST = "blacklist";

  @ConfigurationParameter(name = PARAM_BLACKLIST, defaultValue = {})
  String[] terms;

  /** The configured terms, lower-cased when matching is case insensitive. */
  List<String> thingsToRemove = null;

  /** Hash-based copy of {@link #thingsToRemove}, used for the per-entity lookup. */
  private Set<String> blacklistLookup = Collections.emptySet();

  /**
   * Should the comparison of the blacklist with entity values be done case sensitively?
   *
   * @baleen.config false
   */
  public static final String PARAM_CASE_SENSITIVE = "caseSensitive";

  @ConfigurationParameter(name = PARAM_CASE_SENSITIVE, defaultValue = "false")
  Boolean caseSensitive;

  /**
   * The entity type to compare against the blacklist, including subclasses of this type
   *
   * @baleen.config uk.gov.dstl.baleen.types.semantic.Entity
   */
  public static final String PARAM_TYPE = "type";

  @ConfigurationParameter(name = PARAM_TYPE, defaultValue = "uk.gov.dstl.baleen.types.semantic.Entity")
  String type;

  /** Annotation class resolved from {@link #type} during initialisation. */
  Class<? extends Annotation> et = null;

  /**
   * Resolves the configured entity type and prepares the blacklist for matching.
   *
   * @param aContext the UIMA context (not used directly here)
   * @throws ResourceInitializationException if the entity type cannot be resolved
   */
  @Override
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    try {
      et = TypeUtils.getEntityClass(
          type, JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance()));
    } catch (UIMAException | BaleenException e) {
      throw new ResourceInitializationException(e);
    }

    // NOTE(review): terms may be left null if no blacklist is configured - confirm
    // against uimaFIT's handling of an empty default; treated here as an empty blacklist.
    List<String> configured =
        terms == null ? Collections.<String>emptyList() : Arrays.asList(terms);

    List<String> prepared = new ArrayList<>(configured.size());
    for (String term : configured) {
      // A null entry can never equal an entity's text, so it is dropped.
      if (term != null) {
        prepared.add(normalise(term));
      }
    }

    thingsToRemove = prepared;
    blacklistLookup = new HashSet<>(prepared);
  }

  /**
   * Removes every annotation of the configured type whose covered text is blacklisted.
   *
   * @param aJCas the document being processed
   * @throws AnalysisEngineProcessException if an instance of the configured type
   *     cannot be created for this JCas
   */
  @Override
  public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
    // An instance of the configured class is created only to obtain its UIMA type.
    Entity e;
    try {
      e = (Entity) et.getConstructor(JCas.class).newInstance(aJCas);
    } catch (Exception ex) {
      throw new AnalysisEngineProcessException(ex);
    }

    // Matches are collected first and removed afterwards, so the index is not
    // modified while it is being iterated.
    Set<Entity> toRemove = new HashSet<>();
    FSIndex<Annotation> index = aJCas.getAnnotationIndex(e.getType());
    for (Annotation a : index) {
      Entity entity = (Entity) a;
      String val = entity.getCoveredText();
      if (val == null) {
        // No covered text is available, so there is nothing to compare.
        continue;
      }

      if (blacklistLookup.contains(normalise(val))) {
        getMonitor().info("Removing entity '{}' because it appears on the blacklist", val);
        toRemove.add(entity);
      }
    }

    getMonitor().debug("{} has removed {} entities", this.getClass().getName(), toRemove.size());
    for (Entity ent : toRemove) {
      removeFromJCasIndex(ent);
    }
  }

  /** Releases the prepared blacklist and the resolved entity class. */
  @Override
  public void doDestroy() {
    thingsToRemove = null;
    blacklistLookup = Collections.emptySet();
    et = null;
  }

  /**
   * Puts a value into the form used for comparison.
   *
   * @param value the blacklist term or covered text, never null
   * @return the value unchanged when matching is case sensitive, otherwise the
   *     value lower-cased with {@link Locale#ROOT}
   */
  private String normalise(String value) {
    // An unset flag falls back to the documented default of case-insensitive matching.
    if (Boolean.TRUE.equals(caseSensitive)) {
      return value;
    }
    return value.toLowerCase(Locale.ROOT);
  }

  /**
   * Describes this annotator's action for pipeline ordering.
   *
   * @return an action built from the configured entity class and an empty set
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(ImmutableSet.of(et), Collections.emptySet());
  }
}