//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import uk.gov.dstl.baleen.annotators.cleaners.helpers.AbstractNestedEntities;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.ReflectionUtils;
import uk.gov.dstl.baleen.exceptions.BaleenException;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeUtils;

/**
 * Remove entities which are contained within other entities of any type.
 * <p>
 * This is useful for relations and event extraction pipelines where having overlapping entities
 * will produce poorer results.
 * <p>
 * For example the "The British Army fought in Iraq" might provide entities British[Nationality],
 * British Army[Organisation] and Iraq[Location]. A simple relationship extraction may relate all
 * entities in a sentence British-British Army, British Army-Iraq and British-Iraq. The final
 * relation is true in this case but not really the meaning of the text - the word British was not
 * meant to be considered in isolation.
 *
 * @baleen.javadoc
 */
public class RemoveOverlappingEntities extends AbstractNestedEntities<Entity> {

    /**
     * A list of types to exclude when removing nested entities.
     *
     * @baleen.config
     */
    public static final String PARAM_EXCLUDED_TYPES = "excludedTypes";
    @ConfigurationParameter(name = PARAM_EXCLUDED_TYPES, defaultValue = {})
    private Set<String> excluded;

    /** Entity classes resolved from {@link #excluded} during {@link #doInitialize(UimaContext)}. */
    Set<Class<? extends Annotation>> classTypes = new HashSet<>();

    /**
     * Resolves every configured excluded type name to an entity class.
     * <p>
     * A JCas is created from the {@link TypeSystemSingleton} type system description and
     * handed to {@link TypeUtils#getEntityClass(String, JCas)} for each name. A name that
     * cannot be resolved is logged as an error and skipped.
     *
     * @param aContext the UIMA context (not used by this method)
     * @throws ResourceInitializationException if the JCas cannot be created
     */
    @Override
    public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
        JCas jCas;
        try {
            jCas = JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance());
        } catch (UIMAException e) {
            throw new ResourceInitializationException(e);
        }

        for (String type : excluded) {
            try {
                classTypes.add(TypeUtils.getEntityClass(type, jCas));
            } catch (BaleenException e) {
                getMonitor().error("Couldn't parse type {} - type will not be excluded", type, e);
            }
        }
    }

    /**
     * Gathers the entities of the document that are candidates for overlap removal.
     * <p>
     * Every annotation in the {@link Entity} index is collected, except those whose UIMA type
     * name appears in the configured exclusion list or whose class was resolved from that
     * list at initialisation. All collected entities are returned as one single group.
     *
     * @param jCas the document being processed
     * @return a singleton collection holding the list of non-excluded entities
     */
    @Override
    protected Collection<List<Entity>> compileEntities(JCas jCas) {
        Set<Entity> annotations = new HashSet<>();

        FSIterator<Annotation> iter = jCas.getAnnotationIndex(Entity.type).iterator();
        while (iter.hasNext()) {
            Entity e = (Entity) iter.next();
            if (!isExcluded(e)) {
                annotations.add(e);
            }
        }

        return Collections.singleton(new ArrayList<>(annotations));
    }

    /**
     * Decides whether an entity belongs to one of the excluded types.
     * <p>
     * The configured strings are checked against the entity's UIMA type name, as before, and
     * the entity's class is additionally checked against the classes resolved in
     * {@link #doInitialize(UimaContext)}, so that this filter agrees with the set of types
     * removed in {@link #getAction()}.
     * NOTE(review): the type name is presumably fully qualified, in which case a type
     * configured by its short name would never match the string check alone - confirm.
     */
    private boolean isExcluded(Entity e) {
        String type = e.getType().getName();
        return excluded.contains(type) || classTypes.contains(e.getClass());
    }

    /**
     * Always allows the merge, whatever the two entities are.
     *
     * @param keep the entity that would be kept
     * @param remove the entity that would be removed
     * @return always {@code true}
     */
    @Override
    protected boolean shouldMerge(Entity keep, Entity remove) {
        // Merge everything
        return true;
    }

    /** Drops the reference to the configured exclusion list. */
    @Override
    public void doDestroy() {
        excluded = null;
    }

    /**
     * Describes the annotation types this annotator works with.
     * <p>
     * The first set passed to {@link AnalysisEngineAction} holds every subtype of
     * {@link Entity} reported by {@link ReflectionUtils}, minus the excluded classes; the
     * second set is empty (presumably inputs and outputs respectively - confirm against
     * {@link AnalysisEngineAction}).
     *
     * @return the action for this annotator
     */
    @Override
    public AnalysisEngineAction getAction() {
        Set<Class<? extends Entity>> types =
                ReflectionUtils.getInstance().getSubTypesOf(Entity.class);

        // Work on a copy: the set handed back by the reflection helper is not ours to modify.
        Set<Class<? extends Annotation>> annotations = new HashSet<>();
        annotations.addAll(types);
        annotations.removeAll(classTypes);

        return new AnalysisEngineAction(annotations, Collections.emptySet());
    }
}