//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.cleaners.helpers; import java.util.Collection; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * <b>Remove entities which are contained within other entities of the same type</b> * <p> * All entities are looped through, and should an entity be found to be entirely contained within * another entity of the same type it is removed. The comparison is done purely on start and end * positions, and ignores other information within the entity. If two entities of the same type have * the same start and end position, then the one with the lower confidence is removed; and if both * have the same confidence then the first entity in the annotation index is removed. * <p> * Information may be lost by the removal of entities. Some kind of merging of entities might be a * better option. * * Implementations should override shouldMerge to determine if item should be merged, and the * compileEntities to create lists of entities to be considered for merging. * * */ public abstract class AbstractNestedEntities<T extends Entity> extends BaleenAnnotator { @Override public void doProcess(JCas jCas) throws AnalysisEngineProcessException { Collection<List<T>> annotations = compileEntities(jCas); List<MergePair<T>> mergePairs = new LinkedList<MergePair<T>>(); Set<T> toRemove = new HashSet<T>(); for (List<T> entities : annotations) { removeNestedEntities(entities, mergePairs, toRemove); } for (MergePair<T> pair : mergePairs) { T keep = pair.getKeep(); T remove = pair.getRemove(); if (shouldMerge(keep, remove)) { // We used to log the change here, but calling // having any of the parameters in this debug causes a javac compile error // Seems to be a recognised JDK see JDK-8056984, to be fixed in JDK 9 // Issue arose when introducing History. Perhaps this has to do // with the Recordable interface? mergeWithExisting(keep, remove); } else { // If we aren't merging this we should keep both toRemove.remove(remove); } } removeFromJCasIndex(toRemove); } private void removeNestedEntities(List<T> typeAnnotations, List<MergePair<T>> mergePairs, Set<T> toRemove) { for (int x = 0; x < typeAnnotations.size(); x++) { T eX = typeAnnotations.get(x); for (int y = x + 1; y < typeAnnotations.size(); y++) { T eY = typeAnnotations.get(y); if (toRemove.contains(eX) || toRemove.contains(eY)) { // If we are always removing the one of the entities, we should have already // dealt with other cases continue; } if (eX.getBegin() == eY.getBegin() && eX.getEnd() == eY.getEnd()) { removeLeastConfidentEntity(mergePairs, toRemove, eX, eY); } else if (containedWithin(eX, eY)) { // Remove nested entity Y removeEntity(mergePairs, toRemove, eY, eX); } else if (containedWithin(eY, eX)) { // Remove nested entity Y removeEntity(mergePairs, toRemove, eX, eY); } } } } /** * Is e2 contained within e1? */ private boolean containedWithin(T e1, T e2) { return e1.getBegin() <= e2.getBegin() && e1.getEnd() >= e2.getEnd(); } /** * Removes the least confident entity, or e2 if they have the same confidence */ private void removeLeastConfidentEntity(List<MergePair<T>> mergePairs, Set<T> toRemove, T e1, T e2) { if (e1.getConfidence() >= e2.getConfidence()) { removeEntity(mergePairs, toRemove, e2, e1); } else { removeEntity(mergePairs, toRemove, e1, e2); } } private void removeEntity(List<MergePair<T>> mergePairs, Set<T> toRemove, T entityToRemove, T entityToKeep) { toRemove.add(entityToRemove); mergePairs.add(new MergePair<T>(entityToKeep, entityToRemove)); } /** * Get list of entities to processed for overlap and nesting (and hence merged together), each * list will be treated independently. * * @param jCas * the CAS to pull entities from. * @return a collection of lists. The lists should contain entities to be compared together. * typically this will be entities of the same type. */ protected abstract Collection<List<T>> compileEntities(JCas jCas); /** * Determine if these specific pairs of entities should be merged together. * * @param keep * the entity to be kept * @param remove * the entity being considered for removal. * @return false if both entities should be kept, true is the remove entity should be merged * into the kept. */ protected abstract boolean shouldMerge(T keep, T remove); }