//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.cleaners; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import org.apache.uima.UIMAException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.exceptions.BaleenException; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.uima.BaleenAnnotator; import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton; import uk.gov.dstl.baleen.uima.utils.TypeUtils; /** * Merge adjacent entities of the same type. * * This can be useful if a gazetteer contains name parts rather than full names, * and you want to combine those into single entities. * * @baleen.javadoc */ public class MergeAdjacent extends BaleenAnnotator { /** * A regular expression that sets what is allowed between entities * for them to be considered adjacent. * * @baleen.config \h* */ public static final String PARAM_SEPARATOR = "separator"; @ConfigurationParameter(name = PARAM_SEPARATOR, defaultValue="\\h*") String separator; Pattern separatorPattern; /** * A list of the allowed entity types. * Entities will still only be compared with entities of the same type, * but only entity types on this list (excluding subtypes) will be checked. * * This may be ignored by some annotators which inherit from this class. * * @baleen.config */ public static final String PARAM_TYPE = "types"; @ConfigurationParameter(name = PARAM_TYPE, defaultValue={}) String[] types; List<Class<? extends Entity>> classTypes = new ArrayList<>(); @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException{ separatorPattern = Pattern.compile(separator); JCas jCas; try { jCas = JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance()); } catch (UIMAException e) { throw new ResourceInitializationException(e); } for(String type : types){ try{ classTypes.add(TypeUtils.getEntityClass(type, jCas)); }catch(BaleenException e){ getMonitor().error("Couldn't parse type - type will not be included", e); } } if(classTypes.isEmpty()){ getMonitor().warn("No valid types specified, no merging of entities will take place"); } } @Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { for(Class<? extends Entity> klass : classTypes){ processType(jCas, klass); } } private void processType(JCas jCas, Class<? extends Entity> type){ List<List<Entity>> mergeables = findAllEntitiesToMerge(jCas, type); for (List<Entity> mergeable : mergeables) { int begin = mergeable.get(0).getBegin(); int end = mergeable.get(mergeable.size() - 1).getEnd(); Double lowestConfidence = mergeable.stream().min(new Comparator<Entity>() { @Override public int compare(Entity e1, Entity e2) { return Double.compare(e1.getConfidence(), e2.getConfidence()); } }).get().getConfidence(); // Build a new annotation try{ Entity merged = type.getConstructor(JCas.class).newInstance(new Object[] {jCas}); merged.setBegin(begin); merged.setEnd(end); merged.setValue(jCas.getDocumentText().substring(begin, end)); merged.setConfidence(lowestConfidence); if(mergeAdditionalProperties(merged, type, mergeable)){ mergeWithNew(merged, mergeable); }else{ merged = null; } }catch(Exception e){ getMonitor().error("Unable to merge entities", e); } } } private List<List<Entity>> findAllEntitiesToMerge(JCas jCas, Class<? extends Entity> type) { List<List<Entity>> mergeables = new LinkedList<List<Entity>>(); Map<Entity, List<Entity>> toMerge = new HashMap<>(); // Create a mapping of annotations to join together List<Entity> entities = filterEntities(JCasUtil.select(jCas, type), type); for (Entity current : entities) { List<Entity> following = filterEntities(JCasUtil.selectFollowing(jCas, type, current, 1), type); if (following.isEmpty()) continue; Entity next = following.get(0); String between = jCas.getDocumentText().substring(current.getEnd(), next.getBegin()); // Check that the entities are only separated by whitespace, if (separatorPattern.matcher(between).matches() && shouldMerge(current, next)) { List<Entity> list; if (toMerge.containsKey(current)) { list = toMerge.get(current); } else { list = new LinkedList<>(); list.add(current); toMerge.put(current, list); mergeables.add(list); } list.add(next); toMerge.put(next, list); } } return mergeables; } private List<Entity> filterEntities(Collection<? extends Entity> entities, Class<? extends Entity> type){ //A better way to do this would be with .collect(), //but there's a bug with the version of JDK we have installed on the Jenkins server that won't allow that at the moment List<Entity> ret = new ArrayList<>(); entities.stream().filter(e -> e.getClass().equals(type)).forEach(e -> ret.add(e)); return ret; } /** * Returns true if e1 should be merged with e2 */ public boolean shouldMerge(Entity e1, Entity e2){ return true; } /** * Merge additional properties from originalEntities into merged. * * If this method returns false, then none of the entities will be merged */ public boolean mergeAdditionalProperties(Entity merged, Class<? extends Entity> type, List<Entity> originalEntities){ //Do nothing here - this is intended to be overridden if additional merging is required return true; } @Override public AnalysisEngineAction getAction() { Set<Class<? extends Annotation>> annotatorTypes = new HashSet<>(); annotatorTypes.addAll(classTypes); return new AnalysisEngineAction(annotatorTypes, annotatorTypes); } }