//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.cleaners;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringJoiner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
 * Where an entity is followed by something that may be its initials in brackets,
 * associate those initials with the entity and extract other occurrences.
 *
 * For the purposes of this annotator, initials are defined as a non-blank uppercase string
 * whose characters all appear in the entity (in the correct order).
 *
 * For example LDN would be accepted as initials for London,
 * but IoP would not be initials for the Institute of Physics as it is not entirely upper case
 * (IOP would be accepted).
 */
public class EntityInitials extends BaleenAnnotator {
  /** Matches a bracketed group (optionally preceded by whitespace) at the very start of the text. */
  private static final Pattern BRACKETS = Pattern.compile("^\\s*\\((.*?)\\)");

  /**
   * If two entities are thought to be coreferences, but they have different existing reference
   * targets, should we merge them?
   *
   * @baleen.config false
   */
  public static final String PARAM_MERGE_REFERENTS = "mergeReferents";

  @ConfigurationParameter(name = PARAM_MERGE_REFERENTS, defaultValue = "false")
  private boolean mergeReferents = false;

  /**
   * Looks at the text immediately following every entity for bracketed initials and, where found,
   * annotates later occurrences of those initials and links them to the entity.
   *
   * @param jCas
   *            JCas object for the current document
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    String documentText = jCas.getDocumentText();
    if (documentText == null) {
      // Nothing to scan
      return;
    }

    // Iterate over a snapshot: new entities are added to the index while we loop,
    // and the live index must not be modified during iteration.
    List<Entity> entities = new ArrayList<>(JCasUtil.select(jCas, Entity.class));
    for (Entity e : entities) {
      processEntity(jCas, documentText, e);
    }
  }

  /**
   * Checks each bracketed group that directly follows the entity; when a group holds valid
   * initials, finds every occurrence of them from that point on and links them to the entity.
   *
   * @param jCas
   *            JCas object for the current document
   * @param documentText
   *            The full document text
   * @param e
   *            The entity whose trailing brackets are inspected
   */
  private void processEntity(JCas jCas, String documentText, Entity e) {
    // offset is the document position at which subsequentText starts
    int offset = e.getEnd();
    String subsequentText = documentText.substring(offset);

    Matcher m = BRACKETS.matcher(subsequentText);
    while (m.find()) {
      String candidate = m.group(1);

      if (isInitials(e.getCoveredText(), candidate)) {
        // Fresh lists per bracket group so already-indexed entities are never re-added
        List<Entity> addedEntities = new ArrayList<>();
        List<Entity> existingEntities = new ArrayList<>();

        // Find all instances of the initials, including the bracketed instance itself.
        // The candidate is quoted so that it is always treated as literal text.
        Pattern p = Pattern.compile("\\b" + Pattern.quote(candidate) + "\\b");
        Matcher mInitial = p.matcher(subsequentText);

        // Initials should only appear after they have been defined
        while (mInitial.find()) {
          int begin = offset + mInitial.start();
          int end = offset + mInitial.end();

          List<? extends Entity> existing = JCasUtil.selectCovered(jCas, e.getClass(), begin, end);
          if (existing.isEmpty()) {
            Entity created = createEntity(jCas, e, begin, end);
            if (created != null) {
              addedEntities.add(created);
            }
          } else {
            existingEntities.addAll(existing);
          }
        }

        // Add entities to JCas and merge references
        addToJCasIndex(addedEntities);
        mergeReferences(jCas, e, existingEntities, addedEntities);
      }

      // BRACKETS is anchored to the start of the text, so advance past this group
      // and look for another bracketed group immediately following it.
      offset += m.end();
      subsequentText = subsequentText.substring(m.end());
      m = BRACKETS.matcher(subsequentText);
    }
  }

  /**
   * Creates a new entity of the same class as the template, covering the given span.
   * The new entity is not added to the index here.
   *
   * @param jCas
   *            JCas object for the current document
   * @param template
   *            Entity whose class is used for the new entity
   * @param begin
   *            Begin offset of the new entity
   * @param end
   *            End offset of the new entity
   * @return the new entity, or null if it could not be created (the failure is logged)
   */
  private Entity createEntity(JCas jCas, Entity template, int begin, int end) {
    try {
      Entity created = template.getClass().getConstructor(JCas.class).newInstance(jCas);
      created.setBegin(begin);
      created.setEnd(end);
      //TODO: Should we copy properties across too?
      return created;
    } catch (Exception ex) {
      // Best effort: a failure for one occurrence must not stop processing of the others
      getMonitor().error("Unable to create new entity of class {}", template.getClass().getName(), ex);
      return null;
    }
  }

  /**
   * Returns true if candidateInitials is a valid set of initials for text.
   *
   * The candidate must be non-blank, must be unchanged by upper-casing, and its characters must
   * appear, in order, within the upper-cased text.
   *
   * @param text
   *            The covered text of the entity
   * @param candidateInitials
   *            The content found between the brackets
   * @return true if the candidate is accepted as initials for the text
   */
  private boolean isInitials(String text, String candidateInitials) {
    // An empty or whitespace-only candidate would otherwise match any text
    if (candidateInitials.trim().isEmpty()) {
      return false;
    }

    // Locale.ROOT keeps the case conversion independent of the platform's default locale
    if (!candidateInitials.toUpperCase(Locale.ROOT).equals(candidateInitials)) {
      return false;
    }

    // Build .*A.*B.*C.* with each character quoted as a literal
    StringJoiner sj = new StringJoiner(".*", ".*", ".*");
    for (int i = 0; i < candidateInitials.length(); i++) {
      sj.add(Pattern.quote(candidateInitials.substring(i, i + 1)));
    }

    // DOTALL so that an entity spanning a line break can still be matched
    return Pattern.compile(sj.toString(), Pattern.DOTALL)
        .matcher(text.toUpperCase(Locale.ROOT))
        .matches();
  }

  /**
   * Merge the references of initials with the original entity
   *
   * @param jCas
   *            JCas object for the current document
   * @param e
   *            The original entity
   * @param existingEntities
   *            List of entities already covered by the found initials
   * @param addedEntities
   *            List of new entities created as a result of the initials
   */
  private void mergeReferences(JCas jCas, Entity e, List<Entity> existingEntities, List<Entity> addedEntities) {
    ReferenceTarget rt = setReferenceTarget(jCas, e, getReferenceTargets(existingEntities));

    // All new entities should have same RT as initial entity
    for (Entity ent : addedEntities) {
      ent.setReferent(rt);
    }

    // Existing entities without an RT adopt ours; those with one are only merged if configured
    Set<ReferenceTarget> mergeRts = new HashSet<>();
    for (Entity ent : existingEntities) {
      if (ent.getReferent() == null) {
        ent.setReferent(rt);
      } else if (mergeReferents) {
        mergeRts.add(ent.getReferent());
      }
    }

    // Re-point every entity in the document that uses one of the merged reference targets
    if (!mergeRts.isEmpty()) {
      for (Entity ent : JCasUtil.select(jCas, Entity.class)) {
        if (ent.getReferent() != null && mergeRts.contains(ent.getReferent())) {
          ent.setReferent(rt);
        }
      }
    }
  }

  /**
   * Collects the non-null reference targets of the given entities, in order
   * (duplicates are kept).
   *
   * @param entities
   *            Entities to inspect
   * @return list of the reference targets found; empty if none of the entities has one
   */
  private List<ReferenceTarget> getReferenceTargets(List<Entity> entities) {
    List<ReferenceTarget> rts = new ArrayList<>();
    for (Entity ent : entities) {
      if (ent.getReferent() != null) {
        rts.add(ent.getReferent());
      }
    }
    return rts;
  }

  /**
   * Decides which reference target the entity (and its initials) should use, assigning one to
   * the entity if it does not already have one.
   *
   * @param jCas
   *            JCas object for the current document
   * @param e
   *            The original entity
   * @param rts
   *            Reference targets already held by entities covering the initials
   * @return the reference target now associated with the entity
   */
  private ReferenceTarget setReferenceTarget(JCas jCas, Entity e, List<ReferenceTarget> rts) {
    ReferenceTarget rt;
    if (rts.size() == 1 && e.getReferent() == null) {
      // Entity doesn't have RT, but one of the subsequent initials does
      rt = rts.get(0);
      e.setReferent(rt);
    } else if (e.getReferent() != null) {
      // Entity has RT
      rt = e.getReferent();
    } else {
      // Entity doesn't have RT, and 0 or several of the subsequent initials do
      rt = new ReferenceTarget(jCas);
      addToJCasIndex(rt);
      e.setReferent(rt);
    }
    return rt;
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(ImmutableSet.of(Entity.class), ImmutableSet.of(Entity.class));
  }
}