//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.cleaners; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.uima.UIMAException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Multimap; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.core.utils.ReflectionUtils; import uk.gov.dstl.baleen.exceptions.BaleenException; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.types.semantic.ReferenceTarget; import uk.gov.dstl.baleen.uima.BaleenAnnotator; import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton; import uk.gov.dstl.baleen.uima.utils.TypeUtils; /** * Coreference entities where a series of entities of the same type appears in brackets. * * For example, "William Tell (Bill)" would coreference William Tell and Bill if they were both person entities. * * @baleen.javadoc */ public class CorefBrackets extends BaleenAnnotator { /** * If two entities are thought to be coreferences, but they have different existing reference targets, should we merge them? * * @baleen.config false */ public static final String PARAM_MERGE_REFERENTS = "mergeReferents"; @ConfigurationParameter(name = PARAM_MERGE_REFERENTS, defaultValue = "false") private boolean mergeReferents = false; /** * A list of the excluded entity types. * * @baleen.config */ public static final String PARAM_TYPE = "excludedTypes"; @ConfigurationParameter(name = PARAM_TYPE, defaultValue={}) String[] excludedTypes; List<Class<? extends Entity>> classTypes = new ArrayList<>(); private static final Pattern BRACKETS = Pattern.compile("^\\s*\\((.*?)\\)"); @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException{ JCas jCas; try { jCas = JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance()); } catch (UIMAException e) { throw new ResourceInitializationException(e); } for(String type : excludedTypes){ try{ classTypes.add(TypeUtils.getEntityClass(type, jCas)); }catch(BaleenException e){ getMonitor().error("Couldn't parse type {} - type will not be excluded", type, e); } } } @Override public void doProcess(JCas jCas) throws AnalysisEngineProcessException { Collection<Entity> entities = JCasUtil.select(jCas, Entity.class); Multimap<Integer, Entity> entityMap = ArrayListMultimap.create(); entities.forEach(e -> entityMap.put(e.getBegin(), e)); for(Entity e : entities){ String followingText = jCas.getDocumentText().substring(e.getEnd()); Matcher m = BRACKETS.matcher(followingText); Integer offset = e.getEnd(); List<Entity> matched = new ArrayList<>(); while(m.find()){ final Integer end = offset + m.end(1); matched.addAll( entityMap.get(offset + m.start(1)).stream() .filter(f -> f.getClass().isAssignableFrom(e.getClass()) || e.getClass().isAssignableFrom(f.getClass())) .filter(f -> f.getEnd() == end) .collect(Collectors.toList()) ); //Prepare matcher to look for next bracket followingText = followingText.substring(m.end()); offset += m.end(); m = BRACKETS.matcher(followingText); } if(!matched.isEmpty()){ matched.add(e); makeCoref(jCas, matched); } } } @Override public AnalysisEngineAction getAction() { Set<Class<? extends Entity>> types = ReflectionUtils.getInstance().getSubTypesOf(Entity.class); types.removeAll(classTypes); Set<Class<? extends Annotation>> annotations = new HashSet<>(); annotations.addAll(types); return new AnalysisEngineAction(annotations, Collections.emptySet()); } private void makeCoref(JCas jCas, Collection<Entity> entities) { Set<ReferenceTarget> rts = new HashSet<>(); for(Entity e : entities){ if(e.getReferent() != null){ rts.add(e.getReferent()); } } if(rts.isEmpty()){ setAllReferents(createReferenceTarget(jCas), entities); }else if(rts.size() == 1){ setAllReferents(rts.toArray(new ReferenceTarget[0])[0], entities); }else{ if(mergeReferents){ ReferenceTarget rt = createReferenceTarget(jCas); setReferents(rt, entities, rts); removeFromJCasIndex(rts); }else{ getMonitor().warn("Multiple existing referents found, only those entities without existing referents will be modified"); setNewReferentIfNull(jCas, entities); } } } /** * Set the referents of all entities in <em>entities</em> to the ReferenceTarget specified by <em>target</em> */ private void setAllReferents(ReferenceTarget target, Collection<Entity> entities){ for(Entity e : entities){ e.setReferent(target); } } /** * For all entities in <em>entities</em> that don't have an existing referent, set them to the same new ReferenceTarget. * This ReferenceTarget is returned if created, else null is returned. */ private ReferenceTarget setNewReferentIfNull(JCas jCas, Collection<Entity> entities){ ReferenceTarget rt = null; for(Entity e : entities){ if(e.getReferent() == null){ if(rt == null){ rt = createReferenceTarget(jCas); } e.setReferent(rt); } } return rt; } /** * Sets the referents to <em>target</em> on all entities that either have a null referent, or a referent that matches one of the targets specified in <em>condition</em> */ private void setReferents(ReferenceTarget target, Collection<Entity> entities, Collection<ReferenceTarget> condition){ for(Entity e : entities){ if(e.getReferent() == null || condition.contains(e.getReferent())){ e.setReferent(target); } } } private ReferenceTarget createReferenceTarget(JCas jCas){ ReferenceTarget rt = new ReferenceTarget(jCas); rt.setBegin(0); rt.setEnd(jCas.getDocumentText().length()); addToJCasIndex(rt); return rt; } }