//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.coreference; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Objects; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.annotators.coreference.impl.MentionDetector; import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster; import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention; import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.AcronymEnhancer; import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.AnimacyEnhancer; import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.GenderEnhancer; import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.MentionEnhancer; import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.MultiplicityEnhancer; import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.PersonEnhancer; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.CoreferenceSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.ExactStringMatchSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.ExtractReferenceTargets; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.InSentencePronounSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.PreciseConstructsSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.PronounResolutionSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.ProperHeadMatchSieve; 
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.RelaxedHeadMatchSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.RelaxedStringMatchSieve; import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.StrictHeadMatchSieve; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.resources.SharedGenderMultiplicityResource; import uk.gov.dstl.baleen.resources.SharedStopwordResource; import uk.gov.dstl.baleen.types.Base; import uk.gov.dstl.baleen.types.common.CommsIdentifier; import uk.gov.dstl.baleen.types.common.DocumentReference; import uk.gov.dstl.baleen.types.common.Frequency; import uk.gov.dstl.baleen.types.common.Money; import uk.gov.dstl.baleen.types.common.Nationality; import uk.gov.dstl.baleen.types.common.Organisation; import uk.gov.dstl.baleen.types.common.Person; import uk.gov.dstl.baleen.types.common.Url; import uk.gov.dstl.baleen.types.common.Vehicle; import uk.gov.dstl.baleen.types.geo.Coordinate; import uk.gov.dstl.baleen.types.language.PhraseChunk; import uk.gov.dstl.baleen.types.language.Sentence; import uk.gov.dstl.baleen.types.language.WordToken; import uk.gov.dstl.baleen.types.military.MilitaryPlatform; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.types.semantic.Location; import uk.gov.dstl.baleen.types.semantic.ReferenceTarget; import uk.gov.dstl.baleen.types.semantic.Temporal; import uk.gov.dstl.baleen.uima.BaleenAnnotator; import uk.gov.dstl.baleen.uima.grammar.DependencyGraph; import uk.gov.dstl.baleen.uima.grammar.ParseTree; /** * Resolves coreferent entities. * <p> * In effect the Stanford approach is a set of 10+ passes which address the different types of * coreference. At each stage mentions are related, each related mention is added to a cluster (a * set of mentions which are related). At the end of the process the clusters are joined * transitively and all mentions inside a cluster are considered coreferent. 
 * <p> * A mention is a NP, entity or pronoun. In Stanford the largest NP is taken, within Baleen we felt * that entities are more important, therefore we take the largest NP which does not contain a NP. * <p> * TODO: Review mention extraction * <p> * This is a partial implementation at present, and so will not perform as well as the * StanfordCoreNlp coreference. This is partially due to time constraints. * <p> * The following implementation details to date: * <ul> * <li>Mention detection: Done * <li>Pass 1 Speaker Identification: TODO * <li>Pass 2 Exact String Match: Done * <li>Pass 3 Relaxed String Match: Done * <li>Pass X: We added a pronoun match within the same sentence. * <li>Pass 4 Precise Constructs: Done - appositive, predicate, relative pronoun, acronym. Not done * - role appositive (since Baleen doesn't have a role entity to mark up). Done elsewhere - demonyms * are covered in the NationalityToLocation annotator. * <li>Pass 5-7 Strict Head Match: Done * <li>Pass 8 Proper Head Noun Match: Done * <li>Pass 9 Relaxed Head Match: Done * <li>Pass 10 Pronoun Resolution: Done * <li>Post process: Done * <li>Output: Done * </ul> * * Attributes of mentions (gender, animacy, number) are included, but for animacy we could not get * the data (Ji and Lin, 2009) and it says for research use only anyway. As such we ignore the * dictionary lookup. * <p> * We discard any algorithms which are for a specific corpus (eg OntoNotes). * <p> * This is very much unoptimised. Each sieve will calculate over all entities, even though many will * already be in the same cluster. * <p> * TODO: At the moment we don't do the clustering properly. We need to just perform pairwise operations * repeatedly. * * For more information see the various supporting papers. 
* <ul> * <li>http://nlp.stanford.edu/software/dcoref.shtml * <li>http://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00152 * <li>http://nlp.stanford.edu/pubs/discourse-referent-lifespans.pdf * <li>http://nlp.stanford.edu/pubs/conllst2011-coref.pdf * <li>http://nlp.stanford.edu/pubs/coreference-emnlp10.pdf * </ul> * * TODO: To really improve further, we need an analysis of what is missing higher up Baleen. For * example we don't have roles or the animacy information so "a doctor" is just a noun phrase and * hence could be mapped to it. If we had "person role" entity marker we would mark this an ANIMATE. * * @baleen.javadoc */ public class SieveCoreference extends BaleenAnnotator { /** * Connection to Stopwords Resource * * @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource */ public static final String KEY_STOPWORDS = "stopwords"; @ExternalResource(key = KEY_STOPWORDS) protected SharedStopwordResource stopwordResource; /** * GenderMultiplicityResource to provide information on gender and multiplicity from a * dictionary. * * @baleen.resource uk.gov.dstl.baleen.resources.GenderMultiplicityResource */ public static final String KEY_GENDER_MULTIPLICITY = "genderMultiplicity"; @ExternalResource(key = KEY_GENDER_MULTIPLICITY) private SharedGenderMultiplicityResource genderMultiplicityResource; /** * The stoplist to use. If the stoplist matches one of the enum's provided in * {@link uk.gov.dstl.baleen.resources.SharedStopwordResource#StopwordList}, then * that list will be loaded. * * Otherwise, the string is taken to be a file path and that file is used. * The format of the file is expected to be one stopword per line. * * @baleen.config DEFAULT */ public static final String PARAM_STOPLIST = "stoplist"; @ConfigurationParameter(name = PARAM_STOPLIST, defaultValue="DEFAULT") protected String stoplist; /** * Perform only a single pass (of the provided index) * * Only useful for unit testing. 
* * -1 means all * * @baleen.config -1 */ public static final String PARAM_SINGLE_PASS = "pass"; @ConfigurationParameter(name = PARAM_SINGLE_PASS, defaultValue = "-1") private int singlePass; /** * Should pronomial resolution (John - he) be performed. * * This is the worst performing sieve in that is must 'guess' without any real rules what entity * the pronoun is referring to. We currently have little data about animacy etc which will help * (They - BBC ok, He - BBC not ok). * * Currently a closest entity of the same type is used, but that won't perform well in many * cases. * * @baleen.config pronomial false */ public static final String PARAM_INCLUDE_PRONOMIAL = "pronomial"; @ConfigurationParameter(name = PARAM_INCLUDE_PRONOMIAL, defaultValue = "false") private boolean includePronomial; protected Collection<String> stopwords; @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { super.doInitialize(aContext); try{ stopwords = stopwordResource.getStopwords(SharedStopwordResource.StopwordList.valueOf(stoplist)); }catch(IOException ioe){ getMonitor().error("Unable to load stopwords", ioe); throw new ResourceInitializationException(ioe); } } @Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { DependencyGraph dependencyGraph = DependencyGraph.build(jCas); ParseTree parseTree = ParseTree.build(jCas); // Detect mentions List<Mention> mentions = new MentionDetector(jCas, dependencyGraph).detect(); // Extract head words and other aspects needed for later, determine acronyms, denonym, gender, etc enhanceMention(mentions); List<Cluster> clusters = sieve(jCas, parseTree, mentions); // Post processing postProcess(clusters); // Output to reference targets outputReferenceTargets(jCas, clusters); } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class, WordToken.class, Entity.class, Sentence.class, CommsIdentifier.class, 
DocumentReference.class, Frequency.class, Money.class, Url.class, Vehicle.class, Coordinate.class, MilitaryPlatform.class, Location.class, Temporal.class, Nationality.class, Person.class, Organisation.class), Collections.emptySet()); } private void enhanceMention(List<Mention> mentions) { MentionEnhancer[] enhancers = new MentionEnhancer[] { new AcronymEnhancer(), new PersonEnhancer(), new MultiplicityEnhancer(genderMultiplicityResource), new GenderEnhancer(genderMultiplicityResource), new AnimacyEnhancer() }; for (Mention mention : mentions) { for (MentionEnhancer enhancer : enhancers) { enhancer.enhance(mention); } } } private List<Cluster> sieve(JCas jCas, ParseTree parseTree, List<Mention> mentions) { List<Cluster> clusters = new ArrayList<>(); CoreferenceSieve[] sieves = new CoreferenceSieve[] { new ExtractReferenceTargets(jCas, clusters, mentions), // Good // TODO: SpeakerIdentificationSieve not implemented new ExactStringMatchSieve(jCas, clusters, mentions), // Good new RelaxedStringMatchSieve(jCas, clusters, mentions), // Good new InSentencePronounSieve(jCas, clusters, mentions), // Good new PreciseConstructsSieve(jCas, parseTree, clusters, mentions), // Good // Pass A-C are all strict head with different params new StrictHeadMatchSieve(jCas, clusters, mentions, true, true, stopwords), // Good new StrictHeadMatchSieve(jCas, clusters, mentions, true, false, stopwords), // Good new StrictHeadMatchSieve(jCas, clusters, mentions, false, true, stopwords), // Good new ProperHeadMatchSieve(jCas, clusters, mentions), // Good new RelaxedHeadMatchSieve(jCas, clusters, mentions, stopwords), // Good includePronomial ? new PronounResolutionSieve(jCas, clusters, mentions) : null // Questionable - Needs more help from // Baleen entities yet and more data from animacy if its to work well. 
}; if (singlePass >= 0 && sieves.length > singlePass) { sieves = new CoreferenceSieve[] { sieves[singlePass] }; getMonitor().info("Single pass mode {}: {}", singlePass, sieves[0].getClass().getSimpleName()); } Arrays.stream(sieves) .filter(Objects::nonNull) .forEach(CoreferenceSieve::sieve); return clusters; } private void postProcess(List<Cluster> clusters) { // NOTE: The paper says the two rules are *only* used in OntoNotes: // 1. Remove singleton clusters // 2. Short mentions of appositive patterns // We implement 1, as it makes sense genreally and leave 2 as an OntoNotes specific // optimisation. Iterator<Cluster> iterator = clusters.iterator(); while (iterator.hasNext()) { Cluster cluster = iterator.next(); if (cluster.getSize() <= 1) { iterator.remove(); } } } private void outputReferenceTargets(JCas jCas, List<Cluster> clusters) { // Merge the clusters together List<Cluster> merged = mergeClusters(clusters); // Remove all the previous reference targets as we've included them in our process ArrayList<ReferenceTarget> toRemove = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class)); removeFromJCasIndex(toRemove); // Save clusters a referent targets merged.forEach(c -> { ReferenceTarget target = new ReferenceTarget(jCas); for (Mention m : c.getMentions()) { // We overwrite the referent target here, given that we used the initial target to // bootstrap our work // TODO: Could add an option not to override here. Base annotation = m.getAnnotation(); annotation.setReferent(target); } addToJCasIndex(target); }); } private List<Cluster> mergeClusters(List<Cluster> clusters) { List<Cluster> merged = new ArrayList<>(clusters.size()); for (Cluster cluster : clusters) { boolean overlap = false; for (Cluster mergedCluster : merged) { if (mergedCluster.intersects(cluster)) { mergedCluster.add(cluster); overlap = true; break; } } if (!overlap) { merged.add(cluster); } } return merged; } }