//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.annotators.coreference.impl.MentionDetector;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.AcronymEnhancer;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.AnimacyEnhancer;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.GenderEnhancer;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.MentionEnhancer;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.MultiplicityEnhancer;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.PersonEnhancer;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.CoreferenceSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.ExactStringMatchSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.ExtractReferenceTargets;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.InSentencePronounSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.PreciseConstructsSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.PronounResolutionSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.ProperHeadMatchSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.RelaxedHeadMatchSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.RelaxedStringMatchSieve;
import uk.gov.dstl.baleen.annotators.coreference.impl.sieves.StrictHeadMatchSieve;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.resources.SharedGenderMultiplicityResource;
import uk.gov.dstl.baleen.resources.SharedStopwordResource;
import uk.gov.dstl.baleen.types.Base;
import uk.gov.dstl.baleen.types.common.CommsIdentifier;
import uk.gov.dstl.baleen.types.common.DocumentReference;
import uk.gov.dstl.baleen.types.common.Frequency;
import uk.gov.dstl.baleen.types.common.Money;
import uk.gov.dstl.baleen.types.common.Nationality;
import uk.gov.dstl.baleen.types.common.Organisation;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.common.Url;
import uk.gov.dstl.baleen.types.common.Vehicle;
import uk.gov.dstl.baleen.types.geo.Coordinate;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.Sentence;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.military.MilitaryPlatform;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.types.semantic.Temporal;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.grammar.DependencyGraph;
import uk.gov.dstl.baleen.uima.grammar.ParseTree;
/**
* Resolves coreferent entities.
* <p>
* In effect the Stanford approach is a set of 10+ passes which address the different types of
* coreference. At each stage mentions are related, each related mention is added to a cluster (a
* set of mentions which are related). At the end of the process the clusters are joined
* transitively and all mentions inside a cluster are considered coreferent.
* <p>
* A mention is a NP, entity or pronoun. In Stanford the largest NP is taken, within Baleen we felt
* that entities are more important, therefore we take the largest NP which does not contain a NP.
* <p>
* TODO: Review mention extraction
* <p>
* This is a partial implementation at present, and so will not perform as well as the
* StanfordCoreNlp coreference. This is partially due to time constraints.
* <p>
* The following implementation details to date:
* <ul>
* <li>Mention detection: Done
* <li>Pass 1 Speaker Identification: TODO
* <li>Pass 2 Exact String Match: Done
* <li>Pass 3 Relaxed String Match: Done
* <li>Pass X: We added a pronoun match within the same sentence.
* <li>Pass 4 Precise Constructs: Done - appositive, predicate. relative pronoun, acronym. Not done
* - role appositive (since Baleen doens't have a role entity to mark up). Done elsewhere - demonym
* are covered in the NationalityToLocation annotator.
* <li>Pass 5-7 Strict Head Match: Done
* <li>Pass 8 Proper Head Noun Match: Done
* <li>Pass 9 Relaxed Head Match: Done
* <li>Pass 10 Pronoun Resolution: Done
* <li>Post process: Done
* <li>Output: Done
* </ul>
*
* Attributes of mentions (gender, animacy, number) are included, but for animacy we could not get
* the data (Ji and Lin, 2009) and it says for research use only anyway. As such we ignore the
* dictionary lookup.
* <p>
* We discard any algorithms which are for a specific corpus (eg OntoNotes).
* <p>
* This is very much unoptimised. Each sieve will calculate over all entities, even though many will
* already in the same cluster.
* <p>
* TODO: At the moment we don't do the clustering properly. We need just perform pairwise operations
* repeated.
*
* For more information see the various supporting papers.
* <ul>
* <li>http://nlp.stanford.edu/software/dcoref.shtml
* <li>http://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00152
* <li>http://nlp.stanford.edu/pubs/discourse-referent-lifespans.pdf
* <li>http://nlp.stanford.edu/pubs/conllst2011-coref.pdf
* <li>http://nlp.stanford.edu/pubs/coreference-emnlp10.pdf
* </ul>
*
* TODO: To really improve further, we need an analysis of what is missing higher up Baleen. For
* example we don't have roles or the animacy information so "a doctor" is just a noun phrase and
* hence could be mapped to it. If we had "person role" entity marker we would mark this an ANIMATE.
*
* @baleen.javadoc
*/
public class SieveCoreference extends BaleenAnnotator {
/**
* Connection to Stopwords Resource
*
* @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource
*/
public static final String KEY_STOPWORDS = "stopwords";
@ExternalResource(key = KEY_STOPWORDS)
protected SharedStopwordResource stopwordResource;
/**
* GenderMultiplicityResource to provide information on gender and multiplicity from a
* dictionary.
*
* @baleen.resource uk.gov.dstl.baleen.resources.GenderMultiplicityResource
*/
public static final String KEY_GENDER_MULTIPLICITY = "genderMultiplicity";
@ExternalResource(key = KEY_GENDER_MULTIPLICITY)
private SharedGenderMultiplicityResource genderMultiplicityResource;
/**
* The stoplist to use. If the stoplist matches one of the enum's provided in
* {@link uk.gov.dstl.baleen.resources.SharedStopwordResource#StopwordList}, then
* that list will be loaded.
*
* Otherwise, the string is taken to be a file path and that file is used.
* The format of the file is expected to be one stopword per line.
*
* @baleen.config DEFAULT
*/
public static final String PARAM_STOPLIST = "stoplist";
@ConfigurationParameter(name = PARAM_STOPLIST, defaultValue="DEFAULT")
protected String stoplist;
/**
* Perform only a single pass (of the provided index)
*
* Only useful for unit testing.
*
* -1 means all
*
* @baleen.config -1
*/
public static final String PARAM_SINGLE_PASS = "pass";
@ConfigurationParameter(name = PARAM_SINGLE_PASS, defaultValue = "-1")
private int singlePass;
/**
* Should pronomial resolution (John - he) be performed.
*
* This is the worst performing sieve in that is must 'guess' without any real rules what entity
* the pronoun is referring to. We currently have little data about animacy etc which will help
* (They - BBC ok, He - BBC not ok).
*
* Currently a closest entity of the same type is used, but that won't perform well in many
* cases.
*
* @baleen.config pronomial false
*/
public static final String PARAM_INCLUDE_PRONOMIAL = "pronomial";
@ConfigurationParameter(name = PARAM_INCLUDE_PRONOMIAL, defaultValue = "false")
private boolean includePronomial;
protected Collection<String> stopwords;
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
super.doInitialize(aContext);
try{
stopwords = stopwordResource.getStopwords(SharedStopwordResource.StopwordList.valueOf(stoplist));
}catch(IOException ioe){
getMonitor().error("Unable to load stopwords", ioe);
throw new ResourceInitializationException(ioe);
}
}
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
DependencyGraph dependencyGraph = DependencyGraph.build(jCas);
ParseTree parseTree = ParseTree.build(jCas);
// Detect mentions
List<Mention> mentions = new MentionDetector(jCas, dependencyGraph).detect();
// Extract head words and other aspects needed for later, determine acronyms, denonym, gender, etc
enhanceMention(mentions);
List<Cluster> clusters = sieve(jCas, parseTree, mentions);
// Post processing
postProcess(clusters);
// Output to reference targets
outputReferenceTargets(jCas, clusters);
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(ImmutableSet.of(PhraseChunk.class, WordToken.class, Entity.class, Sentence.class, CommsIdentifier.class, DocumentReference.class, Frequency.class, Money.class, Url.class,
Vehicle.class, Coordinate.class, MilitaryPlatform.class, Location.class, Temporal.class, Nationality.class, Person.class, Organisation.class),
Collections.emptySet());
}
private void enhanceMention(List<Mention> mentions) {
MentionEnhancer[] enhancers = new MentionEnhancer[] {
new AcronymEnhancer(),
new PersonEnhancer(),
new MultiplicityEnhancer(genderMultiplicityResource),
new GenderEnhancer(genderMultiplicityResource),
new AnimacyEnhancer()
};
for (Mention mention : mentions) {
for (MentionEnhancer enhancer : enhancers) {
enhancer.enhance(mention);
}
}
}
private List<Cluster> sieve(JCas jCas, ParseTree parseTree, List<Mention> mentions) {
List<Cluster> clusters = new ArrayList<>();
CoreferenceSieve[] sieves = new CoreferenceSieve[] {
new ExtractReferenceTargets(jCas, clusters, mentions), // Good
// TODO: SpeakerIdentificationSieve not implemented
new ExactStringMatchSieve(jCas, clusters, mentions), // Good
new RelaxedStringMatchSieve(jCas, clusters, mentions), // Good
new InSentencePronounSieve(jCas, clusters, mentions), // Good
new PreciseConstructsSieve(jCas, parseTree, clusters, mentions), // Good
// Pass A-C are all strict head with different params
new StrictHeadMatchSieve(jCas, clusters, mentions, true, true, stopwords), // Good
new StrictHeadMatchSieve(jCas, clusters, mentions, true, false, stopwords), // Good
new StrictHeadMatchSieve(jCas, clusters, mentions, false, true, stopwords), // Good
new ProperHeadMatchSieve(jCas, clusters, mentions), // Good
new RelaxedHeadMatchSieve(jCas, clusters, mentions, stopwords), // Good
includePronomial ? new PronounResolutionSieve(jCas, clusters, mentions) : null
// Questionable - Needs more help from
// Baleen entities yet and more data from animacy if its to work well.
};
if (singlePass >= 0 && sieves.length > singlePass) {
sieves = new CoreferenceSieve[] {
sieves[singlePass]
};
getMonitor().info("Single pass mode {}: {}", singlePass, sieves[0].getClass().getSimpleName());
}
Arrays.stream(sieves)
.filter(Objects::nonNull)
.forEach(CoreferenceSieve::sieve);
return clusters;
}
private void postProcess(List<Cluster> clusters) {
// NOTE: The paper says the two rules are *only* used in OntoNotes:
// 1. Remove singleton clusters
// 2. Short mentions of appositive patterns
// We implement 1, as it makes sense genreally and leave 2 as an OntoNotes specific
// optimisation.
Iterator<Cluster> iterator = clusters.iterator();
while (iterator.hasNext()) {
Cluster cluster = iterator.next();
if (cluster.getSize() <= 1) {
iterator.remove();
}
}
}
private void outputReferenceTargets(JCas jCas, List<Cluster> clusters) {
// Merge the clusters together
List<Cluster> merged = mergeClusters(clusters);
// Remove all the previous reference targets as we've included them in our process
ArrayList<ReferenceTarget> toRemove = new ArrayList<>(JCasUtil.select(jCas, ReferenceTarget.class));
removeFromJCasIndex(toRemove);
// Save clusters a referent targets
merged.forEach(c -> {
ReferenceTarget target = new ReferenceTarget(jCas);
for (Mention m : c.getMentions()) {
// We overwrite the referent target here, given that we used the initial target to
// bootstrap our work
// TODO: Could add an option not to override here.
Base annotation = m.getAnnotation();
annotation.setReferent(target);
}
addToJCasIndex(target);
});
}
private List<Cluster> mergeClusters(List<Cluster> clusters) {
List<Cluster> merged = new ArrayList<>(clusters.size());
for (Cluster cluster : clusters) {
boolean overlap = false;
for (Cluster mergedCluster : merged) {
if (mergedCluster.intersects(cluster)) {
mergedCluster.add(cluster);
overlap = true;
break;
}
}
if (!overlap) {
merged.add(cluster);
}
}
return merged;
}
}