//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.sieves;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Animacy;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Cluster;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Person;
import uk.gov.dstl.baleen.resources.data.Gender;
/**
* Attempts to connect pronouns to an entity.
* <p>
* This is very difficult problem which may perform suboptimally in the current implementation.
*/
public class PronounResolutionSieve extends AbstractCoreferenceSieve {
private static final int MAX_SENTENCE_DISTANCE = 3;
/**
* Constructor for PronounResolutionSieve
*/
public PronounResolutionSieve(JCas jCas, List<Cluster> clusters, List<Mention> mentions) {
super(jCas, clusters, mentions);
}
@Override
public void sieve() {
//Find potential entity->pronoun matches
final Multimap<Mention, Mention> potential = HashMultimap.create();
// Check that all mentions have a valid sentence index
List<Mention> filteredMentions = getMentions().stream().filter(m -> m.getSentenceIndex() >= 0).collect(Collectors.toList());
for (int i = 0; i < filteredMentions.size(); i++) {
final Mention a = filteredMentions.get(i);
for (int j = i + 1; j < filteredMentions.size(); j++) {
final Mention b = filteredMentions.get(j);
if(!validateMentions(a, b)){
continue;
}
resolvePronoun(a, b, potential);
}
}
// For each of the matches we need to select the best one
potential.asMap().entrySet().stream().forEach(e -> addBestAsMatch(e.getKey(), e.getValue()));
}
/**
* If the pronoun meets the conditions in this function, it is *likely* to be a person
*/
private boolean isPronounPerson(Mention pronoun){
return pronoun.getPerson() != Person.UNKNOWN //Acceptable values: FIRST, SECOND and THIRD
&& (pronoun.getGender() == Gender.M || pronoun.getGender() == Gender.F)
|| pronoun.getAnimacy() == Animacy.ANIMATE;
}
private boolean validateMentions(Mention a, Mention b){
//We are coreferencing pronouns only, so a OR b must be a pronoun, but not both
if (!((a.getType() == MentionType.PRONOUN && b.getType() != MentionType.PRONOUN) || (a.getType() != MentionType.PRONOUN && b.getType() == MentionType.PRONOUN))){
return false;
}
// Not in paper: No overlap it makes little sense
if (a.overlaps(b)) {
return false;
}
// Are the attributes compatible (gender=gender, etc)
if (!a.isAttributeCompatible(b)) {
return false;
}
return true;
}
private void resolvePronoun(Mention a, Mention b, Multimap<Mention, Mention> potential){
final Mention pronoun = a.getType() == MentionType.PRONOUN ? a : b;
final Mention other = a.getType() == MentionType.PRONOUN ? b : a;
// Not in paper: If the pronoun is before the other that's odd, (He said Hello. John
// did.)
if (pronoun.getAnnotation().getEnd() < other.getAnnotation().getBegin()) {
return;
}
// Not in paper: We found poor results for "it" really because it never refers to something that Baleen annotates
// It would be good for say money but currently we'll just drop it
if (pronoun.getText().toLowerCase().startsWith("it")) {
return;
}
//Originally there was code here to handle a and b both being pronouns (this is described in the paper)
//However, we are excluding links between two pronouns in validateMentions, so this code was removed.
// Paper: Only consider within three
// Not in paper: And the pronoun must be after the mention
final int sentenceDistance = pronoun.getSentenceIndex() - other.getSentenceIndex();
if (sentenceDistance < 0 || sentenceDistance > MAX_SENTENCE_DISTANCE) {
return;
}
// Not in paper: If the same sentence the pronoun should be after
if (sentenceDistance == 0 && pronoun.getAnnotation().getEnd() <= other.getAnnotation().getBegin()) {
return;
}
if (isPronounPerson(pronoun) && !(other.getAnnotation() instanceof uk.gov.dstl.baleen.types.common.Person || other.getAnnotation() instanceof uk.gov.dstl.baleen.types.common.Nationality)) {
return;
}
// Similarly to avoid if we have a neutral we can't link to a person
if (pronoun.getGender() == Gender.N && other.getAnnotation() instanceof uk.gov.dstl.baleen.types.common.Person) {
return;
}
// TODO: There might be many more of these simple constraints on our semantic types...
potential.put(pronoun, other);
}
private void addBestAsMatch(Mention key, Collection<Mention> potentialMatches) {
final Collection<Mention> matched;
if (potentialMatches.size() > 1) {
List<Mention> list = new ArrayList<Mention>(potentialMatches);
Collections.sort(list, (a, b) -> {
if (a.overlaps(b)) {
return 0;
}
// Use in-sentence word distance
if (a.getAnnotation().getEnd() <= b.getAnnotation().getBegin()) {
return b.getAnnotation().getBegin() - a.getAnnotation().getEnd();
} else {
return b.getAnnotation().getEnd() - a.getAnnotation().getBegin();
}
});
matched = list;
} else {
// Either empty or just one...
matched = potentialMatches;
}
// Get the first (nearest) which doesn't overlap
Optional<Mention> match = matched.stream()
.filter(m -> !key.overlaps(m))
.findFirst();
if (match.isPresent()) {
addToCluster(key, match.get());
}
}
}