//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.SentenceEnhancer;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.grammar.DependencyGraph;
/**
 * Extract mentions from the jCas.
 *
 * <p>Mentions are built from three sources, in this order: pronoun word tokens, {@link Entity}
 * annotations and noun {@link PhraseChunk}s. The combined list is then handed to a
 * {@link SentenceEnhancer} before being returned.
 *
 * <p>An instance keeps the pronouns and entities selected by the most recent {@link #detect()} call
 * in mutable fields.
 */
public class MentionDetector {

  private final JCas jCas;

  private final DependencyGraph dependencyGraph;

  /** Word tokens classed as pronouns; populated by {@link #setup()}. */
  private List<WordToken> pronouns;

  /** All entity annotations in the JCas; populated by {@link #setup()}. */
  private Collection<Entity> entities;

  /**
   * Create a new mention detector using the specified JCas and DependencyGraph
   *
   * @param jCas the JCas from which word tokens, entities and phrase chunks are selected
   * @param dependencyGraph the graph queried for the edges of a word when determining head words
   */
  public MentionDetector(JCas jCas, DependencyGraph dependencyGraph) {
    this.jCas = jCas;
    this.dependencyGraph = dependencyGraph;
  }

  /**
   * Detect mentions in the JCas and DependencyGraph (provided in the constructor) and return them
   * as a list
   *
   * @return a new list holding the pronoun mentions, then the entity mentions, then the noun phrase
   *     mentions, after the list has been passed to {@link SentenceEnhancer#enhance}
   */
  public List<Mention> detect() {
    setup();

    final List<Mention> mentions = new ArrayList<>(pronouns.size() + entities.size());

    detectPronouns(mentions);
    detectEntities(mentions);
    detectPhrases(mentions);

    new SentenceEnhancer().enhance(jCas, mentions);

    return mentions;
  }

  /**
   * Select the pronouns (word tokens whose part of speech starts with PP, WP or PRP) and the
   * entities from the JCas.
   */
  private void setup() {
    pronouns =
        JCasUtil.select(jCas, WordToken.class).stream()
            .filter(
                w ->
                    w.getPartOfSpeech().startsWith("PP")
                        || w.getPartOfSpeech().startsWith("WP")
                        || w.getPartOfSpeech().startsWith("PRP"))
            .collect(Collectors.toList());

    entities = JCasUtil.select(jCas, Entity.class);
  }

  /**
   * Add one mention per pronoun; the mention's words are the pronoun token alone.
   *
   * @param mentions the list the new mentions are appended to
   */
  private void detectPronouns(List<Mention> mentions) {
    pronouns.stream()
        .map(Mention::new)
        .map(
            m -> {
              final List<WordToken> list =
                  Collections.singletonList((WordToken) m.getAnnotation());
              m.setWords(list);
              return m;
            })
        .forEach(mentions::add);
  }

  /**
   * Add one mention per entity; the mention's words are the word tokens covered by the entity and
   * its head word is derived from those words.
   *
   * @param mentions the collection the new mentions are appended to
   */
  private void detectEntities(Collection<Mention> mentions) {
    entities.stream()
        .map(Mention::new)
        .map(
            m -> {
              final Collection<WordToken> list =
                  JCasUtil.selectCovered(jCas, WordToken.class, m.getAnnotation());
              m.setWords(new ArrayList<WordToken>(list));
              m.setHeadWordToken(determineHead(m.getWords()));
              return m;
            })
        .forEach(mentions::add);
  }

  /**
   * Pick the head word of a group of words.
   *
   * @param words the words of a single mention or phrase
   * @return the only word if there is exactly one; otherwise the first head candidate, or
   *     {@code null} if there are no candidates (which includes an empty {@code words})
   */
  private WordToken determineHead(List<WordToken> words) {
    // A dependency grammar approach to head word extraction
    // - find the Noun in the noun phrase which is the link out of the words
    // - this seems to be the head word
    // TODO: Investigate other approaches Collins 1999, etc. Do they give the same/better results?

    if (words.size() == 1) {
      return words.get(0);
    }

    final List<WordToken> candidates = identifyHeadCandidates(words);
    if (candidates.isEmpty()) {
      return null;
    }

    // TODO: No idea if it is possible to get more than one if all things work.
    // I think this would be a case of marking an entity which crosses the NP boundary and is
    // likely wrong.
    // TODO: Not sure if we should pull out compound words here... (it's a head word but even so)
    return candidates.get(0);
  }

  /**
   * Find the words which could be the head of the group: those whose part of speech starts with
   * "N" and which have at least one dependency edge to a word outside the group.
   *
   * @param words the words to examine
   * @return the candidates, in the order they appear in {@code words}; possibly empty
   */
  private List<WordToken> identifyHeadCandidates(List<WordToken> words) {
    final List<WordToken> candidates = new LinkedList<>();

    for (final WordToken word : words) {
      if (word.getPartOfSpeech().startsWith("N")) {
        final Stream<WordToken> edges = dependencyGraph.getEdges(word);
        if (edges.anyMatch(p -> !words.contains(p))) {
          candidates.add(word);
        }
      }
    }

    return candidates;
  }

  /**
   * Add one mention per retained noun phrase chunk.
   *
   * <p>Noun phrases are discarded when they cover an entity, when a larger phrase shares the same
   * head word, or when they are rejected by the single content check.
   *
   * @param mentions the list the new mentions are appended to
   */
  private void detectPhrases(List<Mention> mentions) {
    final List<PhraseChunk> phrases = selectNounPhrasesWithoutEntities();

    final Map<PhraseChunk, Collection<WordToken>> phraseToWord =
        JCasUtil.indexCovered(jCas, PhraseChunk.class, WordToken.class);

    removeSmallerPhrasesSharingHead(phrases, indexByHead(phrases, phraseToWord));
    removeSingleContentPhrases(phrases);

    // TODO: Remove all pronouns which are covered by the phrases? I think not...

    // TODO: Paper removes It if possible (see Appendix B for regex)
    // TODO: Paper removes static list of stop words (but we should determine that ourselves)
    // TODO: Paper removes partitive or quantifier (millions of people). Unsure why though.

    phrases.stream()
        .map(Mention::new)
        .map(
            m -> {
              final List<WordToken> words = new ArrayList<>(phraseToWord.get(m.getAnnotation()));
              // TODO: The head was already calculated for the head word index, but it is
              // recalculated here. Would be nice to reuse
              m.setWords(words);
              m.setHeadWordToken(determineHead(words));
              return m;
            })
        .forEach(mentions::add);
  }

  /**
   * Select the phrase chunks whose chunk type starts with "N", excluding any which cover an entity.
   *
   * @return a new, mutable list of the remaining phrase chunks
   */
  private List<PhraseChunk> selectNounPhrasesWithoutEntities() {
    // Limit to noun phrases
    final List<PhraseChunk> phrases =
        JCasUtil.select(jCas, PhraseChunk.class).stream()
            .filter(p -> p.getChunkType().startsWith("N"))
            .collect(Collectors.toList());

    // Remove any noun phrases which cover entities
    JCasUtil.indexCovering(jCas, Entity.class, PhraseChunk.class).values().stream()
        .flatMap(Collection::stream)
        .forEach(phrases::remove);

    return phrases;
  }

  /**
   * Create an index from head word to the phrases which have that head word.
   *
   * @param phrases the phrases to index
   * @param phraseToWord the word tokens covered by each phrase
   * @return the index; phrases for which no head word could be determined are left out
   */
  private Multimap<WordToken, PhraseChunk> indexByHead(
      List<PhraseChunk> phrases, Map<PhraseChunk, Collection<WordToken>> phraseToWord) {
    final Multimap<WordToken, PhraseChunk> headToChunk = HashMultimap.create();

    for (final PhraseChunk phrase : phrases) {
      final WordToken head = determineHead(new ArrayList<>(phraseToWord.get(phrase)));
      if (head != null) {
        headToChunk.put(head, phrase);
      }
      // TODO: What should we do to those without heads?
    }

    return headToChunk;
  }

  /**
   * Paper: keep the largest noun phrase which has the same head word.
   *
   * <p>For every head word shared by more than one phrase, the phrase with the largest character
   * span (end minus begin) is kept and the others are removed from {@code phrases}.
   *
   * @param phrases the phrase list to prune, modified in place
   * @param headToChunk the phrases grouped by head word
   */
  private void removeSmallerPhrasesSharingHead(
      List<PhraseChunk> phrases, Multimap<WordToken, PhraseChunk> headToChunk) {
    for (final Collection<PhraseChunk> sharingHead : headToChunk.asMap().values()) {
      // A head word with a single phrase has nothing smaller to discard
      if (sharingHead.size() < 2) {
        continue;
      }

      PhraseChunk largest = null;
      int largestSize = 0;

      for (final PhraseChunk p : sharingHead) {
        // the head is always common word, so we know they overlap
        final int size = p.getEnd() - p.getBegin();
        if (largest == null || largestSize < size) {
          largest = p;
          largestSize = size;
        }
      }

      // Remove all the small ones, and only those: the largest phrase and the phrases of other
      // head words must stay in the list
      for (final PhraseChunk p : sharingHead) {
        if (!p.equals(largest)) {
          phrases.remove(p);
        }
      }
    }
  }

  /**
   * Remove all phrases based on their single content.
   *
   * <p>A phrase is removed when the index holds exactly one word token for it and that token is
   * rejected by {@link #filterBySingleContent(WordToken)}.
   *
   * @param phrases the phrase list to prune, modified in place
   */
  private void removeSingleContentPhrases(List<PhraseChunk> phrases) {
    // NOTE(review): this uses indexCovering (word tokens covering the chunk) whereas the words of
    // a phrase are otherwise taken from indexCovered - confirm this is intended
    JCasUtil.indexCovering(jCas, PhraseChunk.class, WordToken.class).entrySet().stream()
        .filter(e -> e.getValue().size() == 1)
        .filter(e -> filterBySingleContent(e.getValue().iterator().next()))
        .map(Entry::getKey)
        .forEach(phrases::remove);
  }

  /**
   * Decide whether a phrase made up of the single given word should be removed.
   *
   * @param t the only word of the phrase
   * @return {@code true} if the word is one of the detected pronouns or its part of speech is "CD"
   *     (ignoring case)
   */
  private boolean filterBySingleContent(WordToken t) {
    if (pronouns.contains(t)) {
      // Remove noun phrases which are just a pronoun
      return true;
    }

    // Paper: Remove cardinal / numerics
    return "CD".equalsIgnoreCase(t.getPartOfSpeech());
  }
}