//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.enhancers.SentenceEnhancer;
import uk.gov.dstl.baleen.types.language.PhraseChunk;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.uima.grammar.DependencyGraph;

/**
 * Extract mentions from the jCas.
 */
public class MentionDetector {

  private final JCas jCas;
  private final DependencyGraph dependencyGraph;
  private List<WordToken> pronouns;
  private Collection<Entity> entities;

  /**
   * Create a new mention detector using the specified JCas and DependencyGraph.
   */
  public MentionDetector(JCas jCas, DependencyGraph dependencyGraph) {
    this.jCas = jCas;
    this.dependencyGraph = dependencyGraph;
  }

  /**
   * Detect mentions in the JCas and DependencyGraph (provided in the constructor) and return them
   * as a list.
   */
  public List<Mention> detect() {
    setup();

    final List<Mention> mentions = new ArrayList<>(pronouns.size() + entities.size());
    detectPronouns(mentions);
    detectEntities(mentions);
    detectPhrases(mentions);

    new SentenceEnhancer().enhance(jCas, mentions);

    return mentions;
  }

  private void setup() {
    pronouns = JCasUtil.select(jCas, WordToken.class).stream()
        .filter(w -> w.getPartOfSpeech().startsWith("PP")
            || w.getPartOfSpeech().startsWith("WP")
            || w.getPartOfSpeech().startsWith("PRP"))
        .collect(Collectors.toList());

    entities = JCasUtil.select(jCas, Entity.class);
  }

  private void detectPronouns(List<Mention> mentions) {
    pronouns.stream()
        .map(Mention::new)
        .map(m -> {
          final List<WordToken> list = Collections.singletonList((WordToken) m.getAnnotation());
          m.setWords(list);
          return m;
        }).forEach(mentions::add);
  }

  private void detectEntities(Collection<Mention> mentions) {
    entities.stream()
        .map(Mention::new)
        .map(m -> {
          final Collection<WordToken> list =
              JCasUtil.selectCovered(jCas, WordToken.class, m.getAnnotation());
          m.setWords(new ArrayList<>(list));
          m.setHeadWordToken(determineHead(m.getWords()));
          return m;
        }).forEach(mentions::add);
  }

  private WordToken determineHead(List<WordToken> words) {
    // A dependency grammar approach to head word extraction:
    // find the noun in the noun phrase which has a dependency link out of the words -
    // this seems to be the head word.
    // TODO: Investigate other approaches (Collins 1999, etc). Do they give the same/better
    // results?

    if (words.size() == 1) {
      return words.get(0);
    }

    final List<WordToken> candidates = identifyHeadCandidates(words);
    if (candidates.isEmpty()) {
      return null;
    }

    // TODO: No idea if it's possible to get more than one candidate if all things work.
    // I think that would be a case of marking an entity which crosses the NP boundary, and is
    // likely wrong.
    // TODO: Not sure if we should pull out compound words here (it's a head word, but even so).
    return candidates.get(0);
  }

  private List<WordToken> identifyHeadCandidates(List<WordToken> words) {
    final List<WordToken> candidates = new LinkedList<>();
    for (final WordToken word : words) {
      if (word.getPartOfSpeech().startsWith("N")) {
        final Stream<WordToken> edges = dependencyGraph.getEdges(word);
        if (edges.anyMatch(p -> !words.contains(p))) {
          candidates.add(word);
        }
      }
    }
    return candidates;
  }

  private void detectPhrases(List<Mention> mentions) {
    // Limit to noun phrases
    final List<PhraseChunk> phrases = JCasUtil.select(jCas, PhraseChunk.class).stream()
        .filter(p -> p.getChunkType().startsWith("N"))
        .collect(Collectors.toList());

    // Remove any noun phrases which cover entities
    JCasUtil.indexCovering(jCas, Entity.class, PhraseChunk.class).values().stream()
        .flatMap(Collection::stream)
        .forEach(phrases::remove);

    final Map<PhraseChunk, Collection<WordToken>> phraseToWord =
        JCasUtil.indexCovered(jCas, PhraseChunk.class, WordToken.class);

    // Create an index of head word to phrase chunk
    final Multimap<WordToken, PhraseChunk> headToChunk = HashMultimap.create();
    phrases.forEach(p -> {
      final Collection<WordToken> collection = phraseToWord.get(p);
      final WordToken head = determineHead(new ArrayList<>(collection));
      if (head != null) {
        headToChunk.put(head, p);
      }
      // TODO: What should we do with those without heads?
    });

    // Paper: keep only the largest noun phrase which has the same head word
    headToChunk.asMap().entrySet().stream()
        .filter(e -> e.getValue().size() > 1)
        .forEach(e -> {
          PhraseChunk largest = null;
          int largestSize = 0;
          for (final PhraseChunk p : e.getValue()) {
            // The head is a word common to these phrases, so we know they overlap
            final int size = p.getEnd() - p.getBegin();
            if (largest == null || largestSize < size) {
              largest = p;
              largestSize = size;
            }
          }

          // Remove all the smaller ones
          for (final PhraseChunk p : e.getValue()) {
            if (!p.equals(largest)) {
              phrases.remove(p);
            }
          }
        });

    // Remove phrases based on their single content word
    JCasUtil.indexCovering(jCas, PhraseChunk.class, WordToken.class).entrySet().stream()
        .filter(e -> e.getValue().size() == 1)
        .filter(e -> filterBySingleContent(e.getValue().iterator().next()))
        .map(Entry::getKey)
        .forEach(phrases::remove);

    // TODO: Remove all pronouns which are covered by the phrases? I think not...
    // TODO: Paper removes "it" if possible (see Appendix B for regex)
    // TODO: Paper removes a static list of stop words (but we should determine that ourselves)
    // TODO: Paper removes partitives and quantifiers ("millions of people"). Unsure why though.

    phrases.stream()
        .map(Mention::new)
        .map(m -> {
          // TODO: We already calculated this earlier (for headToChunk), but we just redo it
          // here. It would be nice to reuse it.
          final List<WordToken> words = new ArrayList<>(phraseToWord.get(m.getAnnotation()));
          m.setWords(words);
          m.setHeadWordToken(determineHead(words));
          return m;
        }).forEach(mentions::add);
  }

  private boolean filterBySingleContent(WordToken t) {
    // Remove NPs which consist of a single pronoun - these are already added by detectPronouns
    if (pronouns.contains(t)) {
      return true;
    }

    // Paper: remove cardinals / numerics
    return "CD".equalsIgnoreCase(t.getPartOfSpeech());
  }
}
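// A minimal usage sketch: how a caller might drive this detector, assuming the JCas has
// already been annotated with WordToken, PhraseChunk and Entity annotations, and assuming
// DependencyGraph.build(JCas) is available to construct the graph from existing dependency
// annotations:
//
//   DependencyGraph graph = DependencyGraph.build(jCas);
//   List<Mention> mentions = new MentionDetector(jCas, graph).detect();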