//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2011 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
//
package edu.stanford.nlp.dcoref;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
/**
* Generic mention extractor from a corpus.
*
* @author Jenny Finkel
* @author Mihai Surdeanu
* @author Karthik Raghunathan
* @author Heeyoung Lee
* @author Sudarshan Rangarajan
*/
public class MentionExtractor {
private final HeadFinder headFinder;
protected String currentDocumentID;
protected final Dictionaries dictionaries;
protected final Semantics semantics;
public CorefMentionFinder mentionFinder;
protected StanfordCoreNLP stanfordProcessor;
protected LogisticClassifier<String, String> singletonPredictor;
/** The maximum mention ID: for preventing duplicated mention ID assignment */
protected int maxID = -1;
public static final boolean VERBOSE = false;
public MentionExtractor(Dictionaries dict, Semantics semantics) {
this.headFinder = new SemanticHeadFinder();
this.dictionaries = dict;
this.semantics = semantics;
this.mentionFinder = new RuleBasedCorefMentionFinder(); // Default
}
public void setMentionFinder(CorefMentionFinder mentionFinder)
{
this.mentionFinder = mentionFinder;
}
/**
* Extracts the info relevant for coref from the next document in the corpus
* @return List of mentions found in each sentence ordered according to the tree traversal.
* @throws Exception
*/
public Document nextDoc() throws Exception { return null; }
/**
* Reset so that we start at the beginning of the document collection
*/
public void resetDocs() {
maxID = -1;
currentDocumentID = null;
}
public Document arrange(
Annotation anno,
List<List<CoreLabel>> words,
List<Tree> trees,
List<List<Mention>> unorderedMentions) throws Exception {
return arrange(anno, words, trees, unorderedMentions, null, false);
}
protected int getHeadIndex(Tree t) {
// The trees passed in do not have the CoordinationTransformer
// applied, but that just means the SemanticHeadFinder results are
// slightly worse.
Tree ht = t.headTerminal(headFinder);
if(ht==null) return -1; // temporary: a key which is matched to nothing
CoreLabel l = (CoreLabel) ht.label();
return l.get(CoreAnnotations.IndexAnnotation.class);
}
private String treeToKey(Tree t) {
int idx = getHeadIndex(t);
String key = Integer.toString(idx) + ':' + t.toString();
return key;
}
public Document arrange(
Annotation anno,
List<List<CoreLabel>> words,
List<Tree> trees,
List<List<Mention>> unorderedMentions,
List<List<Mention>> unorderedGoldMentions,
boolean doMergeLabels) throws Exception {
List<List<Mention>> predictedOrderedMentionsBySentence = arrange(anno, words, trees, unorderedMentions, doMergeLabels);
List<List<Mention>> goldOrderedMentionsBySentence = null;
// SieveCoreferenceSystem.debugPrintMentions(System.err, "UNORDERED GOLD MENTIONS:", unorderedGoldMentions);
if(unorderedGoldMentions != null) {
goldOrderedMentionsBySentence = arrange(anno, words, trees, unorderedGoldMentions, doMergeLabels);
}
// SieveCoreferenceSystem.debugPrintMentions(System.err, "ORDERED GOLD MENTIONS:", goldOrderedMentionsBySentence);
return new Document(anno, predictedOrderedMentionsBySentence, goldOrderedMentionsBySentence, dictionaries);
}
/**
* Post-processes the extracted mentions. Here we set the Mention fields required for coref and order mentions by tree-traversal order.
* @param words List of words in each sentence, in textual order
* @param trees List of trees, one per sentence
* @param unorderedMentions List of unordered, unprocessed mentions
* Each mention MUST have startIndex and endIndex set!
* Optionally, if scoring is desired, mentions must have mentionID and originalRef set.
* All the other Mention fields are set here.
* @return List of mentions ordered according to the tree traversal
* @throws Exception
*/
public List<List<Mention>> arrange(
Annotation anno,
List<List<CoreLabel>> words,
List<Tree> trees,
List<List<Mention>> unorderedMentions,
boolean doMergeLabels) throws Exception {
List<List<Mention>> orderedMentionsBySentence = new ArrayList<>();
//
// traverse all sentences and process each individual one
//
for (int sent = 0, sz = words.size(); sent < sz; sent ++) {
List<CoreLabel> sentence = words.get(sent);
Tree tree = trees.get(sent);
List<Mention> mentions = unorderedMentions.get(sent);
Map<String, List<Mention>> mentionsToTrees = Generics.newHashMap();
// merge the parse tree of the entire sentence with the sentence words
if(doMergeLabels) mergeLabels(tree, sentence);
//
// set the surface information and the syntactic info in each mention
// startIndex and endIndex MUST be set before!
//
for (Mention mention: mentions) {
mention.contextParseTree = tree;
mention.sentenceWords = sentence;
mention.originalSpan = new ArrayList<>(mention.sentenceWords.subList(mention.startIndex, mention.endIndex));
if(!((CoreLabel) tree.label()).containsKey(CoreAnnotations.BeginIndexAnnotation.class)) tree.indexSpans(0);
if(mention.headWord==null) {
Tree headTree = ((RuleBasedCorefMentionFinder) mentionFinder).findSyntacticHead(mention, tree, sentence);
mention.headWord = (CoreLabel)headTree.label();
mention.headIndex = mention.headWord.get(CoreAnnotations.IndexAnnotation.class) - 1;
}
if(mention.mentionSubTree==null) {
// mentionSubTree = highest NP that has the same head
Tree headTree = tree.getLeaves().get(mention.headIndex);
if (headTree == null) { throw new RuntimeException("Missing head tree for a mention!"); }
Tree t = headTree;
while ((t = t.parent(tree)) != null) {
if (t.headTerminal(headFinder) == headTree && t.value().equals("NP")) {
mention.mentionSubTree = t;
} else if(mention.mentionSubTree != null){
break;
}
}
if (mention.mentionSubTree == null) {
mention.mentionSubTree = headTree;
}
}
List<Mention> mentionsForTree = mentionsToTrees.get(treeToKey(mention.mentionSubTree));
if(mentionsForTree == null){
mentionsForTree = new ArrayList<>();
mentionsToTrees.put(treeToKey(mention.mentionSubTree), mentionsForTree);
}
mentionsForTree.add(mention);
// generates all fields required for coref, such as gender, number, etc.
mention.process(dictionaries, semantics, this, singletonPredictor);
}
//
// Order all mentions in tree-traversal order
//
List<Mention> orderedMentions = new ArrayList<>();
orderedMentionsBySentence.add(orderedMentions);
// extract all mentions in tree traversal order (alternative: tree.postOrderNodeList())
for (Tree t : tree.preOrderNodeList()) {
List<Mention> lm = mentionsToTrees.get(treeToKey(t));
if(lm != null){
for(Mention m: lm){
orderedMentions.add(m);
}
}
}
//
// find appositions, predicate nominatives, relative pronouns in this sentence
//
findSyntacticRelations(tree, orderedMentions);
assert(mentions.size() == orderedMentions.size());
}
return orderedMentionsBySentence;
}
/**
* Sets the label of the leaf nodes of a Tree to be the CoreLabels in the given sentence.
* The original value() of the Tree nodes is preserved, and otherwise the label of tree
* leaves becomes the label from the List.
*/
// todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
public static void mergeLabels(Tree tree, List<CoreLabel> sentence) {
int idx = 0;
for (Tree t : tree.getLeaves()) {
CoreLabel cl = sentence.get(idx ++);
String value = t.value();
cl.set(CoreAnnotations.ValueAnnotation.class, value);
t.setLabel(cl);
}
tree.indexLeaves();
}
private static boolean inside(int i, Mention m) {
return i >= m.startIndex && i < m.endIndex;
}
/** Find syntactic relations (e.g., appositives) in a sentence */
private void findSyntacticRelations(Tree tree, List<Mention> orderedMentions) {
markListMemberRelation(orderedMentions);
Set<Pair<Integer, Integer>> appos = Generics.newHashSet();
// TODO: This apposition finding doesn't seem to be very good - what about using "appos" from dependencies?
findAppositions(tree, appos);
markMentionRelation(orderedMentions, appos, "APPOSITION");
Set<Pair<Integer, Integer>> preNomi = Generics.newHashSet();
findPredicateNominatives(tree, preNomi);
markMentionRelation(orderedMentions, preNomi, "PREDICATE_NOMINATIVE");
Set<Pair<Integer, Integer>> relativePronounPairs = Generics.newHashSet();
findRelativePronouns(tree, relativePronounPairs);
markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN");
}
/** Find syntactic pattern in a sentence by tregex */
private void findTreePattern(Tree tree, String tregex, Set<Pair<Integer, Integer>> foundPairs) {
try {
TregexPattern tgrepPattern = TregexPattern.compile(tregex);
findTreePattern(tree, tgrepPattern, foundPairs);
} catch (Exception e) {
// shouldn't happen....
throw new RuntimeException(e);
}
}
private void findTreePattern(Tree tree, TregexPattern tgrepPattern, Set<Pair<Integer, Integer>> foundPairs) {
try {
TregexMatcher m = tgrepPattern.matcher(tree);
while (m.find()) {
Tree t = m.getMatch();
Tree np1 = m.getNode("m1");
Tree np2 = m.getNode("m2");
Tree np3 = null;
if(tgrepPattern.pattern().contains("m3")) np3 = m.getNode("m3");
addFoundPair(np1, np2, t, foundPairs);
if(np3!=null) addFoundPair(np2, np3, t, foundPairs);
}
} catch (Exception e) {
// shouldn't happen....
throw new RuntimeException(e);
}
}
private void addFoundPair(Tree np1, Tree np2, Tree t,
Set<Pair<Integer, Integer>> foundPairs) {
Tree head1 = np1.headTerminal(headFinder);
Tree head2 = np2.headTerminal(headFinder);
int h1 = ((CoreMap) head1.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
int h2 = ((CoreMap) head2.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
Pair<Integer, Integer> p = new Pair<>(h1, h2);
foundPairs.add(p);
}
private static final TregexPattern appositionPattern = TregexPattern.compile("NP=m1 < (NP=m2 $.. (/,/ $.. NP=m3))");
private static final TregexPattern appositionPattern2 = TregexPattern.compile("NP=m1 < (NP=m2 $.. (/,/ $.. (SBAR < (WHNP < WP|WDT=m3))))");
private static final TregexPattern appositionPattern3 = TregexPattern.compile("/^NP(?:-TMP|-ADV)?$/=m1 < (NP=m2 $- /^,$/ $-- NP=m3 !$ CC|CONJP)");
private static final TregexPattern appositionPattern4 = TregexPattern.compile("/^NP(?:-TMP|-ADV)?$/=m1 < (PRN=m2 < (NP < /^NNS?|CD$/ $-- /^-LRB-$/ $+ /^-RRB-$/))");
private void findAppositions(Tree tree, Set<Pair<Integer, Integer>> appos) {
findTreePattern(tree, appositionPattern, appos);
findTreePattern(tree, appositionPattern2, appos);
findTreePattern(tree, appositionPattern3, appos);
findTreePattern(tree, appositionPattern4, appos);
}
private static final TregexPattern predicateNominativePattern = TregexPattern.compile("S < (NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2)))");
private static final TregexPattern predicateNominativePattern2 = TregexPattern.compile("S < (NP=m1 $.. (VP < (VP < ((/VB/ < /^(be|been|being)$/) $.. NP=m2))))");
private void findPredicateNominatives(Tree tree, Set<Pair<Integer, Integer>> preNomi) {
// String predicateNominativePattern2 = "NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2))";
findTreePattern(tree, predicateNominativePattern, preNomi);
findTreePattern(tree, predicateNominativePattern2, preNomi);
}
private static final TregexPattern relativePronounPattern = TregexPattern.compile("NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))");
private void findRelativePronouns(Tree tree, Set<Pair<Integer, Integer>> relativePronounPairs) {
findTreePattern(tree, relativePronounPattern, relativePronounPairs);
}
private static void markListMemberRelation(List<Mention> orderedMentions) {
for(Mention m1 : orderedMentions){
for(Mention m2 : orderedMentions){
// Mark if m2 and m1 are in list relationship
if (m1.isListMemberOf(m2)) {
m2.addListMember(m1);
m1.addBelongsToList(m2);
} else if (m2.isListMemberOf(m1)) {
m1.addListMember(m2);
m2.addBelongsToList(m1);
}
}
}
}
private static void markMentionRelation(List<Mention> orderedMentions, Set<Pair<Integer, Integer>> foundPairs, String flag) {
for(Mention m1 : orderedMentions){
for(Mention m2 : orderedMentions){
// Ignore if m2 and m1 are in list relationship
if (m1.isListMemberOf(m2) || m2.isListMemberOf(m1) || m1.isMemberOfSameList(m2)) {
SieveCoreferenceSystem.logger.finest("Not checking '" + m1 + "' and '" + m2 + "' for " + flag + ": in list relationship");
continue;
}
for(Pair<Integer, Integer> foundPair: foundPairs){
if((foundPair.first == m1.headIndex && foundPair.second == m2.headIndex)){
switch (flag) {
case "APPOSITION":
m2.addApposition(m1);
break;
case "PREDICATE_NOMINATIVE":
m2.addPredicateNominatives(m1);
break;
case "RELATIVE_PRONOUN":
m2.addRelativePronoun(m1);
break;
default:
throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)");
}
}
}
}
}
}
/**
* Finds the tree the matches this span exactly
* @param tree Leaves must be indexed!
* @param first First element in the span (first position has offset 1)
* @param last Last element included in the span (first position has offset 1)
*/
public static Tree findExactMatch(Tree tree, int first, int last) {
List<Tree> leaves = tree.getLeaves();
int thisFirst = ((CoreMap) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class);
int thisLast = ((CoreMap) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
if(thisFirst == first && thisLast == last) {
return tree;
} else {
Tree [] kids = tree.children();
for(Tree k: kids){
Tree t = findExactMatch(k, first, last);
if(t != null) return t;
}
}
return null;
}
/** Load Stanford Processor: skip unnecessary annotator */
protected static StanfordCoreNLP loadStanfordProcessor(Properties props) {
boolean replicateCoNLL = Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false"));
Properties pipelineProps = new Properties(props);
StringBuilder annoSb = new StringBuilder("");
if (!Constants.USE_GOLD_POS && !replicateCoNLL) {
annoSb.append("pos, lemma");
} else {
annoSb.append("lemma");
}
if(Constants.USE_TRUECASE) {
annoSb.append(", truecase");
}
if (!Constants.USE_GOLD_NE && !replicateCoNLL) {
annoSb.append(", ner");
}
if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
annoSb.append(", parse");
}
String annoStr = annoSb.toString();
SieveCoreferenceSystem.logger.info("MentionExtractor ignores specified annotators, using annotators=" + annoStr);
pipelineProps.setProperty("annotators", annoStr);
return new StanfordCoreNLP(pipelineProps, false);
}
public static void initializeUtterance(List<CoreLabel> tokens) {
for(CoreLabel l : tokens){
if (l.get(CoreAnnotations.UtteranceAnnotation.class) == null) {
l.set(CoreAnnotations.UtteranceAnnotation.class, 0);
}
}
}
}