package edu.stanford.nlp.coref.md;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Abstract base class for finding coref mentions in a document.
*
* @author Angel Chang
*/
public abstract class CorefMentionFinder {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(CorefMentionFinder.class);
/** Language being processed; findHead compares it against Locale.CHINESE to choose the head-finding strategy. */
protected Locale lang;
/** Head finder handed to Tree.headTerminal when safeHead looks for the head of a subtree. */
protected HeadFinder headFinder;
/** Parser annotator used when reparsing a mention extent; getParser() fills it in lazily when it is null. */
protected Annotator parserProcessor;
/** Whether findSyntacticHead may reparse the mention extent when no constituent matches the mention span exactly. */
protected boolean allowReparsing;
/** Tregex pattern matching any node whose label starts with NP, PN or PRP. */
protected static final TregexPattern npOrPrpMentionPattern = TregexPattern.compile("/^(?:NP|PN|PRP)/");
/** When true, removeSpuriousMentionsZh logs a "MENTION FILTERING" line for every mention it removes. */
private static final boolean VERBOSE = false;
/** Get all the predicted mentions for a document.
*
* @param doc The syntactically annotated document
* @param dict Dictionaries for coref.
* @param props Configuration properties; how they are interpreted is up to the implementing subclass
* @return For each of the List of sentences in the document, a List of Mention objects
*/
public abstract List<List<Mention>> findMentions(Annotation doc, Dictionaries dict, Properties props);
/**
 * Creates mentions from tokens that were pre-marked with a
 * {@code MentionTokenAnnotation} start/end tag.
 *
 * <p>Tokens are scanned left to right. A start tag records the (0-based) begin
 * position; an end tag closes the currently open mention and adds it to
 * {@code mentions} and its span to {@code mentionSpanSet}. An end tag with no
 * open start is only logged.
 *
 * @param s the sentence to scan
 * @param mentions receives the newly created mentions
 * @param mentionSpanSet receives the [begin, end) span of every created mention
 * @param namedEntitySpanSet not consulted by this method
 */
protected static void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  final List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  final SemanticGraph basicGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedGraph = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedGraph == null) {
    // No enhanced graph available: fall back to the basic dependencies.
    enhancedGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }

  int openStart = -1;  // 0-based start of the mention currently open, or -1 if none
  for (CoreLabel token : tokens) {
    final MultiTokenTag tag = token.get(CoreAnnotations.MentionTokenAnnotation.class);
    if (tag == null) {
      continue;  // token is not part of a marked mention
    }
    if (tag.isStart()) {
      openStart = token.get(CoreAnnotations.IndexAnnotation.class) - 1;
    }
    if (!tag.isEnd()) {
      continue;
    }

    // IndexAnnotation is 1-based, so it doubles as the exclusive 0-based end.
    final int closeEnd = token.get(CoreAnnotations.IndexAnnotation.class);
    if (openStart < 0) {
      Redwood.log("Start of marked mention not found in sentence: "
          + tag + " at tokenIndex=" + (token.get(CoreAnnotations.IndexAnnotation.class)-1)+ " for "
          + s.get(CoreAnnotations.TextAnnotation.class));
      continue;
    }

    final IntPair span = new IntPair(openStart, closeEnd);
    final Mention marked = new Mention(-1, openStart, closeEnd, tokens, basicGraph, enhancedGraph,
        new ArrayList<>(tokens.subList(openStart, closeEnd)));
    mentions.add(marked);
    mentionSpanSet.add(span);
    openStart = -1;
  }
}
/**
 * Tregex pattern used by extractEnumerations to find enumerations such as "A, B, and C".
 * It names two nodes under an NP: m1 (label starting with NP, NNP or NML) and m2 (same
 * label classes), separated by a node matching /^CC|,/.
 */
protected static final TregexPattern enumerationsMentionPattern = TregexPattern.compile("NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))");
/**
 * Adds a mention for each conjunct of an enumeration found by
 * {@code enumerationsMentionPattern} in the sentence's parse tree.
 *
 * <p>A conjunct is skipped if its span is already in {@code mentionSpanSet}
 * or lies inside a named-entity span.
 *
 * @param s the sentence, carrying tokens, parse tree and dependency graphs
 * @param mentions receives the newly created mentions
 * @param mentionSpanSet spans already used; extended with each new span
 * @param namedEntitySpanSet named-entity spans that block contained mentions
 */
protected static void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  final List<CoreLabel> tokens = s.get(CoreAnnotations.TokensAnnotation.class);
  final Tree parse = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  final SemanticGraph basicGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhancedGraph = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhancedGraph == null) {
    enhancedGraph = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  }

  // Collect the span of every matched conjunct; a later match on the same span overwrites the earlier subtree.
  final Map<IntPair, Tree> conjunctBySpan = Generics.newHashMap();
  final TregexMatcher matcher = enumerationsMentionPattern.matcher(parse);
  while (matcher.find()) {
    final Tree[] conjuncts = { matcher.getNode("m1"), matcher.getNode("m2") };
    for (Tree conjunct : conjuncts) {
      final List<Tree> leaves = conjunct.getLeaves();
      // IndexAnnotation is 1-based: first-1 is the inclusive start, last is the exclusive end.
      final int begin = ((CoreLabel) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
      final int end = ((CoreLabel) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
      conjunctBySpan.put(new IntPair(begin, end), conjunct);
    }
  }

  for (Map.Entry<IntPair, Tree> entry : conjunctBySpan.entrySet()) {
    final IntPair span = entry.getKey();
    if (mentionSpanSet.contains(span) || insideNE(span, namedEntitySpanSet)) {
      continue;
    }
    final Mention conjunctMention = new Mention(-1, span.get(0), span.get(1), tokens, basicGraph, enhancedGraph,
        new ArrayList<>(tokens.subList(span.get(0), span.get(1))), entry.getValue());
    mentions.add(conjunctMention);
    mentionSpanSet.add(span);
  }
}
/**
 * Checks whether a mention span is contained in (or equal to) any named-entity span.
 *
 * @param mSpan the mention span as (start, end)
 * @param namedEntitySpanSet the named-entity spans to test against
 * @return true if some named-entity span starts at or before the mention and ends at or after it
 */
protected static boolean insideNE(IntPair mSpan, Set<IntPair> namedEntitySpanSet) {
  for (IntPair neSpan : namedEntitySpanSet) {
    if (neSpan.get(0) > mSpan.get(0)) {
      continue;  // entity starts after the mention does
    }
    if (mSpan.get(1) <= neSpan.get(1)) {
      return true;
    }
  }
  return false;
}
/**
 * Tests whether the (lower-cased) text of a mention is on the hard-coded stop list.
 *
 * @param m the mention to test
 * @return true if the span text equals one of the stop words, or starts with "etc." or "'s "
 */
public static boolean inStopList(Mention m) {
  final String lowered = m.spanToString().toLowerCase(Locale.ENGLISH);
  switch (lowered) {
    case "u.s.":
    case "u.k.":
    case "u.s.s.r":
    case "there":
    case "ltd.":
      return true;
    default:
      // Prefix-based entries of the stop list.
      return lowered.startsWith("etc.") || lowered.startsWith("'s ");
  }
}
/**
 * Removes spurious mentions using the language-specific filter.
 *
 * <p>English documents go through {@code removeSpuriousMentionsEn}, Chinese documents
 * through {@code removeSpuriousMentionsZh}; for any other language nothing is removed.
 *
 * @param doc the annotated document
 * @param predictedMentions the mentions per sentence; filtered in place
 * @param dict dictionaries for coref
 * @param removeNested passed to the Chinese filter only
 * @param lang the language to filter for (this parameter shadows the {@code lang} field)
 */
protected void removeSpuriousMentions(Annotation doc, List<List<Mention>> predictedMentions, Dictionaries dict, boolean removeNested, Locale lang) {
  // Compare by value: a Locale equal to ENGLISH/CHINESE need not be the same object
  // (e.g. one built with new Locale("en")), and == would then silently skip all filtering.
  if (Locale.ENGLISH.equals(lang)) {
    removeSpuriousMentionsEn(doc, predictedMentions, dict);
  } else if (Locale.CHINESE.equals(lang)) {
    removeSpuriousMentionsZh(doc, predictedMentions, dict, removeNested);
  }
}
/**
 * English filter for spurious mentions. A mention is removed from its sentence's list when
 * its head string is a known non-word, when it is an adjectival demonym that is not used as
 * a noun, or when it is on the stop list.
 *
 * @param doc the annotated document providing the sentences
 * @param predictedMentions the mentions per sentence; filtered in place
 * @param dict dictionaries for coref
 */
protected void removeSpuriousMentionsEn(Annotation doc, List<List<Mention>> predictedMentions, Dictionaries dict) {
  final List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  for (int sentIdx = 0; sentIdx < predictedMentions.size(); sentIdx++) {
    final CoreMap sentence = sentences.get(sentIdx);
    final List<Mention> sentenceMentions = predictedMentions.get(sentIdx);
    final List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    final Set<Mention> spurious = Generics.newHashSet();

    for (Mention m : sentenceMentions) {
      final String headTag = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);

      // non word such as 'hmm'
      final boolean nonWord = dict.nonWords.contains(m.headString);

      // adjective form of nations:
      //   the [American] policy -> not mention
      //   speak in [Japanese]   -> mention
      // i.e. drop it unless the head is a noun and the following word is not a noun
      boolean adjectivalUse = false;
      if (dict.isAdjectivalDemonym(m.spanToString())) {
        adjectivalUse = !headTag.startsWith("N")
            || (m.endIndex < tokens.size() && tokens.get(m.endIndex).tag().startsWith("N"));
      }

      // stop list (e.g., U.S., there)
      final boolean stopListed = inStopList(m);

      if (nonWord || adjectivalUse || stopListed) {
        spurious.add(m);
      }
    }
    sentenceMentions.removeAll(spurious);
  }
}
/**
 * Chinese filter for spurious mentions.
 *
 * For each sentence, every mention is tested against an ordered chain of rules (numeric NER
 * head, single CD token, dictionary remove-words / remove-chars, punctuation head, demonym,
 * and a few lexical patterns); the first rule that fires marks the mention for removal.
 * If removeNested is set, a mention nested inside another mention of the same sentence with
 * the same head word is also removed, unless the token right after it is tagged "," or "CC".
 * All marked mentions are then removed from the sentence's list in place.
 *
 * @param doc the annotated document providing the sentences
 * @param predictedMentions the mentions per sentence; filtered in place
 * @param dict dictionaries for coref (removeWords, removeChars and countries are consulted)
 * @param removeNested whether to also drop nested mentions that share a head word
 */
protected void removeSpuriousMentionsZh(Annotation doc, List<List<Mention>> predictedMentions, Dictionaries dict, boolean removeNested) {
List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
// this goes through each sentence -- predictedMentions has a list for each sentence
for (int i=0, sz = predictedMentions.size(); i < sz ; i++) {
List<Mention> mentions = predictedMentions.get(i);
List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
// mentions to drop are collected first and removed after the loops, so the list is not modified while iterating
Set<Mention> remove = Generics.newHashSet();
for (Mention m : mentions) {
// NOTE(review): ner() is dereferenced without a null check -- presumably every head word carries an NER tag here; confirm
if (m.headWord.ner().matches("PERCENT|MONEY|QUANTITY|CARDINAL")) {
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING number NER: " + m.spanToString());
} else if (m.originalSpan.size()==1 && m.headWord.tag().equals("CD")) {
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING number: " + m.spanToString());
} else if (dict.removeWords.contains(m.spanToString())) {
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING removeWord: " + m.spanToString());
} else if (mentionContainsRemoveChars(m, dict.removeChars)) {
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING removeChars: " + m.spanToString());
} else if (m.headWord.tag().equals("PU")) {
// punctuation-only mentions
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING Punctuation only mention: " + m.spanToString());
} else if (mentionIsDemonym(m, dict.countries)) {
// demonyms -- this seems to be a no-op on devset. Maybe not working?
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING Removed demonym: " + m.spanToString());
} else if (m.spanToString().equals("问题") && m.startIndex > 0 &&
sent.get(m.startIndex - 1).word().endsWith("没")) {
// 没 问题 - this is maybe okay but having 问题 on removeWords was dangerous
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING Removed meiyou: " + m.spanToString());
} else if (mentionIsRangren(m, sent)) {
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING Removed rangren: " + m.spanToString());
} else if (m.spanToString().equals("你") && m.startIndex < sent.size() - 1 &&
sent.get(m.startIndex + 1).word().startsWith("知道")) {
// 你 知道
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING Removed nizhidao: " + m.spanToString());
// The words that used to be in this case are now handled more generally in removeCharsZh
// } else if (m.spanToString().contains("什么") || m.spanToString().contains("多少")) {
// remove.add(m);
// if (VERBOSE) log.info("MENTION FILTERING Removed many/few mention ending: " + m.spanToString());
} else if (m.spanToString().endsWith("的")) {
remove.add(m);
if (VERBOSE) log.info("MENTION FILTERING Removed de ending mention: " + m.spanToString());
// omit this case, it decreases performance. A few useful interrogative pronouns are now in the removeChars list
// } else if (mentionIsInterrogativePronoun(m, dict.interrogativePronouns)) {
// remove.add(m);
// if (VERBOSE) log.info("MENTION FILTERING Removed interrogative pronoun: " + m.spanToString());
}
// 的 handling
// if(m.startIndex>0 && sent.get(m.startIndex-1).word().equals("的")) {
// // remove.add(m);
// Tree t = sentences.get(i).get(TreeAnnotation.class);
// Tree mTree = m.mentionSubTree;
// if(mTree==null) continue;
// for(Tree p : t.pathNodeToNode(mTree, t)) {
// if(mTree==p) continue;
// if(p.value().equals("NP")) {
// remove.add(m);
// }
// }
// }
} // for each mention
// nested mention with shared headword (except apposition, enumeration): pick larger one
if (removeNested) {
for (Mention m1 : mentions){
for (Mention m2 : mentions){
// skip self-pairs and pairs where either mention is already marked for removal
if (m1==m2 || remove.contains(m1) || remove.contains(m2)) continue;
if (m1.sentNum==m2.sentNum && m1.headWord==m2.headWord && m2.insideIn(m1)) {
// keep the inner mention when the next token is tagged "," or "CC"
if (m2.endIndex < sent.size() && (sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",")
|| sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) {
continue;
}
remove.add(m2);
}
}
}
}
mentions.removeAll(remove);
} // for each sentence
}
/**
 * Tests whether the mention text contains any of the given strings as a substring.
 *
 * @param m the mention whose span text is examined
 * @param removeChars the substrings that disqualify a mention
 * @return true if at least one entry of {@code removeChars} occurs in the span text
 */
private static boolean mentionContainsRemoveChars(Mention m, Set<String> removeChars) {
  final String text = m.spanToString();
  boolean containsForbidden = false;
  for (String forbidden : removeChars) {
    if (text.contains(forbidden)) {
      containsForbidden = true;
      break;
    }
  }
  return containsForbidden;
}
/**
 * Tests whether a mention looks like a demonym: its text ends with "人" and the last
 * token, with its final character removed, is a known country name.
 *
 * @param m the mention to test
 * @param countries the set of country names
 * @return true if the mention matches the demonym shape described above
 */
private static boolean mentionIsDemonym(Mention m, Set<String> countries) {
  final String finalToken = m.originalSpan.get(m.originalSpan.size() - 1).word();
  if (finalToken.length() <= 0) {
    return false;
  }
  if (!m.spanToString().endsWith("人")) {
    return false;
  }
  final String withoutLastChar = finalToken.substring(0, finalToken.length() - 1);
  return countries.contains(withoutLastChar);
}
/**
 * Tests for the mention "人" preceded by another token in the sentence.
 *
 * @param m the mention to test
 * @param sent the tokens of the sentence containing the mention
 * @return true if the mention text is exactly "人" and it is not sentence-initial
 *     (see the note on the prior-word test below)
 */
private static boolean mentionIsRangren(Mention m, List<CoreLabel> sent) {
  if (!m.spanToString().equals("人") || m.startIndex <= 0) {
    return false;
  }
  final String previous = sent.get(m.startIndex - 1).word();
  // cdm [2016]: This test matches everything because of the 3rd clause! That can't be right!
  // The third clause (endsWith("")) holds for every string, so the test on 让 / 令 has no
  // effect. It is deliberately kept as is to preserve existing filtering behavior.
  return previous.endsWith("让") || previous.endsWith("令") || previous.endsWith("");
}
/**
 * Tests whether any token of the mention is one of the given interrogative pronouns.
 *
 * @param m the mention whose tokens are examined
 * @param interrogatives the interrogative pronoun word forms
 * @return true if at least one token's word is contained in {@code interrogatives}
 */
private static boolean mentionIsInterrogativePronoun(Mention m, Set<String> interrogatives) {
  boolean hasInterrogative = false;
  for (CoreLabel token : m.originalSpan) {
    // Matches on individual tokens rather than on the whole span string.
    if (interrogatives.contains(token.word())) {
      hasInterrogative = true;
      break;
    }
  }
  return hasInterrogative;
}
/**
 * Extracts mentions which have the same string as another stand-alone mention.
 *
 * <p>For every token position and every string in {@code neStrings}, the token window of
 * matching width is compared (case-insensitively) with the string. The candidate span is
 * first widened by a following "'s" (tagged POS) and by a preceding DT when the widened
 * span is exactly covered by one constituent of the parse tree. A mention is added for a
 * matching window whose span is not already in the sentence's span set.
 *
 * @param sentences the sentences of the document
 * @param mentionSpanSetList per sentence, the spans already used; extended in place
 * @param predictedMentions per sentence, the mention list; extended in place
 * @param neStrings the strings to look for
 */
protected static void extractNamedEntityModifiers(List<CoreMap> sentences, List<Set<IntPair>> mentionSpanSetList, List<List<Mention>> predictedMentions, Set<String> neStrings) {
  final int sentenceCount = sentences.size();
  for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++) {
    final List<Mention> sentenceMentions = predictedMentions.get(sentIdx);
    final CoreMap sentence = sentences.get(sentIdx);
    final List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    final Set<IntPair> knownSpans = mentionSpanSetList.get(sentIdx);

    final int tokenCount = tokens.size();
    for (int start = 0; start < tokenCount; start++) {
      for (String ne : neStrings) {
        final int width = ne.split(" ").length;
        if (start + width > tokens.size()) {
          continue;  // window would run past the end of the sentence
        }

        final StringBuilder joined = new StringBuilder();
        for (int offset = 0; offset < width; offset++) {
          joined.append(tokens.get(start + offset).word()).append(" ");
        }
        final String phrase = joined.toString().trim();

        int beginIndex = start;
        int endIndex = start + width;

        // include "'s" if it belongs to this named entity
        if (endIndex < tokens.size()
            && tokens.get(endIndex).word().equals("'s")
            && tokens.get(endIndex).tag().equals("POS")
            && leavesFormConstituent(sentence.get(TreeAnnotation.class), beginIndex, endIndex)) {
          endIndex++;
        }

        // include DT if it belongs to this named entity
        if (beginIndex > 0
            && tokens.get(beginIndex - 1).tag().equals("DT")
            && leavesFormConstituent(sentence.get(TreeAnnotation.class), beginIndex - 1, endIndex - 1)) {
          beginIndex--;
        }

        final IntPair span = new IntPair(beginIndex, endIndex);
        if (!phrase.equalsIgnoreCase(ne) || knownSpans.contains(span)) {
          continue;
        }

        final SemanticGraph basicGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        SemanticGraph enhancedGraph = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        if (enhancedGraph == null) {
          enhancedGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        }
        final Mention modifier = new Mention(-1, beginIndex, endIndex, tokens, basicGraph, enhancedGraph,
            new ArrayList<>(tokens.subList(beginIndex, endIndex)));
        sentenceMentions.add(modifier);
        knownSpans.add(span);
      }
    }
  }
}

/**
 * Returns whether the leaves at the two positions are exactly the first and last leaf of
 * the node that joins them, i.e. whether that leaf range is covered by a single constituent.
 */
private static boolean leavesFormConstituent(Tree tree, int firstLeafIdx, int lastLeafIdx) {
  final Tree firstLeaf = tree.getLeaves().get(firstLeafIdx);
  final Tree lastLeaf = tree.getLeaves().get(lastLeafIdx);
  final Tree join = tree.joinNode(firstLeaf, lastLeaf);
  final Tree joinFirst = join.getLeaves().get(0);
  final Tree joinLast = join.getLeaves().get(join.getLeaves().size() - 1);
  return firstLeaf == joinFirst && lastLeaf == joinLast;
}
/**
 * Adds the surface string of every named-entity span of a sentence to {@code neStrings}.
 * Tokens are joined with single spaces, and a trailing " 's" is stripped.
 *
 * @param s the sentence providing the tokens
 * @param neStrings receives the named-entity strings
 * @param namedEntitySpanSet the [start, end) token spans of the named entities
 */
protected static void addNamedEntityStrings(CoreMap s, Set<String> neStrings, Set<IntPair> namedEntitySpanSet) {
  final List<CoreLabel> tokens = s.get(TokensAnnotation.class);
  for (IntPair neSpan : namedEntitySpanSet) {
    final StringBuilder joined = new StringBuilder();
    int pos = neSpan.get(0);
    while (pos < neSpan.get(1)) {
      joined.append(tokens.get(pos).word()).append(" ");
      pos++;
    }
    String entityText = joined.toString().trim();
    if (entityText.endsWith(" 's")) {
      // drop the possessive marker together with the space in front of it
      entityText = entityText.substring(0, entityText.length() - 3);
    }
    neStrings.add(entityText);
  }
}
/**
 * Temporary, for debugging: adds a predicted mention for every gold mention whose span
 * is not already in the sentence's span set.
 *
 * @param sentences the sentences of the document
 * @param mentionSpanSetList per sentence, the spans already used; extended in place
 * @param predictedMentions per sentence, the mention list; extended in place
 * @param allGoldMentions per sentence, the gold mentions to copy spans from
 */
protected static void addGoldMentions(List<CoreMap> sentences,
                                      List<Set<IntPair>> mentionSpanSetList,
                                      List<List<Mention>> predictedMentions, List<List<Mention>> allGoldMentions) {
  final int sentenceCount = sentences.size();
  for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++) {
    final List<Mention> sentenceMentions = predictedMentions.get(sentIdx);
    final CoreMap sentence = sentences.get(sentIdx);
    final List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    final Set<IntPair> knownSpans = mentionSpanSetList.get(sentIdx);
    final List<Mention> goldMentions = allGoldMentions.get(sentIdx);

    for (Mention gold : goldMentions) {
      final IntPair span = new IntPair(gold.startIndex, gold.endIndex);
      if (knownSpans.contains(span)) {
        continue;
      }
      final SemanticGraph basicGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      SemanticGraph enhancedGraph = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      if (enhancedGraph == null) {
        enhancedGraph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      }
      final Mention copy = new Mention(-1, gold.startIndex, gold.endIndex, tokens, basicGraph, enhancedGraph,
          new ArrayList<>(tokens.subList(gold.startIndex, gold.endIndex)));
      sentenceMentions.add(copy);
      knownSpans.add(span);
    }
  }
}
/**
 * Sets {@code headIndex}, {@code headWord} and {@code headString} on every mention of a sentence.
 *
 * <p>For Chinese the head is chosen positionally by {@code findHeadChinese}; otherwise it
 * is the syntactic head found by {@code findSyntacticHead}. If the resulting head lies
 * outside the mention's original span, the head is reset to the mention's first token and
 * the head string to the string form of the whole span.
 *
 * @param s the sentence; its parse tree gets its spans (re)indexed from 0
 * @param mentions the mentions of this sentence, updated in place
 */
public void findHead(CoreMap s, List<Mention> mentions) {
  Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  tree.indexSpans(0);
  // Compare by value rather than identity: a Locale equal to CHINESE need not be the same
  // object, and == would then wrongly send Chinese text down the syntactic-head path.
  final boolean chinese = Locale.CHINESE.equals(lang);
  for (Mention m : mentions) {
    if (chinese) {
      findHeadChinese(sent, m);
    } else {
      CoreLabel head = (CoreLabel) findSyntacticHead(m, tree, sent).label();
      // IndexAnnotation is 1-based; headIndex is a 0-based position in the sentence.
      m.headIndex = head.get(CoreAnnotations.IndexAnnotation.class) - 1;
      m.headWord = sent.get(m.headIndex);
      m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
    }
    // Position of the head relative to the start of the mention.
    int start = m.headIndex - m.startIndex;
    if (start < 0 || start >= m.originalSpan.size()) {
      Redwood.log("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex
          + ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
      Redwood.log("Setting head string to entire mention");
      m.headIndex = m.startIndex;
      m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
      m.headString = m.originalSpan.toString();
    }
  }
}
/**
 * Chooses the head of a Chinese mention positionally: the last token of the mention that
 * is not punctuation (tag "PU"), stepping one token further left when that token is 自己
 * and there is still a token of the mention to its left.
 *
 * <p>Sets {@code headIndex}, {@code headWord} and {@code headString} on the mention.
 *
 * @param sent the tokens of the sentence containing the mention
 * @param m the mention to update; {@code [startIndex, endIndex)} is its token span
 */
protected static void findHeadChinese(List<CoreLabel> sent, Mention m) {
  int headPos = m.endIndex - 1;
  // Skip trailing punctuations
  while (headPos > m.startIndex && sent.get(headPos).tag().equals("PU")) {
    headPos--;
  }
  // If we got right to the end without finding non punctuation, reset to end again
  if (headPos == m.startIndex && sent.get(headPos).tag().equals("PU")) {
    headPos = m.endIndex - 1;
  }
  // Only step left of 自己 while staying inside the mention. The previous guard
  // (endIndex != startIndex) held for every non-empty mention, so a mention whose head
  // candidate was its first token moved the head to startIndex - 1: outside the mention,
  // and an out-of-bounds sent.get(-1) when the mention starts the sentence.
  if (sent.get(headPos).originalText().equals("自己") && headPos > m.startIndex) {
    headPos--;
  }
  m.headIndex = headPos;
  m.headWord = sent.get(headPos);
  m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class);
}
/**
 * Finds the leaf of the sentence tree that is the syntactic head of a mention.
 *
 * Strategy, in order:
 * 1. If a constituent spans the mention exactly (ignoring a trailing 's or ' on
 *    multi-token mentions), return its head as computed by safeHead.
 * 2. Otherwise, if allowReparsing is set, parse the mention extent embedded in
 *    "It was ... ." and map the head of that parse back to a leaf of the main tree.
 * 3. Otherwise try the head of the lowest common ancestor of the first and last
 *    token, provided it lies inside the mention.
 * 4. Otherwise return a leaf chosen by a last-noun heuristic over the mention's tokens.
 *
 * @param m the mention whose head is wanted
 * @param root the parse tree of the sentence (span-indexed)
 * @param tokens the tokens of the sentence
 * @return a leaf of root taken to be the head of the mention
 */
public Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
// mention ends with 's
int endIdx = m.endIndex;
if (m.originalSpan.size() > 0) {
String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
// drop a trailing possessive marker from the span, unless it is the whole mention
if((lastWord.equals("'s") || lastWord.equals("'"))
&& m.originalSpan.size() != 1 ) endIdx--;
}
Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
//
// found an exact match
//
if (exactMatch != null) {
return safeHead(exactMatch, endIdx);
}
// no exact match found
// in this case, we parse the actual extent of the mention, embedded in a sentence
// context, so as to make the parser work better :-)
if (allowReparsing) {
// number of "-" tokens dropped from the extent; the real head index may be off by up to this much
int approximateness = 0;
List<CoreLabel> extentTokens = new ArrayList<>();
extentTokens.add(initCoreLabel("It", "PRP"));
extentTokens.add(initCoreLabel("was", "VBD"));
final int ADDED_WORDS = 2;
for (int i = m.startIndex; i < endIdx; i++) {
// Add everything except separated dashes! The separated dashes mess with the parser too badly.
CoreLabel label = tokens.get(i);
if ( ! "-".equals(label.word())) {
extentTokens.add(tokens.get(i));
} else {
approximateness++;
}
}
extentTokens.add(initCoreLabel(".", "."));
// constrain the parse to the part we're interested in.
// Starting from ADDED_WORDS comes from skipping "It was".
// -1 to exclude the period.
// We now let it be any kind of nominal constituent, since there
// are VP and S ones
ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
List<ParserConstraint> constraints = Collections.singletonList(constraint);
Tree tree = parse(extentTokens, constraints);
convertToCoreLabels(tree); // now unnecessary, as parser uses CoreLabels?
tree.indexSpans(m.startIndex - ADDED_WORDS); // remember it has ADDED_WORDS extra words at the beginning
Tree subtree = findPartialSpan(tree, m.startIndex);
// There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
// Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
// passed the right end (that is, just that final period).
Tree extentHead = safeHead(subtree, endIdx);
assert(extentHead != null);
// extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
// Because we deleted dashes, it's index will be >= the index in the extent parse tree
CoreLabel l = (CoreLabel) extentHead.label();
Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
assert(realHead != null);
return realHead;
}
// If reparsing wasn't allowed, try to find a span in the tree
// which happens to have the head
Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
if (wordMatch != null) {
Tree head = safeHead(wordMatch, endIdx);
if (head != null) {
int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
// accept the head only if it falls inside the mention
if (index >= m.startIndex && index < endIdx) {
return head;
}
}
}
// If that didn't work, guess that it's the last word
// (more precisely: the last token tagged N* before the first token tagged W*, else the last word)
int lastNounIdx = endIdx-1;
// NOTE(review): this loop runs to m.endIndex rather than endIdx, so a stripped trailing 's is still inspected -- confirm intended
for(int i=m.startIndex ; i < m.endIndex ; i++) {
if(tokens.get(i).tag().startsWith("N")) lastNounIdx = i;
else if(tokens.get(i).tag().startsWith("W")) break;
}
List<Tree> leaves = root.getLeaves();
Tree endLeaf = leaves.get(lastNounIdx);
return endLeaf;
}
/**
 * Find the tree that covers the portion of interest: descends from {@code root} until it
 * reaches a node whose begin index equals {@code start}.
 *
 * @param root the (span-indexed) tree to search
 * @param start the begin index to look for
 * @return the highest node on the descent path whose begin index is {@code start}
 * @throws RuntimeException if a node on the path has no child whose span contains {@code start}
 */
private static Tree findPartialSpan(final Tree root, final int start) {
  Tree node = root;
  while (true) {
    final CoreLabel nodeLabel = (CoreLabel) node.label();
    final int nodeStart = nodeLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
    if (nodeStart == start) {
      return node;
    }

    // Descend into the first child whose [begin, end) span contains start.
    Tree next = null;
    for (Tree kid : node.children()) {
      final CoreLabel kidLabel = (CoreLabel) kid.label();
      final int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
      final int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class);
      if (kidStart <= start && start < kidEnd) {
        next = kid;
        break;
      }
    }
    if (next == null) {
      throw new RuntimeException("Shouldn't happen: " + start + " " + node);
    }
    node = next;
  }
}
/**
 * Finds the leaf of {@code root} with the given token text whose 0-based position lies in
 * {@code [index, index + approximateness]}.
 *
 * <p>If there is no such leaf, the failure is logged and the first leaf with the same token
 * text is returned; failing that, the second-to-last leaf (or the first leaf, if there are
 * fewer than two) is returned as a last resort.
 *
 * @param root the tree whose leaves are searched
 * @param token the token text to match against the leaf value
 * @param index the smallest acceptable 0-based position
 * @param approximateness how far beyond {@code index} the position may lie
 * @return the leaf chosen as described above
 */
private static Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) {
  final List<Tree> leaves = root.getLeaves();

  // First choice: same text, position within the tolerated window.
  for (Tree leaf : leaves) {
    final Integer oneBasedIndex = ((CoreLabel) leaf.label()).get(CoreAnnotations.IndexAnnotation.class);
    if (oneBasedIndex == null) {
      continue;
    }
    final int position = oneBasedIndex - 1;
    if (token.equals(leaf.value()) && index <= position && position <= index + approximateness) {
      return leaf;
    }
  }

  // this shouldn't happen
  Redwood.log("RuleBasedCorefMentionFinder: Failed to find head token:\n" +
      "Tree is: " + root + "\n" +
      "token = |" + token + "|" + index + "|, approx=" + approximateness);

  // Second choice: same text anywhere in the sentence.
  for (Tree leaf : leaves) {
    if (token.equals(leaf.value())) {
      return leaf;
    }
  }

  // last except for the added period.
  final int lastResort = Math.max(0, leaves.size() - 2);
  Redwood.log("RuleBasedCorefMentionFinder: Last resort: returning as head: " + leaves.get(lastResort));
  return leaves.get(lastResort);
}
/**
 * Builds a fresh token label carrying the given text (as both text and value) and POS tag.
 *
 * @param token the word form
 * @param posTag the part-of-speech tag
 * @return the new label
 */
private static CoreLabel initCoreLabel(String token, String posTag) {
  final CoreLabel synthetic = new CoreLabel();
  synthetic.set(CoreAnnotations.TextAnnotation.class, token);
  synthetic.set(CoreAnnotations.ValueAnnotation.class, token);
  synthetic.set(CoreAnnotations.PartOfSpeechAnnotation.class, posTag);
  return synthetic;
}
/**
 * Parses the given tokens without any parser constraints.
 *
 * @param tokens the tokens of the sentence to parse
 * @return the parse tree produced by the parser annotator
 */
private Tree parse(List<CoreLabel> tokens) {
  final List<ParserConstraint> noConstraints = null;
  return parse(tokens, noConstraints);
}
/**
 * Parses the given tokens as a one-sentence document using the parser annotator.
 *
 * @param tokens the tokens of the sentence to parse
 * @param constraints parser constraints to attach to the sentence; may be null
 * @return the tree annotation of the first sentence after annotation
 */
private Tree parse(List<CoreLabel> tokens,
                   List<ParserConstraint> constraints) {
  // Wrap the tokens in a sentence ...
  final CoreMap sentence = new Annotation("");
  sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
  sentence.set(ParserAnnotations.ConstraintAnnotation.class, constraints);

  // ... and the sentence in a document, which is what the annotator consumes.
  final List<CoreMap> sentenceList = new ArrayList<>(1);
  sentenceList.add(sentence);
  final Annotation document = new Annotation("");
  document.set(CoreAnnotations.SentencesAnnotation.class, sentenceList);

  getParser().annotate(document);

  final List<CoreMap> parsedSentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  return parsedSentences.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
}
/**
 * Returns the parser annotator, looking up the existing "parse" annotator from
 * {@code StanfordCoreNLP} on first use and caching it in {@code parserProcessor}.
 *
 * @return the parser annotator
 */
private Annotator getParser() {
  if (parserProcessor != null) {
    return parserProcessor;
  }
  parserProcessor = StanfordCoreNLP.getExistingAnnotator("parse");
  assert(parserProcessor != null);
  return parserProcessor;
}
/**
 * Ensures every node of the tree is labeled with a {@code CoreLabel}: any other kind of
 * label is replaced by a new {@code CoreLabel} carrying the same value.
 *
 * <p>This probably isn't needed now; everything is always a core label. But no-op.
 *
 * @param tree the root of the tree to convert in place
 */
private static void convertToCoreLabels(Tree tree) {
  // Explicit worklist instead of recursion; every node is handled independently.
  final List<Tree> pending = new ArrayList<>();
  pending.add(tree);
  while (!pending.isEmpty()) {
    final Tree node = pending.remove(pending.size() - 1);
    final Label current = node.label();
    if (!(current instanceof CoreLabel)) {
      final CoreLabel replacement = new CoreLabel();
      replacement.setValue(current.value());
      node.setLabel(replacement);
    }
    for (Tree kid : node.children()) {
      pending.add(kid);
    }
  }
}
/**
 * Returns the head terminal of a subtree, making sure it lies before {@code endIndex}.
 *
 * <p>The head found by {@code headFinder} is used when its 0-based index is below
 * {@code endIndex}. Otherwise the right-most leaf with an index below {@code endIndex} is
 * returned, and if there is none, {@code top} itself.
 *
 * @param top the subtree whose head is wanted
 * @param endIndex exclusive upper bound on the 0-based index of the head
 * @return the chosen head leaf, or {@code top} as a fallback
 */
private Tree safeHead(Tree top, int endIndex) {
  // The trees passed in do not have the CoordinationTransformer
  // applied, but that just means the SemanticHeadFinder results are
  // slightly worse.
  final Tree finderHead = top.headTerminal(headFinder);
  // One obscure failure case is that the added period becomes the head. Disallow this.
  if (finderHead != null) {
    final Integer oneBasedIndex = ((CoreLabel) finderHead.label()).get(CoreAnnotations.IndexAnnotation.class);
    if (oneBasedIndex != null && oneBasedIndex - 1 < endIndex) {
      return finderHead;
    }
  }

  // if no head found return the right-most leaf
  final List<Tree> leaves = top.getLeaves();
  for (int pos = leaves.size() - 1; pos >= 0; pos--) {
    final Tree leaf = leaves.get(pos);
    final Integer oneBasedIndex = ((CoreLabel) leaf.label()).get(CoreAnnotations.IndexAnnotation.class);
    if (oneBasedIndex != null && oneBasedIndex - 1 < endIndex) {
      return leaf;
    }
  }

  // fallback: return top
  return top;
}
/**
 * Returns the lowest common ancestor of the leaves at positions {@code start} and
 * {@code end - 1} of the tree.
 *
 * @param tree the tree to search
 * @param start 0-based position of the first leaf
 * @param end exclusive 0-based position after the last leaf
 * @return the lowest node dominating both boundary leaves
 */
static Tree findTreeWithSmallestSpan(Tree tree, int start, int end) {
  final List<Tree> leaves = tree.getLeaves();
  final List<Tree> boundaryLeaves = Arrays.asList(leaves.get(start), leaves.get(end - 1));
  return Trees.getLowestCommonAncestor(boundaryLeaves, tree);
}
/**
 * Searches the tree depth-first for a node whose begin/end index annotations equal
 * {@code start} and {@code end}.
 *
 * @param tree the (span-indexed) tree to search
 * @param start the wanted begin index
 * @param end the wanted end index
 * @return the first matching node in depth-first order, or null if there is none
 */
private static Tree findTreeWithSpan(Tree tree, int start, int end) {
  final CoreLabel label = (CoreLabel) tree.label();
  final boolean spanIndexed = label != null
      && label.containsKey(CoreAnnotations.BeginIndexAnnotation.class)
      && label.containsKey(CoreAnnotations.EndIndexAnnotation.class);
  if (spanIndexed) {
    final int nodeStart = label.get(CoreAnnotations.BeginIndexAnnotation.class);
    final int nodeEnd = label.get(CoreAnnotations.EndIndexAnnotation.class);
    if (nodeStart == start && nodeEnd == end) {
      // found perfect match
      return tree;
    }
    if (end < nodeStart || start >= nodeEnd) {
      // the wanted span lies outside this node: prune the subtree
      return null;
    }
  }

  // otherwise, check inside children - a match is possible
  for (Tree kid : tree.children()) {
    if (kid != null) {
      final Tree found = findTreeWithSpan(kid, start, end);
      if (found != null) {
        return found;
      }
    }
  }

  // no match
  return null;
}
/**
 * Tests whether a mention follows a partitive construction, i.e. is directly preceded by
 * "of" which in turn is preceded by a word from {@code dict.parts}.
 *
 * @param m the mention to test
 * @param sent the tokens of the sentence containing the mention
 * @param dict dictionaries for coref
 * @return true if the two tokens before the mention are a part word followed by "of"
 */
public static boolean partitiveRule(Mention m, List<CoreLabel> sent, Dictionaries dict) {
  if (m.startIndex < 2) {
    return false;  // not enough preceding tokens
  }
  final String previous = sent.get(m.startIndex - 1).get(CoreAnnotations.TextAnnotation.class);
  if (!previous.equalsIgnoreCase("of")) {
    return false;
  }
  final String beforePrevious = sent.get(m.startIndex - 2).get(CoreAnnotations.TextAnnotation.class);
  return dict.parts.contains(beforePrevious.toLowerCase(Locale.ENGLISH));
}
/**
 * Compiled Tregex patterns used to check whether an 'it' is pleonastic
 * (e.g., It is possible that ...). Built once by getPleonasticPatterns().
 */
private static final TregexPattern[] pleonasticPatterns = getPleonasticPatterns();
/**
 * Checks whether a mention is a pleonastic 'it' (e.g., "It is possible that ...").
 *
 * @param m the mention to test; only mentions whose text is "it" (any case) can qualify
 * @param tree the tree the pleonastic patterns are matched against
 * @return true if some pleonastic pattern matches with the mention's head word as its pronoun
 */
public static boolean isPleonastic(Mention m, Tree tree) {
  if (!m.spanToString().equalsIgnoreCase("it")) {
    return false;
  }
  boolean pleonastic = false;
  for (int idx = 0; idx < pleonasticPatterns.length && !pleonastic; idx++) {
    pleonastic = checkPleonastic(m, tree, pleonasticPatterns[idx]);
  }
  return pleonastic;
}
/**
 * Debugging variant of {@code isPleonastic}: tries every pattern (no early exit) and
 * appends a report to {@code sbLog}, including the index of the last matching pattern
 * and the Penn-style string of the mention's context parse tree.
 *
 * @param m the mention to test; only mentions whose text is "it" (any case) are examined
 * @param tree the tree the pleonastic patterns are matched against
 * @param sbLog receives the report; untouched when the mention text is not "it"
 * @return true if at least one pleonastic pattern matched
 */
public static boolean isPleonasticDebug(Mention m, Tree tree, StringBuilder sbLog) {
  if (!m.spanToString().equalsIgnoreCase("it")) {
    return false;
  }

  boolean pleonastic = false;
  int lastMatchingPattern = -1;
  for (int idx = 0; idx < pleonasticPatterns.length; idx++) {
    if (checkPleonastic(m, tree, pleonasticPatterns[idx])) {
      pleonastic = true;
      lastMatchingPattern = idx;
    }
  }

  final boolean correct = m.hasTwin != pleonastic;
  sbLog.append("PLEONASTIC IT: mention ID: ").append(m.mentionID)
      .append("\thastwin: ").append(m.hasTwin)
      .append("\tpleonastic it? ").append(pleonastic)
      .append("\tcorrect? ").append(correct)
      .append("\tmatched pattern: ").append(lastMatchingPattern)
      .append("\n");
  sbLog.append(m.contextParseTree.pennString()).append("\n");
  sbLog.append("PLEONASTIC IT END\n");
  return pleonastic;
}
/**
 * Compiles the Tregex patterns that recognize pleonastic 'it' constructions. In every
 * pattern the pronoun node is named {@code m1}.
 *
 * @return the compiled patterns, in the order they are listed below
 */
private static TregexPattern[] getPleonasticPatterns() {
  final String[] sources = {
    // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
    // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.

    //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
    "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",  // in practice, go with this one (best results)

    "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
    "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev

    "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
    "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....

    // these next 5 had buggy space in "$ ..", which I fixed
    "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",

    "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
    "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
    // certain can be either but relatively likely pleonastic with it ... be
    // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones

    "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
    "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",

    "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",

    "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
  };

  final TregexPattern[] compiled = new TregexPattern[sources.length];
  int slot = 0;
  for (String source : sources) {
    compiled[slot] = TregexPattern.compile(source);
    slot++;
  }
  return compiled;
}
/**
 * Checks one pleonastic pattern against the tree: returns true if some match binds
 * {@code m1} to the node whose begin index corresponds to the mention's head word.
 *
 * <p>Any exception raised while matching is printed to standard error and treated as
 * "no match".
 *
 * @param m the mention being tested
 * @param tree the tree to match against
 * @param tgrepPattern the pleonastic pattern to try
 * @return true if the pattern matches at the mention's head word
 */
private static boolean checkPleonastic(Mention m, Tree tree, TregexPattern tgrepPattern) {
  try {
    final TregexMatcher matcher = tgrepPattern.matcher(tree);
    while (matcher.find()) {
      final Tree pronounNode = matcher.getNode("m1");
      // BeginIndexAnnotation is 0-based, IndexAnnotation 1-based: align them before comparing.
      final int pronounIndex = ((CoreLabel) pronounNode.label()).get(CoreAnnotations.BeginIndexAnnotation.class) + 1;
      final int headIndex = m.headWord.get(CoreAnnotations.IndexAnnotation.class);
      if (pronounIndex == headIndex) {
        return true;
      }
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return false;
}
}