// // StanfordCoreNLP -- a suite of NLP tools // Copyright (c) 2009-2010 The Board of Trustees of // The Leland Stanford Junior University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // For more information, bug reports, fixes, contact: // Christopher Manning // Dept of Computer Science, Gates 1A // Stanford CA 94305-9010 // USA // package edu.stanford.nlp.coref.hybrid.sieve; import edu.stanford.nlp.util.logging.Redwood; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Set; import edu.stanford.nlp.coref.CorefRules; import edu.stanford.nlp.coref.data.CorefCluster; import edu.stanford.nlp.coref.data.Dictionaries; import edu.stanford.nlp.coref.data.Document; import edu.stanford.nlp.coref.data.Mention; import edu.stanford.nlp.coref.data.Dictionaries.MentionType; import edu.stanford.nlp.coref.data.Dictionaries.Number; import edu.stanford.nlp.coref.data.Dictionaries.Person; import edu.stanford.nlp.coref.data.Document.DocType; import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter; import edu.stanford.nlp.coref.hybrid.HybridCorefProperties; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation; import edu.stanford.nlp.trees.Tree; /** * Base class for a Coref Sieve. * Each sieve extends this class, and set flags for its own options in the constructor. * * @author heeyoung * @author mihais */ public abstract class DeterministicCorefSieve extends Sieve { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(DeterministicCorefSieve.class); public final DcorefSieveOptions flags; public DeterministicCorefSieve() { super(); this.classifierType = ClassifierType.RULE; flags = new DcorefSieveOptions(); } public DeterministicCorefSieve(Properties props) { super(props); this.classifierType = ClassifierType.RULE; flags = new DcorefSieveOptions(); } public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception { // check for skip: first mention only, discourse salience if(!this.flags.USE_SPEAKERMATCH && !this.flags.USE_DISCOURSEMATCH && !this.flags.USE_APPOSITION && !this.flags.USE_PREDICATENOMINATIVES && this.skipThisMention(document, m, document.corefClusters.get(m.corefClusterID), dict)) { return; } Set<Mention> roleSet = document.roleSet; for (int sentJ = m.sentNum; sentJ >= 0; sentJ--) { List<Mention> l = Sieve.getOrderedAntecedents(m, sentJ, mIdx, document.predictedMentions, dict); if(maxSentDist != -1 && m.sentNum - sentJ > maxSentDist) continue; // TODO: do we need this? // Sort mentions by length whenever we have two mentions beginning at the same position and having the same head for(int i = 0; i < l.size(); i++) { for(int j = 0; j < l.size(); j++) { if(l.get(i).headString.equals(l.get(j).headString) && l.get(i).startIndex == l.get(j).startIndex && l.get(i).sameSentence(l.get(j)) && j > i && l.get(i).spanToString().length() > l.get(j).spanToString().length()) { l.set(j, l.set(i, l.get(j))); // log.info("antecedent ordering changed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); } } } for (Mention ant : l) { if(skipForAnalysis(ant, m, props)) continue; // m2 - antecedent of m1 // Skip singletons according to the singleton predictor // (only for non-NE mentions) // Recasens, de Marneffe, and Potts (NAACL 2013) if (m.isSingleton && m.mentionType != MentionType.PROPER && ant.isSingleton && ant.mentionType != MentionType.PROPER) continue; if (m.corefClusterID == ant.corefClusterID) continue; if(!mType.contains(m.mentionType) || !aType.contains(ant.mentionType)) continue; if(m.mentionType == MentionType.PRONOMINAL) { if(!matchedMentionType(m, mTypeStr)) continue; if(!matchedMentionType(ant, aTypeStr)) continue; } CorefCluster c1 = document.corefClusters.get(m.corefClusterID); CorefCluster c2 = document.corefClusters.get(ant.corefClusterID); assert(c1 != null); assert(c2 != null); if (this.useRoleSkip()) { if (m.isRoleAppositive(ant, dict)) { roleSet.add(m); } else if (ant.isRoleAppositive(m, dict)) { roleSet.add(ant); } continue; } if (this.coreferent(document, c1, c2, m, ant, dict, roleSet)) { // print logs for analysis // if (doScore()) { // printLogs(c1, c2, m1, m2, document, currentSieve); // } // print dcoref log if(HybridCorefProperties.debug(props)) { sbLog.append(HybridCorefPrinter.printErrorLogDcoref(m, ant, document, dict, mIdx, this.getClass().getName())); } int removeID = c1.clusterID; // System.out.println("Merging ant "+c2+" with "+c1); CorefCluster.mergeClusters(c2, c1); document.mergeIncompatibles(c2, c1); document.mergeAcronymCache(c2, c1); // logger.warning("Removing cluster " + removeID + ", merged with " + c2.getClusterID()); document.corefClusters.remove(removeID); return; } } } // End of "LOOP" } public String flagsToString() { return flags.toString(); } public boolean useRoleSkip() { return flags.USE_ROLE_SKIP; } /** Skip this mention? (search pruning) */ public boolean skipThisMention(Document document, Mention m1, CorefCluster c, Dictionaries dict) { boolean skip = false; // only do for the first mention in its cluster // if(!flags.USE_EXACTSTRINGMATCH && !flags.USE_ROLEAPPOSITION && !flags.USE_PREDICATENOMINATIVES if(!flags.USE_ROLEAPPOSITION && !flags.USE_PREDICATENOMINATIVES // CHINESE CHANGE && !flags.USE_ACRONYM && !flags.USE_APPOSITION && !flags.USE_RELATIVEPRONOUN && !c.getFirstMention().equals(m1)) { return true; } if(m1.appositions == null && m1.predicateNominatives == null && (m1.lowercaseNormalizedSpanString().startsWith("a ") || m1.lowercaseNormalizedSpanString().startsWith("an ")) && !flags.USE_EXACTSTRINGMATCH) { skip = true; // A noun phrase starting with an indefinite article - unlikely to have an antecedent (e.g. "A commission" was set up to .... ) } if(dict.indefinitePronouns.contains(m1.lowercaseNormalizedSpanString())) { skip = true; // An indefinite pronoun - unlikely to have an antecedent (e.g. "Some" say that... ) } for(String indef : dict.indefinitePronouns){ if(m1.lowercaseNormalizedSpanString().startsWith(indef + " ")) { skip = true; // A noun phrase starting with an indefinite adjective - unlikely to have an antecedent (e.g. "Another opinion" on the topic is...) break; } } return skip; } public boolean checkEntityMatch( Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict, Set<Mention> roleSet) { return false; } /** * Checks if two clusters are coreferent according to our sieve pass constraints * @param document * @throws Exception */ public boolean coreferent(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention mention2, Mention ant, Dictionaries dict, Set<Mention> roleSet) throws Exception { boolean ret = false; Mention mention = mentionCluster.getRepresentativeMention(); if (flags.USE_INCOMPATIBLES) { // Check our list of incompatible mentions and don't cluster them together // Allows definite no's from previous sieves to propagate down if (document.isIncompatible(mentionCluster, potentialAntecedent)) { return false; } } if (flags.DO_PRONOUN && Math.abs(mention2.sentNum-ant.sentNum) > 3 && mention2.person!=Person.I && mention2.person!=Person.YOU) { return false; } if (mention2.lowercaseNormalizedSpanString().equals("this") && Math.abs(mention2.sentNum-ant.sentNum) > 3) { return false; } if (mention2.person==Person.YOU && document.docType==DocType.ARTICLE && mention2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0")) { return false; } if (document.conllDoc != null) { if (ant.generic && ant.person==Person.YOU) return false; if (mention2.generic) return false; } // chinese newswire contains coref nested NPs with shared headword Chen & Ng if(lang != Locale.CHINESE || document.docInfo == null || !document.docInfo.getOrDefault("DOC_ID","").contains("nw")) { if(mention2.insideIn(ant) || ant.insideIn(mention2)) return false; } if(flags.USE_SPEAKERMATCH) { String mSpeaker = mention2.headWord.get(SpeakerAnnotation.class); String aSpeaker = ant.headWord.get(SpeakerAnnotation.class); // <I> from same speaker if(mention2.person == Person.I && ant.person == Person.I) return (mSpeaker.equals(aSpeaker)); // <I> - speaker if( (mention2.person == Person.I && mSpeaker.equals(Integer.toString(ant.mentionID))) || (ant.person == Person.I && aSpeaker.equals(Integer.toString(mention2.mentionID))) ) return true; } if(flags.USE_DISCOURSEMATCH) { String mString = mention.lowercaseNormalizedSpanString(); String antString = ant.lowercaseNormalizedSpanString(); // mention and ant both belong to the same speaker cluster if (mention.speakerInfo != null && mention.speakerInfo == ant.speakerInfo) { return true; } // (I - I) in the same speaker's quotation. if (mention.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString) && ant.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, mention, ant)){ return true; } // (speaker - I) if ((mention.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString)) && CorefRules.antecedentIsMentionSpeaker(document, mention, ant, dict)) { if (mention.speakerInfo == null && ant.speakerInfo != null) { mention.speakerInfo = ant.speakerInfo; } return true; } // (I - speaker) if ((ant.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString)) && CorefRules.antecedentIsMentionSpeaker(document, ant, mention, dict)) { if (ant.speakerInfo == null && mention.speakerInfo != null) { ant.speakerInfo = mention.speakerInfo; } return true; } // Can be iffy if more than two speakers... but still should be okay most of the time if (dict.secondPersonPronouns.contains(mString) && dict.secondPersonPronouns.contains(antString) && CorefRules.entitySameSpeaker(document, mention, ant)) { return true; } // previous I - you or previous you - I in two person conversation if (((mention.person==Person.I && ant.person==Person.YOU || (mention.person==Person.YOU && ant.person==Person.I)) && (mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class)-ant.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1) && document.docType==DocType.CONVERSATION)) { return true; } if (dict.reflexivePronouns.contains(mention.headString) && CorefRules.entitySubjectObject(mention, ant)){ return true; } } if (!flags.USE_EXACTSTRINGMATCH && !flags.USE_RELAXED_EXACTSTRINGMATCH && !flags.USE_APPOSITION && !flags.USE_WORDS_INCLUSION) { for(Mention m : mentionCluster.getCorefMentions()) { for(Mention a : potentialAntecedent.getCorefMentions()){ // angelx - not sure about the logic here, disable (code was also refactored from original) // vv gabor - re-enabled code (seems to improve performance) vv if(m.person!=Person.I && a.person!=Person.I && (CorefRules.antecedentIsMentionSpeaker(document, m, a, dict) || CorefRules.antecedentIsMentionSpeaker(document, a, m, dict))) { document.addIncompatible(m, a); return false; } // ^^ end block of code in question ^^ int dist = Math.abs(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - a.headWord.get(CoreAnnotations.UtteranceAnnotation.class)); if(document.docType!=DocType.ARTICLE && dist==1 && !CorefRules.entitySameSpeaker(document, m, a)) { String mSpeaker = document.speakers.get(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class)); String aSpeaker = document.speakers.get(a.headWord.get(CoreAnnotations.UtteranceAnnotation.class)); if(m.person==Person.I && a.person==Person.I) { document.addIncompatible(m, a); return false; } if(m.person==Person.YOU && a.person==Person.YOU) { document.addIncompatible(m, a); return false; } // This is weak since we can refer to both speakers if(m.person==Person.WE && a.person==Person.WE) { document.addIncompatible(m, a); return false; } } } } if(document.docType==DocType.ARTICLE) { for(Mention m : mentionCluster.getCorefMentions()) { for(Mention a : potentialAntecedent.getCorefMentions()){ if(CorefRules.entitySubjectObject(m, a)) { document.addIncompatible(m, a); return false; } } } } } // Incompatibility constraints - do before match checks if(flags.USE_iwithini && CorefRules.entityIWithinI(mention, ant, dict)) { document.addIncompatible(mention, ant); return false; } // Match checks if(flags.USE_EXACTSTRINGMATCH && CorefRules.entityExactStringMatch(mention, ant, dict, roleSet)){ return true; } // if(flags.USE_EXACTSTRINGMATCH && Rules.entityExactStringMatch(mentionCluster, potentialAntecedent, dict, roleSet)){ // return true; // } if (flags.USE_NAME_MATCH && checkEntityMatch(document, mentionCluster, potentialAntecedent, dict, roleSet)) { ret = true; } if(flags.USE_RELAXED_EXACTSTRINGMATCH && CorefRules.entityRelaxedExactStringMatch(mentionCluster, potentialAntecedent, mention, ant, dict, roleSet)){ return true; } if(flags.USE_APPOSITION && CorefRules.entityIsApposition(mentionCluster, potentialAntecedent, mention, ant)) { return true; } if(flags.USE_PREDICATENOMINATIVES && CorefRules.entityIsPredicateNominatives(mentionCluster, potentialAntecedent, mention, ant)) { return true; } if(flags.USE_ACRONYM && CorefRules.entityIsAcronym(document, mentionCluster, potentialAntecedent)) { return true; } if(flags.USE_RELATIVEPRONOUN && CorefRules.entityIsRelativePronoun(mention, ant)){ return true; } if(flags.USE_DEMONYM && mention.isDemonym(ant, dict)){ return true; } if(flags.USE_ROLEAPPOSITION){ if(lang==Locale.CHINESE) ret = false; else if(CorefRules.entityIsRoleAppositive(mentionCluster, potentialAntecedent, mention, ant, dict)) ret = true; } if(flags.USE_INCLUSION_HEADMATCH && CorefRules.entityHeadsAgree(mentionCluster, potentialAntecedent, mention, ant, dict)){ ret = true; } if(flags.USE_RELAXED_HEADMATCH && CorefRules.entityRelaxedHeadsAgreeBetweenMentions(mentionCluster, potentialAntecedent, mention, ant) ){ ret = true; } if(flags.USE_WORDS_INCLUSION && ret && ! CorefRules.entityWordsIncluded(mentionCluster, potentialAntecedent, mention, ant)) { return false; } if(flags.USE_INCOMPATIBLE_MODIFIER && ret && CorefRules.entityHaveIncompatibleModifier(mentionCluster, potentialAntecedent)) { return false; } if(flags.USE_PROPERHEAD_AT_LAST && ret && !CorefRules.entitySameProperHeadLastWord(mentionCluster, potentialAntecedent, mention, ant)) { return false; } if(flags.USE_ATTRIBUTES_AGREE && !CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent)) { return false; } if(flags.USE_DIFFERENT_LOCATION && CorefRules.entityHaveDifferentLocation(mention, ant, dict)) { if(flags.USE_PROPERHEAD_AT_LAST && ret && mention.goldCorefClusterID!=ant.goldCorefClusterID) { } return false; } if(flags.USE_NUMBER_IN_MENTION && CorefRules.entityNumberInLaterMention(mention, ant)) { if(flags.USE_PROPERHEAD_AT_LAST && ret && mention.goldCorefClusterID!=ant.goldCorefClusterID) { } return false; } if(flags.USE_DISTANCE && CorefRules.entityTokenDistance(mention2, ant)){ return false; } if(flags.USE_COREF_DICT){ // Head match if(ant.headWord.lemma().equals(mention2.headWord.lemma())) return false; // Constraint: ignore pairs commonNoun - properNoun if(ant.mentionType != MentionType.PROPER && ( mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP") || !mention2.headWord.word().substring(1).equals(mention2.headWord.word().substring(1).toLowerCase()) ) ) return false; // Constraint: ignore plurals if(ant.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS") && mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS")) return false; // Constraint: ignore mentions with indefinite determiners if(dict.indefinitePronouns.contains(ant.originalSpan.get(0).lemma()) || dict.indefinitePronouns.contains(mention2.originalSpan.get(0).lemma())) return false; // Constraint: ignore coordinated mentions if(ant.isCoordinated() || mention2.isCoordinated()) return false; // Constraint: context incompatibility if(CorefRules.contextIncompatible(mention2, ant, dict)) return false; // Constraint: sentence context incompatibility when the mentions are common nouns if(CorefRules.sentenceContextIncompatible(mention2, ant, dict)) return false; if(CorefRules.entityClusterAllCorefDictionary(mentionCluster, potentialAntecedent, dict, 1, 8)) return true; if(CorefRules.entityCorefDictionary(mention, ant, dict, 2, 2)) return true; if(CorefRules.entityCorefDictionary(mention, ant, dict, 3, 2)) return true; if(CorefRules.entityCorefDictionary(mention, ant, dict, 4, 2)) return true; } if(flags.DO_PRONOUN){ Mention m; if (mention.predicateNominatives!=null && mention.predicateNominatives.contains(mention2)) { m = mention2; } else { m = mention; } boolean mIsPronoun = (m.isPronominal() || dict.allPronouns.contains(m.toString())); boolean attrAgree = HybridCorefProperties.useDefaultPronounAgreement(props)? CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent): CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent, lang); if(mIsPronoun && attrAgree){ if(dict.demonymSet.contains(ant.lowercaseNormalizedSpanString()) && dict.notOrganizationPRP.contains(m.headString)){ document.addIncompatible(m, ant); return false; } if(CorefRules.entityPersonDisagree(document, mentionCluster, potentialAntecedent, dict)){ document.addIncompatible(m, ant); return false; } return true; } } if(flags.USE_CHINESE_HEAD_MATCH) { if (mention2.headWord == ant.headWord && mention2.insideIn(ant)) { if(!document.isCoref(mention2, ant)) { // TODO: exclude conjunction // log.info("error in chinese head match: "+mention2.spanToString()+"\t"+ant.spanToString()); } return true; } } return ret; } /** * Orders the antecedents for the given mention (m1) * @param antecedentSentence * @param mySentence * @param orderedMentions * @param orderedMentionsBySentence * @param m1 * @param m1Position * @param corefClusters * @param dict * @return An ordering of potential antecedents depending on same/different sentence, etc. */ public List<Mention> getOrderedAntecedents( int antecedentSentence, int mySentence, List<Mention> orderedMentions, List<List<Mention>> orderedMentionsBySentence, Mention m1, int m1Position, Map<Integer, CorefCluster> corefClusters, Dictionaries dict) { List<Mention> orderedAntecedents = new ArrayList<>(); // ordering antecedents if (antecedentSentence == mySentence) { // same sentence orderedAntecedents.addAll(orderedMentions.subList(0, m1Position)); if(flags.DO_PRONOUN && m1.isPronominal()) { // TODO orderedAntecedents = sortMentionsForPronoun(orderedAntecedents, m1); } if(dict.relativePronouns.contains(m1.spanToString())) Collections.reverse(orderedAntecedents); } else { // previous sentence orderedAntecedents.addAll(orderedMentionsBySentence.get(antecedentSentence)); } return orderedAntecedents; } /** Divides a sentence into clauses and sort the antecedents for pronoun matching */ private static List<Mention> sortMentionsForPronoun(List<Mention> l, Mention m1) { List<Mention> sorted = new ArrayList<>(); Tree tree = m1.contextParseTree; Tree current = m1.mentionSubTree; if(tree==null || current==null) return l; while(true){ current = current.ancestor(1, tree); if(current.label().value().startsWith("S")){ for(Mention m : l){ if(!sorted.contains(m) && current.dominates(m.mentionSubTree)) sorted.add(m); } } if(current.ancestor(1, tree)==null) break; } if(l.size()!=sorted.size()) { sorted=l; } else if(!l.equals(sorted)){ for(int i=0; i<l.size(); i++){ Mention ml = l.get(i); Mention msorted = sorted.get(i); } } else { } return sorted; } }