// DeterministicCorefSieve.java example

// Explorer
// CoreNLP-master
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.coref.hybrid.sieve;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.coref.CorefRules;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Dictionaries.Number;
import edu.stanford.nlp.coref.data.Dictionaries.Person;
import edu.stanford.nlp.coref.data.Document.DocType;
import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter;
import edu.stanford.nlp.coref.hybrid.HybridCorefProperties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation;
import edu.stanford.nlp.trees.Tree;

/**
 *  Base class for a Coref Sieve.
 *  Each sieve extends this class and sets the flags for its own options in its constructor.
 *
 *  @author heeyoung
 *  @author mihais
 */
public abstract class DeterministicCorefSieve extends Sieve  {

  /** A logger for this class. Never reassigned, hence {@code final}. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(DeterministicCorefSieve.class);

  /**
   * Option flags of this sieve. The constructors install a fresh {@link DcorefSieveOptions};
   * the flags are read by {@link #skipThisMention} and {@link #coreferent} to decide which
   * rules are applied.
   */
  public final DcorefSieveOptions flags;

  /**
   * Creates a rule-based sieve with a fresh set of option flags.
   * The superclass no-argument constructor runs implicitly before the body.
   */
  public DeterministicCorefSieve() {
    this.classifierType = ClassifierType.RULE;
    this.flags = new DcorefSieveOptions();
  }
  /**
   * Creates a rule-based sieve with a fresh set of option flags.
   *
   * @param props properties handed to the superclass constructor
   */
  public DeterministicCorefSieve(Properties props) {
    super(props);
    this.classifierType = ClassifierType.RULE;
    this.flags = new DcorefSieveOptions();
  }

  /**
   * Looks for an antecedent of {@code m} and, if this sieve accepts one, merges the cluster of
   * {@code m} into the cluster of that antecedent.
   *
   * <p>Sentences are visited from the sentence of {@code m} backwards to sentence 0. When
   * {@code maxSentDist} is not {@code -1}, the search stops at the first sentence that is more
   * than {@code maxSentDist} sentences away (all remaining sentences are even further). The
   * first candidate for which {@link #coreferent} returns {@code true} wins; the clusters are
   * merged, the absorbed cluster is removed from {@code document.corefClusters} and the method
   * returns.
   *
   * <p>When {@link #useRoleSkip()} is on, no merging happens at all: candidates that are in a
   * role-appositive relation are only recorded in {@code document.roleSet}.
   *
   * @param m        the mention to resolve
   * @param mIdx     position of {@code m}, forwarded to {@code Sieve.getOrderedAntecedents}
   * @param document the document being processed; its clusters and role set may be modified
   * @param dict     dictionaries used by the individual rules
   * @param props    properties consulted for analysis skipping and debug logging
   * @param sbLog    receives the dcoref debug log when debugging is enabled in {@code props}
   * @throws Exception propagated from {@link #coreferent}
   */
  public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception {

    // check for skip: first mention only, discourse salience
    if(!this.flags.USE_SPEAKERMATCH && !this.flags.USE_DISCOURSEMATCH && !this.flags.USE_APPOSITION && !this.flags.USE_PREDICATENOMINATIVES
        && this.skipThisMention(document, m, document.corefClusters.get(m.corefClusterID), dict)) {
      return;
    }

    Set<Mention> roleSet = document.roleSet;
    for (int sentJ = m.sentNum; sentJ >= 0; sentJ--) {
      // The distance grows monotonically as sentJ decreases, so once the limit is exceeded
      // no later iteration can qualify: stop before building the antecedent list.
      if (maxSentDist != -1 && m.sentNum - sentJ > maxSentDist) break;

      List<Mention> l = Sieve.getOrderedAntecedents(m, sentJ, mIdx, document.predictedMentions, dict);

      // TODO: do we need this?
      orderSameStartMentionsByLength(l);

      for (Mention ant : l) {
        if(skipForAnalysis(ant, m, props)) continue;

        // Skip singletons according to the singleton predictor
        // (only for non-NE mentions)
        // Recasens, de Marneffe, and Potts (NAACL 2013)
        if (m.isSingleton && m.mentionType != MentionType.PROPER && ant.isSingleton && ant.mentionType != MentionType.PROPER) continue;
        // already in the same cluster: nothing to merge
        if (m.corefClusterID == ant.corefClusterID) continue;

        // this sieve only handles the configured mention / antecedent types
        if(!mType.contains(m.mentionType) || !aType.contains(ant.mentionType)) continue;
        if(m.mentionType == MentionType.PRONOMINAL) {
          if(!matchedMentionType(m, mTypeStr)) continue;
          if(!matchedMentionType(ant, aTypeStr)) continue;
        }
        CorefCluster c1 = document.corefClusters.get(m.corefClusterID);
        CorefCluster c2 = document.corefClusters.get(ant.corefClusterID);
        assert(c1 != null);
        assert(c2 != null);

        if (this.useRoleSkip()) {
          // role-skip mode: only collect role appositives, never merge
          if (m.isRoleAppositive(ant, dict)) {
            roleSet.add(m);
          } else if (ant.isRoleAppositive(m, dict)) {
            roleSet.add(ant);
          }
          continue;
        }
        if (this.coreferent(document, c1, c2, m, ant, dict, roleSet)) {
          // print dcoref log
          if(HybridCorefProperties.debug(props)) {
            sbLog.append(HybridCorefPrinter.printErrorLogDcoref(m, ant, document, dict, mIdx, this.getClass().getName()));
          }

          // c1 (cluster of m) is absorbed into c2 (cluster of the antecedent); remember the
          // id first so the absorbed cluster can be dropped from the document afterwards
          int removeID = c1.clusterID;
          CorefCluster.mergeClusters(c2, c1);
          document.mergeIncompatibles(c2, c1);
          document.mergeAcronymCache(c2, c1);
          document.corefClusters.remove(removeID);
          return;
        }
      }
    }
  }

  /**
   * Reorders {@code l} in place so that, among mentions that share head string, start index
   * and sentence, a longer span never precedes a shorter one it was compared with.
   * Only pairs {@code (i, j)} with {@code j > i} can be swapped, so the inner scan starts
   * at {@code i + 1}.
   */
  private static void orderSameStartMentionsByLength(List<Mention> l) {
    for (int i = 0; i < l.size(); i++) {
      for (int j = i + 1; j < l.size(); j++) {
        // position i must be re-read every time: a swap below changes what is stored there
        Mention earlier = l.get(i);
        Mention later = l.get(j);
        if (earlier.headString.equals(later.headString)
            && earlier.startIndex == later.startIndex
            && earlier.sameSentence(later)
            && earlier.spanToString().length() > later.spanToString().length()) {
          Collections.swap(l, i, j);
        }
      }
    }
  }

  /** Returns the string form of this sieve's option flags. */
  public String flagsToString() {
    return this.flags.toString();
  }

  /** Returns the value of the {@code USE_ROLE_SKIP} flag of this sieve. */
  public boolean useRoleSkip() {
    return this.flags.USE_ROLE_SKIP;
  }

  /**
   * Skip this mention? (search pruning)
   *
   * <p>A mention is skipped when
   * <ul>
   *   <li>none of the role-apposition, predicate-nominative, acronym, apposition or
   *       relative-pronoun flags is set and the mention is not the first mention of its
   *       cluster, or</li>
   *   <li>it has no appositions and no predicate nominatives, starts with "a " or "an ", and
   *       exact string match is off, or</li>
   *   <li>its span is an indefinite pronoun, or starts with one followed by a space.</li>
   * </ul>
   *
   * @param document not used by this implementation
   * @param m1       the mention to test
   * @param c        the cluster {@code m1} belongs to; must not be {@code null}
   * @param dict     supplies the set of indefinite pronouns
   * @return {@code true} if no antecedent search should be done for {@code m1}
   */
  public boolean skipThisMention(Document document, Mention m1, CorefCluster c, Dictionaries dict) {
    // only do for the first mention in its cluster
    if(!flags.USE_ROLEAPPOSITION && !flags.USE_PREDICATENOMINATIVES   // CHINESE CHANGE
        && !flags.USE_ACRONYM && !flags.USE_APPOSITION && !flags.USE_RELATIVEPRONOUN
        && !c.getFirstMention().equals(m1)) {
      return true;
    }

    // computed once; every remaining rule looks at the same normalized span
    final String span = m1.lowercaseNormalizedSpanString();

    // A noun phrase starting with an indefinite article - unlikely to have an antecedent (e.g. "A commission" was set up to .... )
    if (!flags.USE_EXACTSTRINGMATCH
        && m1.appositions == null && m1.predicateNominatives == null
        && (span.startsWith("a ") || span.startsWith("an "))) {
      return true;
    }

    // An indefinite pronoun - unlikely to have an antecedent (e.g. "Some" say that... )
    if (dict.indefinitePronouns.contains(span)) {
      return true;
    }

    // A noun phrase starting with an indefinite adjective - unlikely to have an antecedent (e.g. "Another opinion" on the topic is...)
    for (String indef : dict.indefinitePronouns) {
      if (span.startsWith(indef + " ")) {
        return true;
      }
    }

    return false;
  }

  /**
   * Entity-level match consulted by {@link #coreferent} when {@code flags.USE_NAME_MATCH} is
   * set. This base implementation never matches and ignores all of its arguments.
   *
   * @return always {@code false}
   */
  public boolean checkEntityMatch(Document document, CorefCluster mentionCluster,
      CorefCluster potentialAntecedent, Dictionaries dict, Set<Mention> roleSet) {
    return false;
  }
  /**
   * Checks if two clusters are coreferent according to our sieve pass constraints.
   *
   * <p>The checks run in a fixed order: general rejections first (known incompatibility,
   * sentence distance, generic mentions, nesting), then the speaker / discourse rules, then
   * incompatibility constraints, then the match rules selected by {@link #flags}, and finally
   * the filters that can veto a tentative match. Most rules look at the representative mention
   * of {@code mentionCluster}; distance, speaker, nesting and coref-dictionary checks look at
   * {@code mention2} itself.
   *
   * <p>Side effects: rejected pairs may be recorded through {@code document.addIncompatible},
   * and the discourse rules may copy {@code speakerInfo} between the representative mention
   * and {@code ant}.
   *
   * @param document            the document both clusters belong to
   * @param mentionCluster      cluster of the mention being resolved
   * @param potentialAntecedent cluster of the candidate antecedent
   * @param mention2            the mention being resolved
   * @param ant                 the candidate antecedent mention
   * @param dict                dictionaries used by the rules
   * @param roleSet             role mentions, forwarded to the string-match rules
   * @return {@code true} if this sieve accepts the pair as coreferent
   * @throws Exception declared for the benefit of the rules and of overriding sieves
   */
  public boolean coreferent(Document document, CorefCluster mentionCluster,
      CorefCluster potentialAntecedent,
      Mention mention2,
      Mention ant,
      Dictionaries dict,
      Set<Mention> roleSet) throws Exception {

    boolean ret = false;
    Mention mention = mentionCluster.getRepresentativeMention();
    if (flags.USE_INCOMPATIBLES) {
      // Check our list of incompatible mentions and don't cluster them together
      // Allows definite no's from previous sieves to propagate down
      if (document.isIncompatible(mentionCluster, potentialAntecedent)) {
        return false;
      }
    }
    if (flags.DO_PRONOUN && Math.abs(mention2.sentNum-ant.sentNum) > 3 &&
        mention2.person!=Person.I && mention2.person!=Person.YOU) {
      return false;
    }
    if (mention2.lowercaseNormalizedSpanString().equals("this") && Math.abs(mention2.sentNum-ant.sentNum) > 3) {
      return false;
    }
    // constant-first equals: a head word without a speaker annotation must not throw
    if (mention2.person==Person.YOU && document.docType==DocType.ARTICLE &&
        "PER0".equals(mention2.headWord.get(CoreAnnotations.SpeakerAnnotation.class))) {
      return false;
    }
    if (document.conllDoc != null) {
      if (ant.generic && ant.person==Person.YOU) return false;
      if (mention2.generic) return false;
    }

    // chinese newswire contains coref nested NPs with shared headword  Chen & Ng
    if(!Locale.CHINESE.equals(lang) || document.docInfo == null || !document.docInfo.getOrDefault("DOC_ID","").contains("nw")) {
      if(mention2.insideIn(ant) || ant.insideIn(mention2)) return false;
    }

    if(flags.USE_SPEAKERMATCH) {
      String mSpeaker = mention2.headWord.get(SpeakerAnnotation.class);
      String aSpeaker = ant.headWord.get(SpeakerAnnotation.class);

      // <I> from same speaker (a missing speaker annotation counts as "not the same")
      if(mention2.person == Person.I && ant.person == Person.I) return (mSpeaker != null && mSpeaker.equals(aSpeaker));

      // <I> - speaker
      if( (mention2.person == Person.I && Integer.toString(ant.mentionID).equals(mSpeaker))
          || (ant.person == Person.I && Integer.toString(mention2.mentionID).equals(aSpeaker)) ) return true;
    }
    if(flags.USE_DISCOURSEMATCH) {
      String mString = mention.lowercaseNormalizedSpanString();
      String antString = ant.lowercaseNormalizedSpanString();

      // mention and ant both belong to the same speaker cluster
      if (mention.speakerInfo != null && mention.speakerInfo == ant.speakerInfo) {
        return true;
      }

      // (I - I) in the same speaker's quotation.
      if (mention.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString)
          && ant.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString)
          && CorefRules.entitySameSpeaker(document, mention, ant)){
        return true;
      }
      // (speaker - I)
      if ((mention.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString))
              && CorefRules.antecedentIsMentionSpeaker(document, mention, ant, dict)) {
        if (mention.speakerInfo == null && ant.speakerInfo != null) { mention.speakerInfo = ant.speakerInfo; }
        return true;
      }
      // (I - speaker)
      if ((ant.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString))
              && CorefRules.antecedentIsMentionSpeaker(document, ant, mention, dict)) {
        if (ant.speakerInfo == null && mention.speakerInfo != null) { ant.speakerInfo = mention.speakerInfo; }
        return true;
      }
      // Can be iffy if more than two speakers... but still should be okay most of the time
      if (dict.secondPersonPronouns.contains(mString)
          && dict.secondPersonPronouns.contains(antString)
          && CorefRules.entitySameSpeaker(document, mention, ant)) {
        return true;
      }
      // previous I - you or previous you - I in two person conversation
      if (((mention.person==Person.I && ant.person==Person.YOU
          || (mention.person==Person.YOU && ant.person==Person.I))
          && (mention.headWord.get(CoreAnnotations.UtteranceAnnotation.class)-ant.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1)
          && document.docType==DocType.CONVERSATION)) {
        return true;
      }
      if (dict.reflexivePronouns.contains(mention.headString) && CorefRules.entitySubjectObject(mention, ant)){
        return true;
      }
    }
    if (!flags.USE_EXACTSTRINGMATCH && !flags.USE_RELAXED_EXACTSTRINGMATCH
        && !flags.USE_APPOSITION && !flags.USE_WORDS_INCLUSION) {
      for(Mention m : mentionCluster.getCorefMentions()) {
        for(Mention a : potentialAntecedent.getCorefMentions()){
          // angelx - not sure about the logic here, disable (code was also refactored from original)
          // vv gabor - re-enabled code (seems to improve performance) vv
          if(m.person!=Person.I && a.person!=Person.I &&
            (CorefRules.antecedentIsMentionSpeaker(document, m, a, dict) || CorefRules.antecedentIsMentionSpeaker(document, a, m, dict))) {
            document.addIncompatible(m, a);
            return false;
          }
          // ^^ end block of code in question ^^
          int dist = Math.abs(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class) - a.headWord.get(CoreAnnotations.UtteranceAnnotation.class));
          // adjacent utterances by different speakers: the same first/second person
          // pronoun cannot refer to the same entity
          if(document.docType!=DocType.ARTICLE && dist==1 && !CorefRules.entitySameSpeaker(document, m, a)) {
            if(m.person==Person.I && a.person==Person.I) {
              document.addIncompatible(m, a);
              return false;
            }
            if(m.person==Person.YOU && a.person==Person.YOU) {
              document.addIncompatible(m, a);
              return false;
            }
            // This is weak since we can refer to both speakers
            if(m.person==Person.WE && a.person==Person.WE) {
              document.addIncompatible(m, a);
              return false;
            }
          }
        }
      }
      if(document.docType==DocType.ARTICLE) {
        for(Mention m : mentionCluster.getCorefMentions()) {
          for(Mention a : potentialAntecedent.getCorefMentions()){
            if(CorefRules.entitySubjectObject(m, a)) {
              document.addIncompatible(m, a);
              return false;
            }
          }
        }
      }
    }

    // Incompatibility constraints - do before match checks
    if(flags.USE_iwithini && CorefRules.entityIWithinI(mention, ant, dict)) {
      document.addIncompatible(mention, ant);
      return false;
    }

    // Match checks
    if(flags.USE_EXACTSTRINGMATCH && CorefRules.entityExactStringMatch(mention, ant, dict, roleSet)){
      return true;
    }
    if (flags.USE_NAME_MATCH && checkEntityMatch(document, mentionCluster, potentialAntecedent, dict, roleSet)) {
      ret = true;
    }

    if(flags.USE_RELAXED_EXACTSTRINGMATCH && CorefRules.entityRelaxedExactStringMatch(mentionCluster, potentialAntecedent, mention, ant, dict, roleSet)){
      return true;
    }
    if(flags.USE_APPOSITION && CorefRules.entityIsApposition(mentionCluster, potentialAntecedent, mention, ant)) {
      return true;
    }
    if(flags.USE_PREDICATENOMINATIVES && CorefRules.entityIsPredicateNominatives(mentionCluster, potentialAntecedent, mention, ant)) {
      return true;
    }

    if(flags.USE_ACRONYM && CorefRules.entityIsAcronym(document, mentionCluster, potentialAntecedent)) {
      return true;
    }
    if(flags.USE_RELATIVEPRONOUN && CorefRules.entityIsRelativePronoun(mention, ant)){
      return true;
    }
    if(flags.USE_DEMONYM && mention.isDemonym(ant, dict)){
      return true;
    }

    if(flags.USE_ROLEAPPOSITION){
      // for Chinese this deliberately resets any tentative match made above
      if(Locale.CHINESE.equals(lang)) {
        ret = false;
      } else if(CorefRules.entityIsRoleAppositive(mentionCluster, potentialAntecedent, mention, ant, dict)) {
        ret = true;
      }
    }
    if(flags.USE_INCLUSION_HEADMATCH && CorefRules.entityHeadsAgree(mentionCluster, potentialAntecedent, mention, ant, dict)){
      ret = true;
    }
    if(flags.USE_RELAXED_HEADMATCH && CorefRules.entityRelaxedHeadsAgreeBetweenMentions(mentionCluster, potentialAntecedent, mention, ant) ){
      ret = true;
    }

    // Filters: each of these can veto a tentative match
    if(flags.USE_WORDS_INCLUSION && ret && ! CorefRules.entityWordsIncluded(mentionCluster, potentialAntecedent, mention, ant)) {
      return false;
    }

    if(flags.USE_INCOMPATIBLE_MODIFIER && ret && CorefRules.entityHaveIncompatibleModifier(mentionCluster, potentialAntecedent)) {
      return false;
    }
    if(flags.USE_PROPERHEAD_AT_LAST && ret && !CorefRules.entitySameProperHeadLastWord(mentionCluster, potentialAntecedent, mention, ant)) {
      return false;
    }
    if(flags.USE_ATTRIBUTES_AGREE && !CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent)) {
      return false;
    }
    if(flags.USE_DIFFERENT_LOCATION
        && CorefRules.entityHaveDifferentLocation(mention, ant, dict)) {
      return false;
    }
    if(flags.USE_NUMBER_IN_MENTION
        && CorefRules.entityNumberInLaterMention(mention, ant)) {
      return false;
    }

    if(flags.USE_DISTANCE && CorefRules.entityTokenDistance(mention2, ant)){
      return false;
    }

    if(flags.USE_COREF_DICT){

      // Head match
      if(ant.headWord.lemma().equals(mention2.headWord.lemma())) return false;

      // Constraint: ignore pairs commonNoun - properNoun
      if(ant.mentionType != MentionType.PROPER &&
         ( mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")
           || !mention2.headWord.word().substring(1).equals(mention2.headWord.word().substring(1).toLowerCase()) ) ) return false;

      // Constraint: ignore plurals
      if(ant.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS")
          && mention2.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS")) return false;

      // Constraint: ignore mentions with indefinite determiners
      if(dict.indefinitePronouns.contains(ant.originalSpan.get(0).lemma())
          || dict.indefinitePronouns.contains(mention2.originalSpan.get(0).lemma())) return false;

      // Constraint: ignore coordinated mentions
      if(ant.isCoordinated() || mention2.isCoordinated()) return false;

      // Constraint: context incompatibility
      if(CorefRules.contextIncompatible(mention2, ant, dict)) return false;

      // Constraint: sentence context incompatibility when the mentions are common nouns
      if(CorefRules.sentenceContextIncompatible(mention2, ant, dict)) return false;

      if(CorefRules.entityClusterAllCorefDictionary(mentionCluster, potentialAntecedent, dict, 1, 8)) return true;
      if(CorefRules.entityCorefDictionary(mention, ant, dict, 2, 2)) return true;
      if(CorefRules.entityCorefDictionary(mention, ant, dict, 3, 2)) return true;
      if(CorefRules.entityCorefDictionary(mention, ant, dict, 4, 2)) return true;
    }

    if(flags.DO_PRONOUN){
      // use mention2 itself when it is a predicate nominative of the representative mention
      Mention m;
      if (mention.predicateNominatives!=null && mention.predicateNominatives.contains(mention2)) {
        m = mention2;
      } else {
        m = mention;
      }

      boolean mIsPronoun = (m.isPronominal() || dict.allPronouns.contains(m.toString()));
      boolean attrAgree = HybridCorefProperties.useDefaultPronounAgreement(props)?
          CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent):
            CorefRules.entityAttributesAgree(mentionCluster, potentialAntecedent, lang);

      if(mIsPronoun && attrAgree){

        if(dict.demonymSet.contains(ant.lowercaseNormalizedSpanString()) && dict.notOrganizationPRP.contains(m.headString)){
          document.addIncompatible(m, ant);
          return false;
        }
        if(CorefRules.entityPersonDisagree(document, mentionCluster, potentialAntecedent, dict)){
          document.addIncompatible(m, ant);
          return false;
        }
        return true;
      }
    }

    if(flags.USE_CHINESE_HEAD_MATCH) {
      // identity comparison: both mentions must share the very same head word object
      if (mention2.headWord == ant.headWord && mention2.insideIn(ant)) {
        // NOTE(review): this call only guards a disabled debug message; it is kept because
        // whether document.isCoref is free of side effects cannot be told from this file.
        if(!document.isCoref(mention2, ant)) {
          // TODO: exclude conjunction
          // log.info("error in chinese head match: "+mention2.spanToString()+"\t"+ant.spanToString());
        }
        return true;
      }
    }

    return ret;
  }

  /**
   * Orders the antecedents for the given mention (m1).
   *
   * <p>For an earlier sentence the result is a copy of that sentence's mention list. For the
   * mention's own sentence the result starts as the first {@code m1Position} entries of
   * {@code orderedMentions}; it is re-sorted for pronoun matching when {@code flags.DO_PRONOUN}
   * is set and {@code m1} is pronominal, and reversed when the span of {@code m1} is a
   * relative pronoun.
   *
   * @param antecedentSentence        index of the sentence the candidates are taken from
   * @param mySentence                index of the sentence containing {@code m1}
   * @param orderedMentions           mentions of the sentence of {@code m1}, in order
   * @param orderedMentionsBySentence mentions of every sentence, indexed by sentence
   * @param m1                        the mention whose antecedents are ordered
   * @param m1Position                position of {@code m1} within {@code orderedMentions}
   * @param corefClusters             not used by this implementation
   * @param dict                      supplies the set of relative pronouns
   * @return An ordering of potential antecedents depending on same/different sentence, etc.
   */
  public List<Mention> getOrderedAntecedents(
      int antecedentSentence,
      int mySentence,
      List<Mention> orderedMentions,
      List<List<Mention>> orderedMentionsBySentence,
      Mention m1,
      int m1Position,
      Map<Integer, CorefCluster> corefClusters,
      Dictionaries dict) {

    // previous sentence: all of its mentions, in their given order
    if (antecedentSentence != mySentence) {
      return new ArrayList<>(orderedMentionsBySentence.get(antecedentSentence));
    }

    // same sentence: only the mentions listed before m1
    List<Mention> candidates = new ArrayList<>(orderedMentions.subList(0, m1Position));
    if (flags.DO_PRONOUN && m1.isPronominal()) {    // TODO
      candidates = sortMentionsForPronoun(candidates, m1);
    }
    if (dict.relativePronouns.contains(m1.spanToString())) {
      Collections.reverse(candidates);
    }
    return candidates;
  }

  /**
   * Divides a sentence into clauses and sorts the antecedents for pronoun matching.
   *
   * <p>Walks from the parent of the subtree of {@code m1} up to the root of its parse tree.
   * At every node whose label starts with "S", the mentions of {@code l} dominated by that node
   * and not collected yet are appended, in the order they have in {@code l}. Mentions in the
   * innermost clauses therefore come first.
   *
   * @param l  the candidate antecedents
   * @param m1 the mention the candidates are sorted for
   * @return the re-ordered candidates in a new list, or {@code l} itself when {@code m1} has no
   *         parse tree or subtree, or when not every candidate could be placed
   */
  private static List<Mention> sortMentionsForPronoun(List<Mention> l, Mention m1) {
    Tree tree = m1.contextParseTree;
    Tree current = m1.mentionSubTree;
    if(tree==null || current==null) return l;

    List<Mention> sorted = new ArrayList<>();
    // ancestor(1, tree) yields null at the root (or when the subtree is not under tree), so
    // testing the node before using it also covers a mention whose subtree has no parent
    for (Tree node = current.ancestor(1, tree); node != null; node = node.ancestor(1, tree)) {
      if(node.label().value().startsWith("S")){
        for(Mention m : l){
          if(!sorted.contains(m) && node.dominates(m.mentionSubTree)) sorted.add(m);
        }
      }
    }

    // fall back to the given order unless every candidate was placed
    return (l.size() == sorted.size()) ? sorted : l;
  }

}