// CorefRules.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.coref;

import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.data.Semantics;
import edu.stanford.nlp.coref.data.SpeakerInfo;
import edu.stanford.nlp.coref.data.Dictionaries.Animacy;
import edu.stanford.nlp.coref.data.Dictionaries.Gender;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Dictionaries.Number;
import edu.stanford.nlp.coref.data.Dictionaries.Person;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.math.NumberMatchingRegex;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.IntCounter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.Sets;


/**
 * Rules for coref system (mention detection, entity coref, event coref)
 * The name of the method for mention detection starts with detection,
 * for entity coref starts with entity, and for event coref starts with event.
 *
 * @author heeyoung, recasens
 */
public class CorefRules {
  /**
   * Tests whether each of the two clusters contains at least one mention
   * whose type is {@code MentionType.PROPER}.
   *
   * @param mentionCluster cluster of the mention being resolved
   * @param potentialAntecedent candidate antecedent cluster
   * @return true only when both clusters hold a PROPER mention
   */
  public static boolean entityBothHaveProper(CorefCluster mentionCluster,
      CorefCluster potentialAntecedent) {
    boolean properInMentionCluster = false;
    for (Mention member : mentionCluster.corefMentions) {
      if (member.mentionType == MentionType.PROPER) {
        properInMentionCluster = true;
        break;
      }
    }

    boolean properInAntecedentCluster = false;
    for (Mention member : potentialAntecedent.corefMentions) {
      if (member.mentionType == MentionType.PROPER) {
        properInAntecedentCluster = true;
        break;
      }
    }

    return properInMentionCluster && properInAntecedentCluster;
  }
  /**
   * Cluster-level form of {@link #entitySameProperHeadLastWord(Mention, Mention)}:
   * succeeds as soon as any pairing of a mention from {@code mentionCluster}
   * with a mention from {@code potentialAntecedent} passes the mention-level check.
   * The {@code mention} and {@code ant} arguments are not read here.
   */
  public static boolean entitySameProperHeadLastWord(CorefCluster mentionCluster,
      CorefCluster potentialAntecedent, Mention mention, Mention ant) {
    for (Mention clusterMember : mentionCluster.getCorefMentions()) {
      for (Mention antecedentMember : potentialAntecedent.getCorefMentions()) {
        boolean sameProperHead = entitySameProperHeadLastWord(clusterMember, antecedentMember);
        if (sameProperHead) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Asks the {@code semantics.wordnet} object, via reflection, whether the
   * representative mentions of the two clusters are aliases of each other.
   * Only applies when both representative mentions are PROPER; otherwise
   * returns false without any reflective call. {@code dict} is not read here.
   *
   * @throws Exception if the reflective lookup or invocation of {@code alias} fails
   */
  public static boolean entityAlias(CorefCluster mentionCluster, CorefCluster potentialAntecedent,
      Semantics semantics, Dictionaries dict) throws Exception {

    Mention mention = mentionCluster.getRepresentativeMention();
    Mention antecedent = potentialAntecedent.getRepresentativeMention();

    boolean bothProper = mention.mentionType == MentionType.PROPER
        && antecedent.mentionType == MentionType.PROPER;
    if (!bothProper) {
      return false;
    }

    // The "alias" method is resolved by name on the runtime class of the wordnet object.
    Method aliasMethod = semantics.wordnet.getClass().getMethod("alias", Mention.class, Mention.class);
    return (Boolean) aliasMethod.invoke(semantics.wordnet, mention, antecedent);
  }
  /**
   * Cluster-level i-within-i test: true as soon as any mention of
   * {@code mentionCluster} and any mention of {@code potentialAntecedent}
   * satisfy {@link #entityIWithinI(Mention, Mention, Dictionaries)}.
   */
  public static boolean entityIWithinI(CorefCluster mentionCluster,
      CorefCluster potentialAntecedent, Dictionaries dict) {
    for (Mention clusterMember : mentionCluster.getCorefMentions()) {
      for (Mention antecedentMember : potentialAntecedent.getCorefMentions()) {
        boolean nested = entityIWithinI(clusterMember, antecedentMember, dict);
        if (nested) {
          return true;
        }
      }
    }
    return false;
  }
  /**
   * Cluster-level person check: reports a disagreement when any mention pair
   * drawn from the two clusters disagrees according to
   * {@link #entityPersonDisagree(Document, Mention, Mention, Dictionaries)}.
   */
  public static boolean entityPersonDisagree(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict){
    boolean anyPairDisagrees = false;
    for (Mention clusterMember : mentionCluster.getCorefMentions()) {
      for (Mention antecedentMember : potentialAntecedent.getCorefMentions()) {
        if (entityPersonDisagree(document, clusterMember, antecedentMember, dict)) {
          anyPairDisagrees = true;
          // Leaves the inner loop only; remaining mentions of mentionCluster are still visited.
          break;
        }
      }
    }
    return anyPairDisagrees;
  }

  /** Lower-case tokens that are dropped from the mention cluster's words before the inclusion test. */
  private static final List<String> entityWordsToExclude =
          Arrays.asList("the", "this", "mr.", "miss", "mrs.", "dr.", "ms.", "inc.", "ltd.", "corp.", "'s");

  /**
   * Word inclusion except stop words: true when every word of
   * {@code mentionCluster} that survives removal of the excluded tokens and of
   * the mention's own (lower-cased) head string is also a word of
   * {@code potentialAntecedent}. {@code ant} is not read here.
   */
  public static boolean entityWordsIncluded(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention mention, Mention ant) {
    // Work on a copy so the cluster's own word set is left untouched.
    Set<String> remainingWords = Generics.newHashSet(mentionCluster.words);
    remainingWords.removeAll(entityWordsToExclude);
    remainingWords.remove(mention.headString.toLowerCase());
    return potentialAntecedent.words.containsAll(remainingWords);
  }

  /**
   * Compatible modifier only: true when any mention pair drawn from the two
   * clusters is flagged by {@link #entityHaveIncompatibleModifier(Mention, Mention)}.
   */
  public static boolean entityHaveIncompatibleModifier(CorefCluster mentionCluster, CorefCluster potentialAntecedent) {
    for (Mention clusterMember : mentionCluster.corefMentions) {
      for (Mention antecedentMember : potentialAntecedent.corefMentions) {
        boolean incompatible = entityHaveIncompatibleModifier(clusterMember, antecedentMember);
        if (incompatible) {
          return true;
        }
      }
    }
    return false;
  }
  /**
   * True when the clusters' attributes agree and one of the two mentions is a
   * role appositive of the other (checked in both directions).
   */
  public static boolean entityIsRoleAppositive(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m1, Mention m2, Dictionaries dict) {
    boolean attributesAgree = entityAttributesAgree(mentionCluster, potentialAntecedent);
    if (!attributesAgree) {
      return false;
    }
    if (m1.isRoleAppositive(m2, dict)) {
      return true;
    }
    return m2.isRoleAppositive(m1, dict);
  }
  /** True when either mention is a relative pronoun of the other (checked in both directions). */
  public static boolean entityIsRelativePronoun(Mention m1, Mention m2) {
    if (m1.isRelativePronoun(m2)) {
      return true;
    }
    return m2.isRelativePronoun(m1);
  }

  /**
   * Decides whether some non-pronominal mention of {@code mentionCluster} and
   * some mention of {@code potentialAntecedent} stand in an acronym relation
   * (see {@link #isAcronym(List, List)}). The answer is memoised in
   * {@code document.acronymCache} under the ordered pair of cluster ids.
   */
  public static boolean entityIsAcronym(Document document, CorefCluster mentionCluster, CorefCluster potentialAntecedent) {
    int smallerId = Math.min(mentionCluster.clusterID, potentialAntecedent.clusterID);
    int largerId = Math.max(mentionCluster.clusterID, potentialAntecedent.clusterID);
    Pair<Integer, Integer> cacheKey = Pair.makePair(smallerId, largerId);

    if (!document.acronymCache.containsKey(cacheKey)) {
      boolean acronymFound = false;
      for (Mention clusterMember : mentionCluster.corefMentions) {
        if (clusterMember.isPronominal()) {
          continue;
        }
        for (Mention antecedentMember : potentialAntecedent.corefMentions) {
          if (isAcronym(clusterMember.originalSpan, antecedentMember.originalSpan)) {
            acronymFound = true;
          }
        }
      }
      document.acronymCache.put(cacheKey, acronymFound);
    }
    return document.acronymCache.get(cacheKey);
  }

  /**
   * Tests whether one token span is an acronym of the other.
   *
   * <p>At least one span must be a single token. The "shorter" span (fewer
   * tokens, or the shorter word when both are single tokens) supplies the
   * candidate acronym, which must consist solely of ASCII capitals
   * {@code A}-{@code Z}. The ASCII capitals of the longer span, read left to
   * right, must spell exactly that acronym, and no token of the longer span
   * may contain the acronym as a substring.
   *
   * <p>An empty span on either side never matches. This is now an explicit
   * guard instead of relying on a sentinel string and on the shorter span
   * aliasing the longer one.
   *
   * @param first tokens of one mention
   * @param second tokens of the other mention
   * @return true if one span is an acronym of the other
   */
  public static boolean isAcronym(List<CoreLabel> first, List<CoreLabel> second) {
    if (first.isEmpty() || second.isEmpty()) {
      return false;
    }
    if (first.size() > 1 && second.size() > 1) {
      return false;
    }

    List<CoreLabel> longer;
    List<CoreLabel> shorter;
    if (first.size() == second.size()) {
      // Both spans are single tokens here: the longer word is the expansion candidate.
      String firstWord = first.get(0).get(CoreAnnotations.TextAnnotation.class);
      String secondWord = second.get(0).get(CoreAnnotations.TextAnnotation.class);
      boolean firstIsLonger = firstWord.length() > secondWord.length();
      longer = firstIsLonger ? first : second;
      shorter = firstIsLonger ? second : first;
    } else {
      boolean firstIsLonger = first.size() > second.size();
      longer = firstIsLonger ? first : second;
      shorter = firstIsLonger ? second : first;
    }

    String acronym = shorter.get(0).get(CoreAnnotations.TextAnnotation.class);
    // This check is not strictly necessary, but it saves a chunk of
    // time iterating through the text of the longer mention
    for (int i = 0; i < acronym.length(); ++i) {
      if (!isAsciiUpperCase(acronym.charAt(i))) {
        return false;
      }
    }

    int acronymPos = 0;
    for (CoreLabel token : longer) {
      String word = token.get(CoreAnnotations.TextAnnotation.class);
      for (int charNum = 0; charNum < word.length(); ++charNum) {
        char c = word.charAt(charNum);
        if (!isAsciiUpperCase(c)) {
          continue;
        }
        // Too many capitals in the longer mention, or a capital that does not line up.
        if (acronymPos >= acronym.length() || acronym.charAt(acronymPos) != c) {
          return false;
        }
        ++acronymPos;
      }
    }
    if (acronymPos != acronym.length()) {
      return false;
    }

    // A longer span that literally contains the acronym is not its expansion.
    for (CoreLabel token : longer) {
      if (token.get(CoreAnnotations.TextAnnotation.class).contains(acronym)) {
        return false;
      }
    }
    return true;
  }

  /** True for the ASCII capitals 'A' through 'Z' only. */
  private static boolean isAsciiUpperCase(char c) {
    return c >= 'A' && c <= 'Z';
  }

  /**
   * True when the clusters' attributes agree, neither mention's index range
   * encloses the other's, and one mention is a predicate nominative of the
   * other (checked in both directions).
   */
  public static boolean entityIsPredicateNominatives(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m1, Mention m2) {
    if (!entityAttributesAgree(mentionCluster, potentialAntecedent)) {
      return false;
    }

    boolean m1EnclosesM2 = m1.startIndex <= m2.startIndex && m1.endIndex >= m2.endIndex;
    boolean m2EnclosesM1 = m1.startIndex >= m2.startIndex && m1.endIndex <= m2.endIndex;
    if (m1EnclosesM2 || m2EnclosesM1) {
      return false;
    }

    if (m1.isPredicateNominatives(m2)) {
      return true;
    }
    return m2.isPredicateNominatives(m1);
  }

  /**
   * True when one mention is in apposition to the other (either direction),
   * provided the clusters' attributes agree, the two mentions are not both
   * PROPER, and {@code m1} does not carry the NER string "LOCATION".
   */
  public static boolean entityIsApposition(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m1, Mention m2) {
    boolean ruledOut = !entityAttributesAgree(mentionCluster, potentialAntecedent)
        || (m1.mentionType == MentionType.PROPER && m2.mentionType == MentionType.PROPER)
        || m1.nerString.equals("LOCATION");
    if (ruledOut) {
      return false;
    }
    if (m1.isApposition(m2)) {
      return true;
    }
    return m2.isApposition(m1);
  }

  /**
   * Attribute agreement with gender taken into account; delegates to
   * {@link #entityAttributesAgree(CorefCluster, CorefCluster, boolean)}
   * with {@code ignoreGender} set to false.
   */
  public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent){
    final boolean ignoreGender = false;
    return entityAttributesAgree(mentionCluster, potentialAntecedent, ignoreGender);
  }

  /** Values treated as "unknown" when comparing number attributes. */
  private static final List<Number> UNKNOWN_NUMBER_VALUES = Arrays.asList(Number.UNKNOWN);
  /** Values treated as "unknown" when comparing gender attributes. */
  private static final List<Gender> UNKNOWN_GENDER_VALUES = Arrays.asList(Gender.UNKNOWN);
  /** Values treated as "unknown" when comparing animacy attributes. */
  private static final List<Animacy> UNKNOWN_ANIMACY_VALUES = Arrays.asList(Animacy.UNKNOWN);
  /** NER strings treated as "unknown" when comparing named-entity types. */
  private static final List<String> UNKNOWN_NER_VALUES = Arrays.asList("O", "MISC");

  /**
   * Checks that the two clusters do not conflict on number, gender, animacy
   * or NE type. An attribute conflicts only when <em>each</em> cluster
   * carries a known value that the other cluster lacks; a cluster that
   * contains an unknown value for an attribute can never be contradicted on it.
   *
   * @param mentionCluster cluster of the mention being resolved
   * @param potentialAntecedent candidate antecedent cluster
   * @param ignoreGender when true the gender comparison is skipped
   * @return false if any compared attribute conflicts, true otherwise
   */
  public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent, boolean ignoreGender){
    // number
    if (attributeValuesConflict(mentionCluster.numbers, potentialAntecedent.numbers, UNKNOWN_NUMBER_VALUES)) {
      return false;
    }
    // gender
    if (!ignoreGender
        && attributeValuesConflict(mentionCluster.genders, potentialAntecedent.genders, UNKNOWN_GENDER_VALUES)) {
      return false;
    }
    // animacy
    if (attributeValuesConflict(mentionCluster.animacies, potentialAntecedent.animacies, UNKNOWN_ANIMACY_VALUES)) {
      return false;
    }
    // NE type
    return !attributeValuesConflict(mentionCluster.nerStrings, potentialAntecedent.nerStrings, UNKNOWN_NER_VALUES);
  }

  /** True when each side has a known value the other side lacks (see hasUnmatchedKnownValue). */
  private static <E> boolean attributeValuesConflict(Set<E> mentionValues, Set<E> antecedentValues, List<E> unknownValues) {
    return hasUnmatchedKnownValue(antecedentValues, mentionValues, unknownValues)
        && hasUnmatchedKnownValue(mentionValues, antecedentValues, unknownValues);
  }

  /**
   * True when {@code target} contains none of the unknown values and
   * {@code source} holds some non-unknown value that is absent from {@code target}.
   */
  private static <E> boolean hasUnmatchedKnownValue(Set<E> source, Set<E> target, List<E> unknownValues) {
    for (E unknown : unknownValues) {
      if (target.contains(unknown)) {
        // An unknown on the target side makes it compatible with anything.
        return false;
      }
    }
    for (E value : source) {
      if (!unknownValues.contains(value) && !target.contains(value)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Two attribute sets disagree when their intersection is smaller than the
   * smaller of the two sets, i.e. when both sets hold elements the other lacks.
   */
  private static <E> boolean attributeSetDisagree(Set<E> s1,Set<E> s2){
    int sharedCount = Sets.intersection(s1, s2).size();
    int smallerSize = Math.min(s1.size(), s2.size());
    return sharedCount < smallerSize;
  }

  /**
   * Removes every element of {@code unknown} from {@code attrs}, in place,
   * but only when {@code attrs} has more elements than {@code unknown}.
   * NOTE(review): the guard compares sizes only, so a set holding one unknown
   * and one known value is left unpruned when {@code unknown} has two
   * elements - confirm this is intended.
   */
  private static <E> void pruneAttributes(Set<E> attrs, Set<E> unknown) {
    if (attrs.size() <= unknown.size()) {
      return;
    }
    attrs.removeAll(unknown);
  }

  /**
   * Removes the single {@code unknown} value from {@code attrs}, in place,
   * when the set holds more than one element; a set of size zero or one is
   * left as it is.
   */
  private static <E> void pruneAttributes(Set<E> attrs, E unknown) {
    if (attrs.size() <= 1) {
      return;
    }
    attrs.remove(unknown);
  }

  /** NER strings that carry no type information. */
  private static final Set<String> UNKNOWN_NER = new HashSet<>(Arrays.asList("MISC","O"));

  /**
   * Attribute agreement used for Chinese. First prunes unknown values from
   * the number, gender, animacy and NER sets of both clusters - the sets are
   * handed to {@code pruneAttributes} directly, so the clusters themselves
   * are modified - and then requires that none of the four attribute pairs
   * disagrees according to {@code attributeSetDisagree}.
   */
  private static boolean entityAttributesAgreeChinese(CorefCluster mentionCluster, CorefCluster potentialAntecedent){
    // Prune the mention cluster first, then the antecedent cluster.
    for (CorefCluster cluster : new CorefCluster[] { mentionCluster, potentialAntecedent }) {
      pruneAttributes(cluster.numbers, Number.UNKNOWN);
      pruneAttributes(cluster.genders, Gender.UNKNOWN);
      pruneAttributes(cluster.animacies, Animacy.UNKNOWN);
      pruneAttributes(cluster.nerStrings, UNKNOWN_NER);
    }

    boolean anyDisagreement =
        attributeSetDisagree(mentionCluster.numbers, potentialAntecedent.numbers)
        || attributeSetDisagree(mentionCluster.genders, potentialAntecedent.genders)
        || attributeSetDisagree(mentionCluster.animacies, potentialAntecedent.animacies)
        || attributeSetDisagree(mentionCluster.nerStrings, potentialAntecedent.nerStrings);
    return !anyDisagreement;
  }

  /**
   * Language-aware attribute agreement. When {@code lang} equals
   * {@link Locale#CHINESE} the Chinese-specific rule is applied; for any
   * other locale, including {@code null}, the default rule is used.
   *
   * <p>The locale is compared with {@code equals} rather than by reference,
   * so an equal {@code Locale} that is a different object from the
   * {@code Locale.CHINESE} constant is still recognised as Chinese.
   *
   * @param mentionCluster cluster of the mention being resolved
   * @param potentialAntecedent candidate antecedent cluster
   * @param lang language of the document; may be null
   * @return true if the clusters' attributes are compatible
   */
  public static boolean entityAttributesAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Locale lang) {
    if (Locale.CHINESE.equals(lang)) {
      return entityAttributesAgreeChinese(mentionCluster, potentialAntecedent);
    }
    return entityAttributesAgree(mentionCluster, potentialAntecedent);
  }

  /**
   * True when neither mention is pronominal and {@code m.headsAgree(ant)}
   * holds. The two cluster arguments are not read here.
   */
  public static boolean entityRelaxedHeadsAgreeBetweenMentions(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m, Mention ant) {
    boolean involvesPronoun = m.isPronominal() || ant.isPronominal();
    return !involvesPronoun && m.headsAgree(ant);
  }

  /**
   * True when some mention of {@code potentialAntecedent} has exactly the
   * same head string as {@code m}. Never true when {@code m} or {@code ant}
   * is pronominal or when either one's lower-cased normalized span is listed
   * in {@code dict.allPronouns}. {@code mentionCluster} is not read here.
   */
  public static boolean entityHeadsAgree(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Mention m, Mention ant, Dictionaries dict) {
    if (m.isPronominal() || ant.isPronominal()) {
      return false;
    }
    if (dict.allPronouns.contains(m.lowercaseNormalizedSpanString())
        || dict.allPronouns.contains(ant.lowercaseNormalizedSpanString())) {
      return false;
    }

    boolean sameHeadSeen = false;
    for (Mention antecedentMember : potentialAntecedent.corefMentions) {
      // Non-short-circuit OR: every member is still compared.
      sameHeadSeen |= antecedentMember.headString.equals(m.headString);
    }
    return sameHeadSeen;
  }
  /**
   * Cluster-level exact string match. Compares the lower-cased normalized
   * span of every eligible mention in {@code mentionCluster} with that of
   * every eligible mention in {@code potentialAntecedent}; spans match when
   * they are equal or differ only by a trailing {@code " 's"}. Pronominal
   * mentions and spans listed in {@code dict.allPronouns} are skipped.
   * Returns false immediately, regardless of earlier matches, when a mention
   * of {@code mentionCluster} belongs to {@code roleSet}.
   */
  public static boolean entityExactStringMatch(CorefCluster mentionCluster, CorefCluster potentialAntecedent, Dictionaries dict, Set<Mention> roleSet){
    boolean anyMatch = false;
    for (Mention clusterMember : mentionCluster.corefMentions) {
      if (roleSet != null && roleSet.contains(clusterMember)) {
        return false;
      }
      if (clusterMember.isPronominal()) {
        continue;
      }
      String memberSpan = clusterMember.lowercaseNormalizedSpanString();
      if (dict.allPronouns.contains(memberSpan)) {
        continue;
      }

      for (Mention antecedentMember : potentialAntecedent.corefMentions) {
        if (antecedentMember.isPronominal()) {
          continue;
        }
        String antecedentSpan = antecedentMember.lowercaseNormalizedSpanString();
        if (dict.allPronouns.contains(antecedentSpan)) {
          continue;
        }
        boolean identical = memberSpan.equals(antecedentSpan);
        boolean possessiveVariant = memberSpan.equals(antecedentSpan + " 's")
            || antecedentSpan.equals(memberSpan + " 's");
        if (identical || possessiveVariant) {
          anyMatch = true;
        }
      }
    }
    return anyMatch;
  }
  /**
   * Mention-level exact string match: the lower-cased normalized spans are
   * equal or differ only by a trailing {@code " 's"}. Always false when
   * {@code m} is in {@code roleSet}, when either mention is pronominal, or
   * when either span is listed in {@code dict.allPronouns}.
   */
  public static boolean entityExactStringMatch(Mention m, Mention ant, Dictionaries dict, Set<Mention> roleSet){
    if (roleSet != null && roleSet.contains(m)) {
      return false;
    }
    if (m.isPronominal() || ant.isPronominal()) {
      return false;
    }

    String mentionSpan = m.lowercaseNormalizedSpanString();
    if (dict.allPronouns.contains(mentionSpan)) {
      return false;
    }
    String antecedentSpan = ant.lowercaseNormalizedSpanString();
    if (dict.allPronouns.contains(antecedentSpan)) {
      return false;
    }

    return mentionSpan.equals(antecedentSpan)
        || mentionSpan.equals(antecedentSpan + " 's")
        || antecedentSpan.equals(mentionSpan + " 's");
  }

  /**
   * Exact string match that ignores the phrase following the head (only for proper nouns).
   * Meant for errors like {@literal "[Mr. Bickford] <- [Mr. Bickford , an 18-year mediation veteran] }".
   * Both mentions are reduced with {@code removePhraseAfterHead()}; the results
   * match when they are non-empty and equal, or differ only by a trailing {@code " 's"}.
   * Always false for LIST mentions, pronominal mentions, spans listed in
   * {@code dict.allPronouns}, or when {@code mention} is in {@code roleSet}.
   * The two cluster arguments are not read here.
   */
  public static boolean entityRelaxedExactStringMatch(
      CorefCluster mentionCluster,
      CorefCluster potentialAntecedent,
      Mention mention,
      Mention ant,
      Dictionaries dict,
      Set<Mention> roleSet){
    if (roleSet != null && roleSet.contains(mention)) {
      return false;
    }
    if (mention.mentionType == MentionType.LIST || ant.mentionType == MentionType.LIST) {
      return false;
    }
    if (mention.isPronominal() || ant.isPronominal()) {
      return false;
    }
    if (dict.allPronouns.contains(mention.lowercaseNormalizedSpanString())
        || dict.allPronouns.contains(ant.lowercaseNormalizedSpanString())) {
      return false;
    }

    String trimmedMention = mention.removePhraseAfterHead();
    String trimmedAntecedent = ant.removePhraseAfterHead();
    if (trimmedMention.isEmpty() || trimmedAntecedent.isEmpty()) {
      return false;
    }

    return trimmedMention.equals(trimmedAntecedent)
        || trimmedMention.equals(trimmedAntecedent + " 's")
        || trimmedAntecedent.equals(trimmedMention + " 's");
  }

  /**
   * Check whether two mentions are in i-within-i relation (Chomsky, 1981):
   * one mention is included in the other, and the pair is not related by
   * apposition, relative pronoun or role apposition in either direction.
   */
  public static boolean entityIWithinI(Mention m1, Mention m2, Dictionaries dict){
    // Constructions in which nesting is legitimate are exempt.
    boolean exempt = m1.isApposition(m2) || m2.isApposition(m1)
        || m1.isRelativePronoun(m2) || m2.isRelativePronoun(m1)
        || m1.isRoleAppositive(m2, dict) || m2.isRoleAppositive(m1, dict);
    if (exempt) {
      return false;
    }
    // check for nesting: i-within-i
    return m1.includedIn(m2) || m2.includedIn(m1);
  }


  /**
   * Location words whose presence in the antecedent but not among the later
   * mention's modifiers makes the two mentions incompatible. Built once
   * instead of on every call, and named so that it no longer shadows the
   * class-level {@code locationModifier} set, whose contents differ.
   */
  private static final Set<String> MODIFIER_LOCATION_WORDS = Generics.newHashSet(Arrays.asList(
      "east", "west", "north", "south",
      "eastern", "western", "northern", "southern", "upper", "lower"));

  /**
   * Check whether later mention has incompatible modifier.
   *
   * <p>Only applies to mentions sharing the same head string (ignoring case).
   * The later mention {@code m} is incompatible with {@code ant} when
   * <ul>
   *   <li>{@code m} has a modifier - a token other than the head whose POS
   *       tag starts with N, JJ or V, or equals CD - whose lower-cased text
   *       does not occur among the antecedent's tokens, or</li>
   *   <li>{@code ant} contains a location word from
   *       {@code MODIFIER_LOCATION_WORDS} that is not one of {@code m}'s modifiers.</li>
   * </ul>
   *
   * @param m the later mention
   * @param ant the candidate antecedent
   * @return true if the modifiers are incompatible
   */
  public static boolean entityHaveIncompatibleModifier(Mention m, Mention ant) {
    if(!ant.headString.equalsIgnoreCase(m.headString)) return false;   // only apply to same head mentions

    // Modifiers of the later mention (the head itself is excluded).
    Set<String> thisWordSet = Generics.newHashSet();
    for (CoreLabel token : m.originalSpan) {
      String word = token.get(CoreAnnotations.TextAnnotation.class).toLowerCase();
      String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      boolean modifierTag = pos.startsWith("N") || pos.startsWith("JJ") || pos.equals("CD")
          || pos.startsWith("V");
      if (!modifierTag || word.equalsIgnoreCase(m.headString)) {
        continue;
      }
      thisWordSet.add(word);
    }

    // Every token of the antecedent, whatever its tag.
    Set<String> antWordSet = Generics.newHashSet();
    for (CoreLabel token : ant.originalSpan) {
      antWordSet.add(token.get(CoreAnnotations.TextAnnotation.class).toLowerCase());
    }

    // The later mention introduces a modifier the antecedent does not have.
    if (!antWordSet.containsAll(thisWordSet)) {
      return true;
    }
    // The antecedent is pinned to a location the later mention does not repeat.
    for (String locationWord : MODIFIER_LOCATION_WORDS) {
      if (antWordSet.contains(locationWord) && !thisWordSet.contains(locationWord)) {
        return true;
      }
    }
    return false;
  }
  /** Directional and positional words; any of them inside either mention marks the pair as different locations. */
  private static final Set<String> locationModifier = Generics.newHashSet(Arrays.asList("east", "west", "north", "south",
          "eastern", "western", "northern", "southern", "northwestern", "southwestern", "northeastern",
          "southeastern", "upper", "lower"));

  /**
   * Check whether two mentions have different locations.
   *
   * <p>Returns true when
   * <ul>
   *   <li>{@code a}'s span is a key or value of {@code dict.statesAbbreviation}
   *       and {@code m}'s head is "country" or "nation";</li>
   *   <li>either mention contains a word from {@code locationModifier}; or</li>
   *   <li>each mention contains a LOCATION-tagged word that does not occur in
   *       the other mention's lower-cased normalized span.</li>
   * </ul>
   * A LOCATION word that is a key of {@code dict.statesAbbreviation} is
   * compared through its mapped value. The mapped value used to be computed
   * and then dropped, so the mapping never affected the comparison.
   *
   * @param m the later mention
   * @param a the candidate antecedent
   * @param dict dictionaries providing {@code statesAbbreviation}
   * @return true if the mentions refer to different locations
   */
  public static boolean entityHaveDifferentLocation(Mention m, Mention a, Dictionaries dict) {

    // state and country cannot be coref
    String antecedentText = a.spanToString();
    boolean antecedentIsState = dict.statesAbbreviation.containsKey(antecedentText)
        || dict.statesAbbreviation.containsValue(antecedentText);
    if (antecedentIsState
          && (m.headString.equalsIgnoreCase("country") || m.headString.equalsIgnoreCase("nation"))) {
      return true;
    }

    String mString = m.lowercaseNormalizedSpanString();
    String aString = a.lowercaseNormalizedSpanString();

    Set<String> locationM = Generics.newHashSet();
    if (collectLocationWords(m, dict, locationM)) {
      return true;
    }
    Set<String> locationA = Generics.newHashSet();
    if (collectLocationWords(a, dict, locationA)) {
      return true;
    }

    boolean mHasExtra = false;
    for (String s : locationM) {
      if (!aString.contains(s)) {
        mHasExtra = true;
        break;
      }
    }
    boolean aHasExtra = false;
    for (String s : locationA) {
      if (!mString.contains(s)) {
        aHasExtra = true;
        break;
      }
    }
    return mHasExtra && aHasExtra;
  }

  /**
   * Adds the lower-cased LOCATION-tagged words of {@code mention} to
   * {@code locations}, replacing a word by its {@code dict.statesAbbreviation}
   * value when it is a key of that map.
   *
   * @return true as soon as a token of the mention is a {@code locationModifier}
   *         word (collection stops there), false otherwise
   */
  private static boolean collectLocationWords(Mention mention, Dictionaries dict, Set<String> locations) {
    for (CoreLabel w : mention.originalSpan) {
      String text = w.get(CoreAnnotations.TextAnnotation.class);
      String lowercased = text.toLowerCase();
      if (locationModifier.contains(lowercased)) {
        return true;
      }
      // Constant-first comparison: a token without an NER tag is simply not a location.
      if ("LOCATION".equals(w.get(CoreAnnotations.NamedEntityTagAnnotation.class))) {
        String expanded = dict.statesAbbreviation.get(text);
        locations.add(expanded != null ? expanded.toLowerCase() : lowercased);
      }
    }
    return false;
  }

  /**
   * Check whether two mentions have the same proper head words.
   * Requires equal head strings (ignoring case), an NNP-tagged head word in
   * both mentions, and that each mention, once the phrase after its head is
   * removed and the result lower-cased, ends with its head string. The
   * NNP-tagged words between each mention's start and its head are then
   * compared: the check fails only when each mention has such a word the
   * other one lacks.
   */
  public static boolean entitySameProperHeadLastWord(Mention m, Mention a) {
    if (!m.headString.equalsIgnoreCase(a.headString)) {
      return false;
    }
    String mHeadTag = m.sentenceWords.get(m.headIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class);
    if (!mHeadTag.startsWith("NNP")) {
      return false;
    }
    String aHeadTag = a.sentenceWords.get(a.headIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class);
    if (!aHeadTag.startsWith("NNP")) {
      return false;
    }

    if (!m.removePhraseAfterHead().toLowerCase().endsWith(m.headString)) {
      return false;
    }
    if (!a.removePhraseAfterHead().toLowerCase().endsWith(a.headString)) {
      return false;
    }

    Set<String> mProperNouns = properNounsPrecedingHead(m);
    Set<String> aProperNouns = properNounsPrecedingHead(a);

    boolean mHasExtra = !aProperNouns.containsAll(mProperNouns);
    boolean aHasExtra = !mProperNouns.containsAll(aProperNouns);
    return !(mHasExtra && aHasExtra);
  }

  /** Texts of the NNP-tagged sentence words from the mention's start index up to, but excluding, its head index. */
  private static Set<String> properNounsPrecedingHead(Mention mention) {
    Set<String> properNouns = Generics.newHashSet();
    for (CoreLabel w : mention.sentenceWords.subList(mention.startIndex, mention.headIndex)) {
      if (w.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) {
        properNouns.add(w.get(CoreAnnotations.TextAnnotation.class));
      }
    }
    return properNouns;
  }

  /** Lower-case English number words recognised in addition to numeric literals. */
  private static final Set<String> NUMBERS = Generics.newHashSet(Arrays.asList(
          "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred", "thousand", "million", "billion"));

  /**
   * Check whether there is a new number in later mention: true when
   * {@code mention} contains a token that is a numeric literal or (ignoring
   * case) a word from {@code NUMBERS}, and that exact token text does not
   * occur in {@code ant}.
   */
  public static boolean entityNumberInLaterMention(Mention mention, Mention ant) {
    Set<String> antecedentWords = Generics.newHashSet();
    for (CoreLabel token : ant.originalSpan) {
      antecedentWords.add(token.get(CoreAnnotations.TextAnnotation.class));
    }

    for (CoreLabel token : mention.originalSpan) {
      String word = token.get(CoreAnnotations.TextAnnotation.class);
      // Note: this is locale specific for English and ascii numerals
      boolean isNumber = NumberMatchingRegex.isDouble(word) || NUMBERS.contains(word.toLowerCase());
      if (isNumber && !antecedentWords.contains(word)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Have extra proper noun except strings involved in semantic match: true
   * when each mention contains an NNP-tagged word that neither occurs as a
   * substring of the other mention's span string nor (lower-cased) appears
   * in {@code exceptWords}.
   */
  public static boolean entityHaveExtraProperNoun(Mention m, Mention a, Set<String> exceptWords) {
    String mString = m.spanToString();
    String aString = a.spanToString();

    Set<String> mProper = Generics.newHashSet();
    for (CoreLabel token : m.originalSpan) {
      if (token.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) {
        mProper.add(token.get(CoreAnnotations.TextAnnotation.class));
      }
    }
    Set<String> aProper = Generics.newHashSet();
    for (CoreLabel token : a.originalSpan) {
      if (token.get(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("NNP")) {
        aProper.add(token.get(CoreAnnotations.TextAnnotation.class));
      }
    }

    boolean mHasExtra = false;
    for (String properNoun : mProper) {
      boolean covered = aString.contains(properNoun) || exceptWords.contains(properNoun.toLowerCase());
      if (!covered) {
        mHasExtra = true;
        break;
      }
    }

    boolean aHasExtra = false;
    for (String properNoun : aProper) {
      boolean covered = mString.contains(properNoun) || exceptWords.contains(properNoun.toLowerCase());
      if (!covered) {
        aHasExtra = true;
        break;
      }
    }

    return mHasExtra && aHasExtra;
  }

  /**
   * Is the speaker for mention the same entity as the ant entity?
   * True when the pair of mention ids is recorded in
   * {@code document.speakerPairs}, or when the antecedent matches the
   * speaker annotation carried by the mention. {@code dict} is not read here.
   */
  public static boolean antecedentIsMentionSpeaker(Document document,
                                                   Mention mention, Mention ant, Dictionaries dict) {
    boolean knownSpeakerPair =
        document.speakerPairs.contains(new Pair<>(mention.mentionID, ant.mentionID));
    if (knownSpeakerPair) {
      return true;
    }
    return antecedentMatchesMentionSpeakerAnnotation(mention, ant, document);
  }

  /** Matches one or more consecutive space characters; used to split multi-word speaker names. */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile(" +");

  /**
   * The antecedent matches the speaker annotation found in the mention.
   * When the document supplies a {@code SpeakerInfo} for the annotated
   * speaker, the decision is delegated to {@link #mentionMatchesSpeaker}
   * (non-strict). Otherwise the antecedent's head string is compared,
   * ignoring case, with the speaker string or with each of its
   * space-separated parts. False when the mention has no head word or no
   * speaker annotation.
   */
  public static boolean antecedentMatchesMentionSpeakerAnnotation(Mention mention, Mention ant, Document document) {
    if (mention.headWord == null) {
      return false;
    }
    String speaker = mention.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
    if (speaker == null) {
      return false;
    }

    SpeakerInfo speakerInfo = null;
    if (document != null) {
      speakerInfo = document.getSpeakerInfo(speaker);
    }
    if (speakerInfo != null) {
      return mentionMatchesSpeaker(ant, speakerInfo, false);
    }

    // CAN'T get speaker info - take alternate path

    // Single-word names are the common case; skip the regex split for them.
    if (speaker.indexOf(" ") < 0) {
      return ant.headString.equalsIgnoreCase(speaker);
    }
    for (String namePart : WHITESPACE_PATTERN.split(speaker)) {
      if (ant.headString.equalsIgnoreCase(namePart)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Decides whether {@code mention} refers to the speaker described by
   * {@code speakerInfo}.
   *
   * <p>Matches immediately when the mention already carries this very
   * {@code speakerInfo} object or is already registered in it. Otherwise:
   * with {@code strictMatch} the whitespace-stripped speaker name must equal
   * the whitespace-stripped mention span (ignoring case); without it the
   * mention's head word must be tagged NNP* and its head string must equal
   * one of the speaker's name strings, or the whitespace-stripped speaker
   * description must equal the whitespace-stripped span. A name match also
   * registers the mention in {@code speakerInfo}; a description match does not.
   */
  public static boolean mentionMatchesSpeaker(Mention mention, SpeakerInfo speakerInfo, boolean strictMatch) {
    // Got info about this speaker
    if (mention.speakerInfo != null && mention.speakerInfo == speakerInfo) {
      return true;
    }
    if (speakerInfo.containsMention(mention)) {
      return true;
    }

    if (strictMatch) {
      String speakerName = SpeakerInfo.WHITESPACE_PATTERN.matcher(speakerInfo.getSpeakerName()).replaceAll("");
      String mentionText = SpeakerInfo.WHITESPACE_PATTERN.matcher(mention.spanToString()).replaceAll("");
      if (!speakerName.equalsIgnoreCase(mentionText)) {
        return false;
      }
      speakerInfo.addMention(mention);
      return true;
    }

    // speaker strings are pre-split
    if (!mention.headWord.tag().startsWith("NNP")) {
      return false;
    }
    for (String namePart : speakerInfo.getSpeakerNameStrings()) {
      if (mention.headString.equalsIgnoreCase(namePart)) {
        speakerInfo.addMention(mention);
        return true;
      }
    }
    if (speakerInfo.getSpeakerDesc() == null) {
      return false;
    }
    String speakerDescription = SpeakerInfo.WHITESPACE_PATTERN.matcher(speakerInfo.getSpeakerDesc()).replaceAll("");
    String mentionText = SpeakerInfo.WHITESPACE_PATTERN.matcher(mention.spanToString()).replaceAll("");
    return speakerDescription.equalsIgnoreCase(mentionText);
  }

  /**
   * Decides whether two mentions disagree in grammatical person, taking the
   * speaker of each mention into account.
   *
   * <p>The checks run in this order:
   * <ol>
   *   <li>Same speaker, different person values: IT/THEY mixes are accepted
   *       (returns false); otherwise a disagreement is reported when neither
   *       person is UNKNOWN.</li>
   *   <li>Same speaker: a first- or second-person mention (I, WE, YOU) paired
   *       with a non-pronominal mention is a disagreement.</li>
   *   <li>A YOU mention that follows the other mention is a disagreement
   *       unless the other mention belongs to the coref cluster of the
   *       previous utterance's speaker or has person I; the mirrored rule
   *       applies when the antecedent is the YOU mention.</li>
   * </ol>
   * {@code dict} is not read in this method.
   *
   * @param document document providing speaker information
   * @param m the mention being resolved
   * @param ant the candidate antecedent
   * @param dict unused here
   * @return true if the two mentions disagree in person
   */
  public static boolean entityPersonDisagree(Document document, Mention m, Mention ant, Dictionaries dict) {
    boolean sameSpeaker = entitySameSpeaker(document, m, ant);

    if(sameSpeaker && m.person!=ant.person) {
      // NOTE(review): the THEY/THEY alternative below cannot hold inside this
      // branch, because the branch requires m.person != ant.person.
      if ((m.person == Person.IT && ant.person == Person.THEY)
           || (m.person == Person.THEY && ant.person == Person.IT) || (m.person == Person.THEY && ant.person == Person.THEY)) {
        return false;
      } else if (m.person != Person.UNKNOWN && ant.person != Person.UNKNOWN)
        return true;
      // Falls through when exactly one of the two persons is UNKNOWN.
    }
    if(sameSpeaker) {
      // A speaker's I/WE/YOU pronoun cannot corefer with a non-pronominal
      // mention uttered by the same speaker.
      if(!ant.isPronominal()) {
        if(m.person==Person.I || m.person==Person.WE || m.person==Person.YOU) return true;
      } else if(!m.isPronominal()) {
        if(ant.person==Person.I || ant.person==Person.WE || ant.person==Person.YOU) return true;
      }
    }
    if(m.person==Person.YOU && m != ant && ant.appearEarlierThan(m)) {
      assert !m.appearEarlierThan(ant);
      // Utterance index of the YOU mention; the speaker of the utterance
      // before it is looked up below.
      int mUtter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
      if (document.speakers.containsKey(mUtter - 1)) {
        String previousSpeaker = document.speakers.get(mUtter - 1);
        int previousSpeakerCorefClusterID = getSpeakerClusterId(document, previousSpeaker);
        if (previousSpeakerCorefClusterID < 0) {
          // Previous speaker has no known cluster: treated as a disagreement.
          return true;
        }
        if (ant.corefClusterID != previousSpeakerCorefClusterID && ant.person != Person.I) {
          return true;
        }
      } else {
        // No speaker recorded for the previous utterance.
        return true;
      }
    } else if (ant.person==Person.YOU && m != ant && m.appearEarlierThan(ant)) {
      // Mirror image of the branch above, with the roles of m and ant swapped.
      assert !(ant.appearEarlierThan(m));
      int aUtter = ant.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
      if (document.speakers.containsKey(aUtter - 1)) {
        String previousSpeaker = document.speakers.get(aUtter - 1);
        int previousSpeakerCorefClusterID = getSpeakerClusterId(document, previousSpeaker);
        if (previousSpeakerCorefClusterID < 0) {
          return true;
        }
        if (m.corefClusterID != previousSpeakerCorefClusterID && m.person != Person.I) {
          return true;
        }
      } else {
        return true;
      }
    }
    return false;
  }

  /**
   * Do the mentions share the same speaker? Both head words must carry a
   * speaker annotation. The speakers are the same when the annotation strings
   * are equal, or when both strings resolve to the same non-negative coref
   * cluster id.
   */
  public static boolean entitySameSpeaker(Document document, Mention m, Mention ant) {
    String mentionSpeaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
    if (mentionSpeaker == null) {
      return false;
    }
    String antecedentSpeaker = ant.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
    if (antecedentSpeaker == null) {
      return false;
    }

    // Identical speaker strings settle the question without any lookup.
    if (mentionSpeaker.equals(antecedentSpeaker)) {
      return true;
    }

    // Otherwise compare the clusters the two speakers map to.
    int mentionSpeakerCluster = getSpeakerClusterId(document, mentionSpeaker);
    int antecedentSpeakerCluster = getSpeakerClusterId(document, antecedentSpeaker);
    return mentionSpeakerCluster >= 0
        && antecedentSpeakerCluster >= 0
        && mentionSpeakerCluster == antecedentSpeakerCluster;
  }

  /**
   * Resolves a speaker name to the id of the coref cluster it belongs to.
   *
   * <p>The id is first taken from the document's {@link SpeakerInfo} for that name, if there is
   * one. When that yields no non-negative id and the name is a decimal integer, the name is
   * interpreted as a mention id and looked up in {@code document.predictedMentionsByID}; a
   * mention found this way supplies the cluster id and is also added to the speaker info
   * (when one exists).
   *
   * @param document The document to search in
   * @param speakerString The name to search for (may be {@code null})
   * @return cluster id, or -1 if no cluster could be determined
   */
  public static int getSpeakerClusterId(Document document, String speakerString) {
    if (speakerString == null) {
      return -1;
    }

    // First choice: the cluster id recorded in the speaker info.
    SpeakerInfo info = document.getSpeakerInfo(speakerString);
    int clusterId = -1;
    if (info != null) {
      clusterId = info.getCorefClusterId();
    }
    if (clusterId >= 0 || !NumberMatchingRegex.isDecimalInteger(speakerString)) {
      return clusterId;
    }

    // Fallback: a numeric speaker string is treated as a mention id.
    try {
      int mentionId = Integer.parseInt(speakerString);
      Mention speakerMention = document.predictedMentionsByID.get(mentionId);
      if (speakerMention != null) {
        clusterId = speakerMention.corefClusterID;
        if (info != null) {
          info.addMention(speakerMention);
        }
      }
    } catch (Exception ignored) {
      // Best-effort lookup: any failure here simply leaves the id found so far.
    }
    return clusterId;
  }

  /**
   * Tests whether two mentions stand in a subject/object relation to the same verb.
   *
   * <p>Both mentions must be in the same sentence and have a non-null {@code dependingVerb}
   * that is the very same object (reference comparison). One of them must be flagged as the
   * subject while the other is flagged as a direct, indirect or preposition object.
   *
   * @param m1 The first mention
   * @param m2 The second mention
   * @return {@code true} if one mention is the subject and the other an object of the same verb
   */
  public static boolean entitySubjectObject(Mention m1, Mention m2) {
    if (m1.sentNum != m2.sentNum) {
      return false;
    }
    if (m1.dependingVerb == null || m2.dependingVerb == null) {
      return false;
    }
    if (m1.dependingVerb != m2.dependingVerb) {
      return false;
    }

    final boolean firstIsObject = m1.isDirectObject || m1.isIndirectObject || m1.isPrepositionObject;
    final boolean secondIsObject = m2.isDirectObject || m2.isIndirectObject || m2.isPrepositionObject;
    return (m1.isSubject && secondIsObject) || (m2.isSubject && firstIsObject);
  }

  /**
   * Tests whether two mentions of the same sentence start close to each other.
   *
   * <p>Returns {@code true} when both mentions have the same {@code sentNum} and
   * {@code m1.startIndex - m2.startIndex} is less than 6. The difference is signed, so the
   * check also succeeds whenever {@code m1} starts before {@code m2} in the same sentence.
   * NOTE(review): presumably callers pass the later mention as {@code m1} - confirm.
   *
   * @param m1 The first mention
   * @param m2 The second mention
   * @return {@code true} if the mentions share a sentence and the start offset is below 6
   */
  public static boolean entityTokenDistance(Mention m1, Mention m2) {
    final boolean sameSentence = (m2.sentNum == m1.sentNum);
    final int startOffset = m1.startIndex - m2.startIndex;
    return sameSentence && startOffset < 6;
  }

  /**
   * COREF_DICT strict: every considered mention pair between the two clusters must match in
   * the dictionary.
   *
   * <p>Pronominal mentions on either side are skipped, as are pairs whose head-word lemmas are
   * equal. The first considered pair that fails {@link #entityCorefDictionary} makes the result
   * {@code false}; if no pair is considered at all the result is also {@code false}.
   *
   * @param menCluster The cluster of the mention
   * @param antCluster The cluster of the candidate antecedent
   * @param dict The dictionaries holding the coref dictionary
   * @param dictColumn Passed on to {@link #entityCorefDictionary} as the dictionary version
   * @param freq Passed on to {@link #entityCorefDictionary} as the frequency threshold
   * @return {@code true} if at least one pair was considered and all considered pairs matched
   */
  public static boolean entityClusterAllCorefDictionary(CorefCluster menCluster, CorefCluster antCluster,
      Dictionaries dict, int dictColumn, int freq) {
    boolean sawMatch = false;
    for (Mention mention : menCluster.getCorefMentions()) {
      if (mention.isPronominal()) {
        continue;
      }
      for (Mention antecedent : antCluster.getCorefMentions()) {
        final boolean skipPair = antecedent.isPronominal()
            || mention.headWord.lemma().equals(antecedent.headWord.lemma());
        if (skipPair) {
          continue;
        }
        if (!entityCorefDictionary(mention, antecedent, dict, dictColumn, freq)) {
          // A single mismatch among the considered pairs rejects the whole cluster pair.
          return false;
        }
        sawMatch = true;
      }
    }
    return sawMatch;
  }

   /**
    * COREF_DICT pairwise: the two mentions match in the dictionary.
    *
    * <p>The lookup key is the pair of lower-cased split patterns of the two mentions for the
    * given dictionary version. Lower-casing uses {@link Locale#ROOT} so that the key does not
    * depend on the JVM's default locale (e.g. the Turkish dotless-i mapping would otherwise
    * produce keys that never match the dictionary).
    *
    * <p>The pair matches if its count in the selected dictionary exceeds a version-specific
    * high-frequency threshold (75 for version 1, 16 for versions 2-4). Otherwise it matches
    * only if the count exceeds {@code freq} and the pair either has a PMI count above 0.18 or
    * is absent from the PMI dictionary.
    *
    * @param men The mention
    * @param ant The candidate antecedent
    * @param dict The dictionaries holding {@code corefDict} and {@code corefDictPMI}
    * @param dictVersion 1-based index of the dictionary (and split pattern) to use
    * @param freq Minimum count (exclusive) for the PMI-backed match
    * @return {@code true} if the pair matches in the dictionary
    */
   public static boolean entityCorefDictionary(Mention men, Mention ant, Dictionaries dict, int dictVersion, int freq){

     Pair<String, String> mention_pair = new Pair<>(
             men.getSplitPattern()[dictVersion - 1].toLowerCase(Locale.ROOT),
             ant.getSplitPattern()[dictVersion - 1].toLowerCase(Locale.ROOT));

     final int high_freq;
     switch (dictVersion) {
       case 1:
         high_freq = 75;
         break;
       case 2:
       case 3:
       case 4:
         high_freq = 16;
         break;
       default:
         high_freq = -1;
         break;
     }

     // Look the pair up once; both threshold checks use the same count.
     final double pairCount = dict.corefDict.get(dictVersion - 1).getCount(mention_pair);

     if (pairCount > high_freq) {
       return true;
     }

     if (pairCount > freq) {
       // Moderately frequent pair: accept on a good PMI score, or when no PMI score is recorded.
       if (dict.corefDictPMI.getCount(mention_pair) > 0.18) {
         return true;
       }
       if (!dict.corefDictPMI.containsKey(mention_pair)) {
         return true;
       }
     }
     return false;
   }

   /**
    * Tests whether the context of a mention is incompatible with a proper-noun antecedent.
    *
    * <p>The check only applies when the antecedent is a PROPER mention in a different sentence,
    * the contexts of the two mentions do not overlap, and {@code dict.NE_signatures} has an
    * entry for the antecedent's head word. The mention's premodifier context is used if it is
    * non-empty, otherwise its general context. For every context word the rank of that word in
    * the antecedent head's signature, and the rank of the antecedent head in that word's own
    * signature, are consulted; the pair is incompatible when the smallest rank found is above 10
    * (including when no rank is found at all).
    *
    * @param men The mention
    * @param ant The candidate antecedent
    * @param dict The dictionaries holding {@code NE_signatures}
    * @return {@code true} if the check applies and the contexts are judged incompatible
    */
   public static boolean contextIncompatible(Mention men, Mention ant, Dictionaries dict) {
     final String antecedentHead = ant.headWord.word();

     final boolean applicable = ant.mentionType == MentionType.PROPER
         && ant.sentNum != men.sentNum
         && !isContextOverlapping(ant, men)
         && dict.NE_signatures.containsKey(antecedentHead);
     if (!applicable) {
       return false;
     }

     IntCounter<String> headRanks = Counters.toRankCounter(dict.NE_signatures.get(antecedentHead));

     // Prefer the premodifier context; fall back to the general context.
     List<String> contextWords = men.getPremodifierContext();
     if (contextWords.isEmpty()) {
       contextWords = men.getContext();
     }
     if (contextWords.isEmpty()) {
       return false;
     }

     int bestRank = 100000;
     for (String word : contextWords) {
       if (headRanks.containsKey(word)) {
         bestRank = Math.min(bestRank, headRanks.getIntCount(word));
       }
       // check in the other direction
       if (dict.NE_signatures.containsKey(word)) {
         IntCounter<String> wordRanks = Counters.toRankCounter(dict.NE_signatures.get(word));
         if (wordRanks.containsKey(antecedentHead)) {
           bestRank = Math.min(bestRank, wordRanks.getIntCount(antecedentHead));
         }
       }
     }
     return bestRank > 10;
   }

   /**
    * Tests whether the contexts of two non-proper mentions are incompatible.
    *
    * <p>The check only applies when neither mention is PROPER, they are in different sentences,
    * and their contexts do not overlap. For each mention the premodifier context is used if it
    * is non-empty, otherwise the general context; if either resulting context is empty the
    * answer is {@code false}. For every pair of words (one from each context) the rank of each
    * word in the other's {@code dict.NE_signatures} entry is consulted; the mentions are
    * incompatible when the smallest rank found is above 10 (including when none is found).
    *
    * <p>Each rank counter depends on a single word, so it is built once per word in two
    * separate passes rather than once per word pair; the minimum is the same either way.
    *
    * @param men The mention
    * @param ant The candidate antecedent
    * @param dict The dictionaries holding {@code NE_signatures}
    * @return {@code true} if the check applies and the contexts are judged incompatible
    */
   public static boolean sentenceContextIncompatible(Mention men, Mention ant, Dictionaries dict) {
     if (ant.mentionType == MentionType.PROPER
         || ant.sentNum == men.sentNum
         || men.mentionType == MentionType.PROPER
         || isContextOverlapping(ant, men)) {
       return false;
     }

     List<String> context1 = !ant.getPremodifierContext().isEmpty() ? ant.getPremodifierContext() : ant.getContext();
     List<String> context2 = !men.getPremodifierContext().isEmpty() ? men.getPremodifierContext() : men.getContext();
     if (context1.isEmpty() || context2.isEmpty()) {
       return false;
     }

     int highestRank = 100000;

     // Forward direction: rank of each men-context word within the signature of an ant-context word.
     for (String w1 : context1) {
       if (!dict.NE_signatures.containsKey(w1)) {
         continue;
       }
       IntCounter<String> ranks = Counters.toRankCounter(dict.NE_signatures.get(w1));
       for (String w2 : context2) {
         if (ranks.containsKey(w2)) {
           highestRank = Math.min(highestRank, ranks.getIntCount(w2));
         }
       }
     }

     // Reverse direction: rank of each ant-context word within the signature of a men-context word.
     for (String w2 : context2) {
       if (!dict.NE_signatures.containsKey(w2)) {
         continue;
       }
       IntCounter<String> reverseRanks = Counters.toRankCounter(dict.NE_signatures.get(w2));
       for (String w1 : context1) {
         if (reverseRanks.containsKey(w1)) {
           highestRank = Math.min(highestRank, reverseRanks.getIntCount(w1));
         }
       }
     }

     return highestRank > 10;
   }

   /**
    * Tests whether the contexts of two mentions have at least one word in common.
    *
    * @param m1 The first mention
    * @param m2 The second mention
    * @return {@code true} if the two context word sets intersect
    */
   private static boolean isContextOverlapping(Mention m1, Mention m2) {
     Set<String> firstWords = Generics.newHashSet();
     firstWords.addAll(m1.getContext());

     Set<String> secondWords = Generics.newHashSet();
     secondWords.addAll(m2.getContext());

     return Sets.intersects(firstWords, secondWords);
   }


}