// RFSieve.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.coref.hybrid.sieve;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.CorefRules;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Dictionaries.Animacy;
import edu.stanford.nlp.coref.data.Dictionaries.Gender;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Dictionaries.Number;
import edu.stanford.nlp.coref.data.Dictionaries.Person;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Document.DocType;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.hybrid.HybridCorefPrinter;
import edu.stanford.nlp.coref.hybrid.HybridCorefProperties;
import edu.stanford.nlp.coref.hybrid.rf.RandomForest;
import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.RVFDatum;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;

public class RFSieve extends Sieve  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(RFSieve.class);

  private static final long serialVersionUID = -4090017054885920527L;

  /** Random forest classifier; queried through {@code probabilityOfTrue} to score a mention/candidate pair. */
  public RandomForest rf;

  /**
   * The probability threshold for merging two mentions: a merge happens only when the best
   * candidate's score is strictly greater than this value. The constructor does not assign it,
   * so it holds the Java default of 0.0 until it is set from outside.
   */
  public double thresMerge;

  /**
   * Creates a sieve backed by a random forest classifier.
   *
   * @param rf the random forest used to score mention/candidate pairs
   * @param props configuration; forwarded to the superclass constructor and stored on this instance
   * @param sievename name of this sieve; forwarded to the superclass constructor
   */
  public RFSieve(RandomForest rf, Properties props, String sievename) {
    super(props, sievename);
    this.classifierType = ClassifierType.RF;
    this.props = props;
    this.rf = rf;
  }

  /**
   * Scores the admissible antecedent candidates of {@code m} and, when the best score is
   * strictly greater than {@link #thresMerge}, merges {@code m} with that candidate.
   *
   * <p>Candidates are requested one sentence at a time, starting at the mention's own sentence
   * and moving to lower sentence indices, up to {@code maxSentDist} sentences away. A candidate
   * is skipped when {@code skipForAnalysis} says so, when it is the mention itself, when its
   * mention type is not in {@code aType}, when (for a pronominal mention) either side fails
   * {@code matchedMentionType}, or when it is in the same sentence and the mention appears
   * earlier than it (cataphora).
   *
   * @param m the mention to resolve
   * @param mIdx index of the mention; passed through to {@code getOrderedAntecedents} and the debug printer
   * @param document the document holding the predicted mentions; updated by {@code Sieve.merge} on a merge
   * @param dict dictionaries passed through to candidate ordering and feature extraction
   * @param props configuration used for skipping, feature extraction and the debug switch
   * @param sbLog receives the error log text, only when {@code HybridCorefProperties.debug(props)} is true
   * @throws Exception declared by the signature; nothing in this body throws it directly
   */
  public void findCoreferentAntecedent(Mention m, int mIdx, Document document, Dictionaries dict, Properties props, StringBuilder sbLog) throws Exception {
    final int mentionSentence = m.sentNum;
    final Counter<Integer> scores = new ClassicCounter<>();

    // Running count of candidates that passed all filters; handed to extractDatum as the mention distance.
    int acceptedSoFar = 0;
    int sentDist = 0;
    while (sentDist <= Math.min(this.maxSentDist, mentionSentence)) {
      for (Mention candidate : getOrderedAntecedents(m, mentionSentence - sentDist, mIdx, document.predictedMentions, dict)) {
        if (skipForAnalysis(candidate, m, props)
            || candidate == m
            || !aType.contains(candidate.mentionType)) {
          continue;
        }
        if (m.mentionType == MentionType.PRONOMINAL
            && !(matchedMentionType(m, mTypeStr) && matchedMentionType(candidate, aTypeStr))) {
          continue;
        }
        // ignore cataphora: within the same sentence the candidate must not come after the mention
        if (sentDist == 0 && m.appearEarlierThan(candidate)) {
          continue;
        }

        acceptedSoFar++;
        RVFDatum<Boolean, String> pairDatum = extractDatum(m, candidate, document, acceptedSoFar, dict, props, sievename);

        // Any classifier type other than RF leaves the score at zero.
        double score = (this.classifierType == ClassifierType.RF) ? this.rf.probabilityOfTrue(pairDatum) : 0;
        scores.setCount(candidate.mentionID, score);
      }
      sentDist++;
    }

    if (HybridCorefProperties.debug(props)) {
      sbLog.append(HybridCorefPrinter.printErrorLog(m, document, scores, mIdx, dict, this));
    }

    boolean confidentMatch = scores.size() > 0 && Counters.max(scores) > this.thresMerge;
    if (!confidentMatch) {
      return;
    }

    // merge highest prob candidate
    int bestAntecedentId = Counters.argmax(scores);
    Sieve.merge(document, m.mentionID, bestAntecedentId);
  }
  public static RVFDatum<Boolean, String> extractDatum(Mention m, Mention candidate,
      Document document, int mentionDist, Dictionaries dict, Properties props, String sievename) {
    try {

      boolean label = (document.goldMentions==null)? false : document.isCoref(m, candidate);
      Counter<String> features = new ClassicCounter<>();
      CorefCluster mC = document.corefClusters.get(m.corefClusterID);
      CorefCluster aC = document.corefClusters.get(candidate.corefClusterID);

      CoreLabel mFirst = m.sentenceWords.get(m.startIndex);
      CoreLabel mLast = m.sentenceWords.get(m.endIndex-1);
      CoreLabel mPreceding = (m.startIndex>0)? m.sentenceWords.get(m.startIndex-1) : null;
      CoreLabel mFollowing = (m.endIndex < m.sentenceWords.size())? m.sentenceWords.get(m.endIndex) : null;

      CoreLabel aFirst = candidate.sentenceWords.get(candidate.startIndex);
      CoreLabel aLast = candidate.sentenceWords.get(candidate.endIndex-1);
      CoreLabel aPreceding = (candidate.startIndex>0)? candidate.sentenceWords.get(candidate.startIndex-1) : null;
      CoreLabel aFollowing = (candidate.endIndex < candidate.sentenceWords.size())? candidate.sentenceWords.get(candidate.endIndex) : null;


      ////////////////////////////////////////////////////////////////////////////////
      ///////    basic features: distance, doctype, mention length, roles ////////////
      ////////////////////////////////////////////////////////////////////////////////
      if(HybridCorefProperties.useBasicFeatures(props, sievename)) {
        int sentDist = m.sentNum - candidate.sentNum;
        features.incrementCount("SENTDIST", sentDist);
        features.incrementCount("MENTIONDIST", mentionDist);

        int minSentDist = sentDist;
        for(Mention a : aC.corefMentions) {
          minSentDist = Math.min(minSentDist, Math.abs(m.sentNum - a.sentNum));
        }
        features.incrementCount("MINSENTDIST", minSentDist);

        // When they are in the same sentence, divides a sentence into clauses and add such feature
        if(CorefProperties.useConstituencyParse(props)) {
          if(m.sentNum == candidate.sentNum) {
            int clauseCount = 0;
            Tree tree = m.contextParseTree;
            Tree current = m.mentionSubTree;

            while(true){
              current = current.ancestor(1, tree);
              if(current.label().value().startsWith("S")) {
                clauseCount++;
              }
              if(current.dominates(candidate.mentionSubTree)) break;
              if(current.label().value().equals("ROOT") || current.ancestor(1, tree)==null) break;
            }
            features.incrementCount("CLAUSECOUNT", clauseCount);
          }
        }

        if(document.docType == DocType.CONVERSATION) features.incrementCount("B-DOCTYPE-"+document.docType);
        if(m.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0")) {
          features.incrementCount("B-SPEAKER-PER0");
        }

        if(document.docInfo!=null && document.docInfo.containsKey("DOC_ID")) {
          features.incrementCount("B-DOCSOURCE-"+document.docInfo.get("DOC_ID").split("/")[1]);
        }

        features.incrementCount("M-LENGTH", m.originalSpan.size());
        features.incrementCount("A-LENGTH", candidate.originalSpan.size());
        if(m.originalSpan.size() < candidate.originalSpan.size()) features.incrementCount("B-A-ISLONGER");
        features.incrementCount("A-SIZE", aC.getCorefMentions().size());
        features.incrementCount("M-SIZE", mC.getCorefMentions().size());

        String antRole = "A-NOROLE";
        String mRole = "M-NOROLE";

        if(m.isSubject) mRole = "M-SUBJ";
        if(m.isDirectObject) mRole = "M-DOBJ";
        if(m.isIndirectObject) mRole = "M-IOBJ";
        if(m.isPrepositionObject) mRole = "M-POBJ";

        if(candidate.isSubject) antRole = "A-SUBJ";
        if(candidate.isDirectObject) antRole = "A-DOBJ";
        if(candidate.isIndirectObject) antRole = "A-IOBJ";
        if(candidate.isPrepositionObject) antRole = "A-POBJ";

        features.incrementCount("B-"+mRole);
        features.incrementCount("B-"+antRole);
        features.incrementCount("B-"+antRole+"-"+mRole);

        if(HybridCorefProperties.combineObjectRoles(props, sievename)) {
          // combine all objects
          if(m.isDirectObject || m.isIndirectObject || m.isPrepositionObject
              || candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) {
            if(m.isDirectObject || m.isIndirectObject || m.isPrepositionObject) {
              mRole = "M-OBJ";
              features.incrementCount("B-M-OBJ");
            }
            if(candidate.isDirectObject || candidate.isIndirectObject || candidate.isPrepositionObject) {
              antRole = "A-OBJ";
              features.incrementCount("B-A-OBJ");
            }
            features.incrementCount("B-"+antRole+"-"+mRole);
          }
        }

        if(mFirst.word().toLowerCase().matches("a|an")) {
          features.incrementCount("B-M-START-WITH-INDEFINITE");
        }
        if(aFirst.word().toLowerCase().matches("a|an")) {
          features.incrementCount("B-A-START-WITH-INDEFINITE");
        }
        if(mFirst.word().equalsIgnoreCase("the")) {
          features.incrementCount("B-M-START-WITH-DEFINITE");
        }
        if(aFirst.word().equalsIgnoreCase("the")) {
          features.incrementCount("B-A-START-WITH-DEFINITE");
        }

        if(dict.indefinitePronouns.contains(m.lowercaseNormalizedSpanString())) {
          features.incrementCount("B-M-INDEFINITE-PRONOUN");
        }
        if(dict.indefinitePronouns.contains(candidate.lowercaseNormalizedSpanString())) {
          features.incrementCount("B-A-INDEFINITE-PRONOUN");
        }
        if(dict.indefinitePronouns.contains(mFirst.word().toLowerCase())) {
          features.incrementCount("B-M-INDEFINITE-ADJ");
        }
        if(dict.indefinitePronouns.contains(aFirst.word().toLowerCase())){
          features.incrementCount("B-A-INDEFINITE-ADJ");
        }
        if(dict.reflexivePronouns.contains(m.headString)) {
          features.incrementCount("B-M-REFLEXIVE");
        }
        if(dict.reflexivePronouns.contains(candidate.headString)) {
          features.incrementCount("B-A-REFLEXIVE");
        }

        if(m.headIndex == m.endIndex-1) features.incrementCount("B-M-HEADEND");
        if(m.headIndex < m.endIndex-1) {
          CoreLabel headnext = m.sentenceWords.get(m.headIndex+1);
          if(headnext.word().matches("that|,") || headnext.tag().startsWith("W")) {
            features.incrementCount("B-M-HASPOSTPHRASE");
            if(mFirst.tag().equals("DT") && mFirst.word().toLowerCase().matches("the|this|these|those")) features.incrementCount("B-M-THE-HASPOSTPHRASE");
            else if(mFirst.word().toLowerCase().matches("a|an")) features.incrementCount("B-M-INDEFINITE-HASPOSTPHRASE");
          }
        }

        // shape feature from Bjorkelund & Kuhn
        StringBuilder sb = new StringBuilder();
        List<Mention> sortedMentions = new ArrayList<>(aC.corefMentions.size());
        sortedMentions.addAll(aC.corefMentions);
        Collections.sort(sortedMentions, new CorefChain.MentionComparator());
        for(Mention a : sortedMentions) {
          sb.append(a.mentionType).append("-");
        }
        features.incrementCount("B-A-SHAPE-"+sb.toString());

        sb = new StringBuilder();
        sortedMentions = new ArrayList<>(mC.corefMentions.size());
        sortedMentions.addAll(mC.corefMentions);
        Collections.sort(sortedMentions, new CorefChain.MentionComparator());
        for(Mention men : sortedMentions) {
          sb.append(men.mentionType).append("-");
        }
        features.incrementCount("B-M-SHAPE-"+sb.toString());

        if(CorefProperties.useConstituencyParse(props)) {
          sb = new StringBuilder();
          Tree mTree = m.contextParseTree;
          Tree mHead = mTree.getLeaves().get(m.headIndex).ancestor(1, mTree);
          for(Tree node : mTree.pathNodeToNode(mHead, mTree)){
            sb.append(node.value()).append("-");
            if(node.value().equals("S")) break;
          }
          features.incrementCount("B-M-SYNPATH-"+sb.toString());

          sb = new StringBuilder();
          Tree aTree = candidate.contextParseTree;
          Tree aHead = aTree.getLeaves().get(candidate.headIndex).ancestor(1, aTree);
          for(Tree node : aTree.pathNodeToNode(aHead, aTree)){
            sb.append(node.value()).append("-");
            if(node.value().equals("S")) break;
          }
          features.incrementCount("B-A-SYNPATH-"+sb.toString());
        }


        features.incrementCount("A-FIRSTAPPEAR", aC.representative.sentNum);
        features.incrementCount("M-FIRSTAPPEAR", mC.representative.sentNum);
        int docSize = document.predictedMentions.size();   // document size in # of sentences
        features.incrementCount("A-FIRSTAPPEAR-NORMALIZED", aC.representative.sentNum/docSize);
        features.incrementCount("M-FIRSTAPPEAR-NORMALIZED", mC.representative.sentNum/docSize);
      }

      ////////////////////////////////////////////////////////////////////////////////
      ///////    mention detection features                               ////////////
      ////////////////////////////////////////////////////////////////////////////////
      if(HybridCorefProperties.useMentionDetectionFeatures(props, sievename)) {
        // bare plurals
        if(m.originalSpan.size()==1 && m.headWord.tag().equals("NNS")) features.incrementCount("B-M-BAREPLURAL");
        if(candidate.originalSpan.size()==1 && candidate.headWord.tag().equals("NNS")) features.incrementCount("B-A-BAREPLURAL");

        // pleonastic it
        if(CorefProperties.useConstituencyParse(props)) {
          if(RuleBasedCorefMentionFinder.isPleonastic(m, m.contextParseTree)
              || RuleBasedCorefMentionFinder.isPleonastic(candidate, candidate.contextParseTree)) {
            features.incrementCount("B-PLEONASTICIT");
          }
        }

        // quantRule
        if(dict.quantifiers.contains(mFirst.word().toLowerCase(Locale.ENGLISH))) features.incrementCount("B-M-QUANTIFIER");
        if(dict.quantifiers.contains(aFirst.word().toLowerCase(Locale.ENGLISH))) features.incrementCount("B-A-QUANTIFIER");

        // starts with negation
        if(mFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not")
            || aFirst.word().toLowerCase(Locale.ENGLISH).matches("none|no|nothing|not")) {
          features.incrementCount("B-NEGATIVE-START");
        }

        // parititive rule
        if(RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dict)) features.incrementCount("B-M-PARTITIVE");
        if(RuleBasedCorefMentionFinder.partitiveRule(candidate, candidate.sentenceWords, dict)) features.incrementCount("B-A-PARTITIVE");

        // %
        if(m.headString.equals("%")) features.incrementCount("B-M-HEAD%");
        if(candidate.headString.equals("%")) features.incrementCount("B-A-HEAD%");

        // adjective form of nations
        if(dict.isAdjectivalDemonym(m.spanToString())) features.incrementCount("B-M-ADJ-DEMONYM");
        if(dict.isAdjectivalDemonym(candidate.spanToString())) features.incrementCount("B-A-ADJ-DEMONYM");

        // ends with "etc."
        if(m.lowercaseNormalizedSpanString().endsWith("etc.")) features.incrementCount("B-M-ETC-END");
        if(candidate.lowercaseNormalizedSpanString().endsWith("etc.")) features.incrementCount("B-A-ETC-END");

      }

      ////////////////////////////////////////////////////////////////////////////////
      ///////    attributes, attributes agree                             ////////////
      ////////////////////////////////////////////////////////////////////////////////
      features.incrementCount("B-M-NUMBER-"+m.number);
      features.incrementCount("B-A-NUMBER-"+candidate.number);
      features.incrementCount("B-M-GENDER-"+m.gender);
      features.incrementCount("B-A-GENDER-"+candidate.gender);
      features.incrementCount("B-M-ANIMACY-"+m.animacy);
      features.incrementCount("B-A-ANIMACY-"+candidate.animacy);
      features.incrementCount("B-M-PERSON-"+m.person);
      features.incrementCount("B-A-PERSON-"+candidate.person);
      features.incrementCount("B-M-NETYPE-"+m.nerString);
      features.incrementCount("B-A-NETYPE-"+candidate.nerString);

      features.incrementCount("B-BOTH-NUMBER-"+candidate.number+"-"+m.number);
      features.incrementCount("B-BOTH-GENDER-"+candidate.gender+"-"+m.gender);
      features.incrementCount("B-BOTH-ANIMACY-"+candidate.animacy+"-"+m.animacy);
      features.incrementCount("B-BOTH-PERSON-"+candidate.person+"-"+m.person);
      features.incrementCount("B-BOTH-NETYPE-"+candidate.nerString+"-"+m.nerString);


      Set<Number> mcNumber = Generics.newHashSet();
      for(Number n : mC.numbers) {
        features.incrementCount("B-MC-NUMBER-"+n);
        mcNumber.add(n);
      }
      if(mcNumber.size()==1) {
        features.incrementCount("B-MC-CLUSTERNUMBER-"+mcNumber.iterator().next());
      } else {
        mcNumber.remove(Number.UNKNOWN);
        if(mcNumber.size() == 1) features.incrementCount("B-MC-CLUSTERNUMBER-"+mcNumber.iterator().next());
        else features.incrementCount("B-MC-CLUSTERNUMBER-CONFLICT");
      }

      Set<Gender> mcGender = Generics.newHashSet();
      for(Gender g : mC.genders) {
        features.incrementCount("B-MC-GENDER-"+g);
        mcGender.add(g);
      }
      if(mcGender.size()==1) {
        features.incrementCount("B-MC-CLUSTERGENDER-"+mcGender.iterator().next());
      } else {
        mcGender.remove(Gender.UNKNOWN);
        if(mcGender.size() == 1) features.incrementCount("B-MC-CLUSTERGENDER-"+mcGender.iterator().next());
        else features.incrementCount("B-MC-CLUSTERGENDER-CONFLICT");
      }

      Set<Animacy> mcAnimacy = Generics.newHashSet();
      for(Animacy a : mC.animacies) {
        features.incrementCount("B-MC-ANIMACY-"+a);
        mcAnimacy.add(a);
      }
      if(mcAnimacy.size()==1) {
        features.incrementCount("B-MC-CLUSTERANIMACY-"+mcAnimacy.iterator().next());
      } else {
        mcAnimacy.remove(Animacy.UNKNOWN);
        if(mcAnimacy.size() == 1) features.incrementCount("B-MC-CLUSTERANIMACY-"+mcAnimacy.iterator().next());
        else features.incrementCount("B-MC-CLUSTERANIMACY-CONFLICT");
      }

      Set<String> mcNER = Generics.newHashSet();
      for(String t : mC.nerStrings) {
        features.incrementCount("B-MC-NETYPE-"+t);
        mcNER.add(t);
      }
      if(mcNER.size()==1) {
        features.incrementCount("B-MC-CLUSTERNETYPE-"+mcNER.iterator().next());
      } else {
        mcNER.remove("O");
        if(mcNER.size() == 1) features.incrementCount("B-MC-CLUSTERNETYPE-"+mcNER.iterator().next());
        else features.incrementCount("B-MC-CLUSTERNETYPE-CONFLICT");
      }

      Set<Number> acNumber = Generics.newHashSet();
      for(Number n : aC.numbers) {
        features.incrementCount("B-AC-NUMBER-"+n);
        acNumber.add(n);
      }
      if(acNumber.size()==1) {
        features.incrementCount("B-AC-CLUSTERNUMBER-"+acNumber.iterator().next());
      } else {
        acNumber.remove(Number.UNKNOWN);
        if(acNumber.size() == 1) features.incrementCount("B-AC-CLUSTERNUMBER-"+acNumber.iterator().next());
        else features.incrementCount("B-AC-CLUSTERNUMBER-CONFLICT");
      }

      Set<Gender> acGender = Generics.newHashSet();
      for(Gender g : aC.genders) {
        features.incrementCount("B-AC-GENDER-"+g);
        acGender.add(g);
      }
      if(acGender.size()==1) {
        features.incrementCount("B-AC-CLUSTERGENDER-"+acGender.iterator().next());
      } else {
        acGender.remove(Gender.UNKNOWN);
        if(acGender.size() == 1) features.incrementCount("B-AC-CLUSTERGENDER-"+acGender.iterator().next());
        else features.incrementCount("B-AC-CLUSTERGENDER-CONFLICT");
      }

      Set<Animacy> acAnimacy = Generics.newHashSet();
      for(Animacy a : aC.animacies) {
        features.incrementCount("B-AC-ANIMACY-"+a);
        acAnimacy.add(a);
      }
      if(acAnimacy.size()==1) {
        features.incrementCount("B-AC-CLUSTERANIMACY-"+acAnimacy.iterator().next());
      } else {
        acAnimacy.remove(Animacy.UNKNOWN);
        if(acAnimacy.size() == 1) features.incrementCount("B-AC-CLUSTERANIMACY-"+acAnimacy.iterator().next());
        else features.incrementCount("B-AC-CLUSTERANIMACY-CONFLICT");
      }

      Set<String> acNER = Generics.newHashSet();
      for(String t : aC.nerStrings) {
        features.incrementCount("B-AC-NETYPE-"+t);
        acNER.add(t);
      }
      if(acNER.size()==1) {
        features.incrementCount("B-AC-CLUSTERNETYPE-"+acNER.iterator().next());
      } else {
        acNER.remove("O");
        if(acNER.size() == 1) features.incrementCount("B-AC-CLUSTERNETYPE-"+acNER.iterator().next());
        else features.incrementCount("B-AC-CLUSTERNETYPE-CONFLICT");
      }


      if(m.numbersAgree(candidate)) features.incrementCount("B-NUMBER-AGREE");
      if(m.gendersAgree(candidate)) features.incrementCount("B-GENDER-AGREE");
      if(m.animaciesAgree(candidate)) features.incrementCount("B-ANIMACY-AGREE");
      if(CorefRules.entityAttributesAgree(mC, aC)) features.incrementCount("B-ATTRIBUTES-AGREE");
      if(CorefRules.entityPersonDisagree(document, m, candidate, dict)) features.incrementCount("B-PERSON-DISAGREE");

      ////////////////////////////////////////////////////////////////////////////////
      ///////    dcoref rules                                             ////////////
      ////////////////////////////////////////////////////////////////////////////////
      if(HybridCorefProperties.useDcorefRules(props, sievename)) {
        if(CorefRules.entityIWithinI(m, candidate, dict)) features.incrementCount("B-i-within-i");
        if(CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) features.incrementCount("B-ANT-IS-SPEAKER");
        if(CorefRules.entitySameSpeaker(document, m, candidate)) features.incrementCount("B-SAME-SPEAKER");
        if(CorefRules.entitySubjectObject(m, candidate)) features.incrementCount("B-SUBJ-OBJ");
        for(Mention a : aC.corefMentions) {
          if(CorefRules.entitySubjectObject(m, a)) features.incrementCount("B-CLUSTER-SUBJ-OBJ");
        }

        if(CorefRules.entityPersonDisagree(document, m, candidate, dict)
            && CorefRules.entitySameSpeaker(document, m, candidate)) features.incrementCount("B-PERSON-DISAGREE-SAME-SPEAKER");

        if(CorefRules.entityIWithinI(mC, aC, dict)) features.incrementCount("B-ENTITY-IWITHINI");
        if(CorefRules.antecedentMatchesMentionSpeakerAnnotation(m, candidate, document)) features.incrementCount("B-ANT-IS-SPEAKER-OF-MENTION");

        Set<MentionType> mType = HybridCorefProperties.getMentionType(props, sievename);
        if(mType.contains(MentionType.PROPER) || mType.contains(MentionType.NOMINAL)) {
          if(m.headString.equals(candidate.headString)) features.incrementCount("B-HEADMATCH");
          if(CorefRules.entityHeadsAgree(mC, aC, m, candidate, dict)) features.incrementCount("B-HEADSAGREE");
          if(CorefRules.entityExactStringMatch(mC, aC, dict, document.roleSet)) features.incrementCount("B-EXACTSTRINGMATCH");
          if(CorefRules.entityHaveExtraProperNoun(m, candidate, new HashSet<>())) features.incrementCount("B-HAVE-EXTRA-PROPER-NOUN");
          if(CorefRules.entityBothHaveProper(mC, aC)) features.incrementCount("B-BOTH-HAVE-PROPER");
          if(CorefRules.entityHaveDifferentLocation(m, candidate, dict)) features.incrementCount("B-HAVE-DIFF-LOC");
          if(CorefRules.entityHaveIncompatibleModifier(mC, aC)) features.incrementCount("B-HAVE-INCOMPATIBLE-MODIFIER");
          if(CorefRules.entityIsAcronym(document, mC, aC)) features.incrementCount("B-IS-ACRONYM");
          if(CorefRules.entityIsApposition(mC, aC, m, candidate)) features.incrementCount("B-IS-APPOSITION");
          if(CorefRules.entityIsPredicateNominatives(mC, aC, m, candidate)) features.incrementCount("B-IS-PREDICATE-NOMINATIVES");
          if(CorefRules.entityIsRoleAppositive(mC, aC, m, candidate, dict)) features.incrementCount("B-IS-ROLE-APPOSITIVE");
          if(CorefRules.entityNumberInLaterMention(m, candidate)) features.incrementCount("B-NUMBER-IN-LATER");
          if(CorefRules.entityRelaxedExactStringMatch(mC, aC, m, candidate, dict, document.roleSet)) features.incrementCount("B-RELAXED-EXACT-STRING-MATCH");
          if(CorefRules.entityRelaxedHeadsAgreeBetweenMentions(mC, aC, m, candidate)) features.incrementCount("B-RELAXED-HEAD-AGREE");
          if(CorefRules.entitySameProperHeadLastWord(m, candidate)) features.incrementCount("B-SAME-PROPER-HEAD");
          if(CorefRules.entitySameProperHeadLastWord(mC, aC, m, candidate)) features.incrementCount("B-CLUSTER-SAME-PROPER-HEAD");
          if(CorefRules.entityWordsIncluded(mC, aC, m, candidate)) features.incrementCount("B-WORD-INCLUSION");
        }
        if(mType.contains(MentionType.LIST)) {
          features.incrementCount("NUM-LIST-", numEntitiesInList(m));
          if(m.spanToString().contains("two") || m.spanToString().contains("2") || m.spanToString().contains("both")) features.incrementCount("LIST-M-TWO");
          if(m.spanToString().contains("three") || m.spanToString().contains("3")) features.incrementCount("LIST-M-THREE");
          if(candidate.spanToString().contains("two")
              || candidate.spanToString().contains("2")
              || candidate.spanToString().contains("both")) {
            features.incrementCount("B-LIST-A-TWO");
          }
          if(candidate.spanToString().contains("three")
              || candidate.spanToString().contains("3")) {
            features.incrementCount("B-LIST-A-THREE");
          }
        }

        if(mType.contains(MentionType.PRONOMINAL)) {
          if(dict.firstPersonPronouns.contains(m.headString)) features.incrementCount("B-M-I");
          if(dict.secondPersonPronouns.contains(m.headString)) features.incrementCount("B-M-YOU");
          if(dict.thirdPersonPronouns.contains(m.headString)) features.incrementCount("B-M-3RDPERSON");
          if(dict.possessivePronouns.contains(m.headString)) features.incrementCount("B-M-POSSESSIVE");
          if(dict.neutralPronouns.contains(m.headString)) features.incrementCount("B-M-NEUTRAL");
          if(dict.malePronouns.contains(m.headString)) features.incrementCount("B-M-MALE");
          if(dict.femalePronouns.contains(m.headString)) features.incrementCount("B-M-FEMALE");

          if(dict.firstPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-I");
          if(dict.secondPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-YOU");
          if(dict.thirdPersonPronouns.contains(candidate.headString)) features.incrementCount("B-A-3RDPERSON");
          if(dict.possessivePronouns.contains(candidate.headString)) features.incrementCount("B-A-POSSESSIVE");
          if(dict.neutralPronouns.contains(candidate.headString)) features.incrementCount("B-A-NEUTRAL");
          if(dict.malePronouns.contains(candidate.headString)) features.incrementCount("B-A-MALE");
          if(dict.femalePronouns.contains(candidate.headString)) features.incrementCount("B-A-FEMALE");

          features.incrementCount("B-M-GENERIC-"+m.generic);
          features.incrementCount("B-A-GENERIC-"+candidate.generic);

          if(HybridCorefPrinter.dcorefPronounSieve.skipThisMention(document, m, mC, dict)) {
            features.incrementCount("B-SKIPTHISMENTION-true");
          }

          if(m.spanToString().equalsIgnoreCase("you") && mFollowing!=null && mFollowing.word().equalsIgnoreCase("know")) {
            features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((mPreceding==null)? "NULL" : mPreceding.tag()) );
            features.incrementCount("B-YOUKNOW-PRECEDING-WORD-"+ ((mPreceding==null)? "NULL" : mPreceding.word().toLowerCase()) );
            CoreLabel nextword = (m.endIndex+1 < m.sentenceWords.size())? m.sentenceWords.get(m.endIndex+1) : null;
            features.incrementCount("B-YOUKNOW-FOLLOWING-POS-" + ((nextword==null)? "NULL" : nextword.tag()) );
            features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-"+ ((nextword==null)? "NULL" : nextword.word().toLowerCase()) );
          }
          if(candidate.spanToString().equalsIgnoreCase("you") && aFollowing!=null && aFollowing.word().equalsIgnoreCase("know")) {
            features.incrementCount("B-YOUKNOW-PRECEDING-POS-" + ((aPreceding==null)? "NULL" : aPreceding.tag()) );
            features.incrementCount("B-YOUKNOW-PRECEDING-WORD-"+ ((aPreceding==null)? "NULL" : aPreceding.word().toLowerCase()) );
            CoreLabel nextword = (candidate.endIndex+1 < candidate.sentenceWords.size())? candidate.sentenceWords.get(candidate.endIndex+1) : null;
            features.incrementCount("B-YOUKNOW-FOLLOWING-POS-" + ((nextword==null)? "NULL" : nextword.tag()) );
            features.incrementCount("B-YOUKNOW-FOLLOWING-WORD-"+ ((nextword==null)? "NULL" : nextword.word().toLowerCase()) );
          }
        }

        // discourse match features
        if(m.person==Person.YOU && document.docType==DocType.ARTICLE && m.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0")) {
          features.incrementCount("B-DISCOURSE-M-YOU-GENERIC?");
        }
        if(candidate.generic && candidate.person==Person.YOU) features.incrementCount("B-DISCOURSE-A-YOU-GENERIC?");

        String mString = m.lowercaseNormalizedSpanString();
        String antString = candidate.lowercaseNormalizedSpanString();

        // I-I
        if(m.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString)
            && candidate.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString)
            && CorefRules.entitySameSpeaker(document, m, candidate)) {
          features.incrementCount("B-DISCOURSE-I-I-SAMESPEAKER");
        }

        // (speaker - I)
        if ((m.number==Number.SINGULAR && dict.firstPersonPronouns.contains(mString))
                && CorefRules.antecedentIsMentionSpeaker(document, m, candidate, dict)) {
          features.incrementCount("B-DISCOURSE-SPEAKER-I");
        }

        // (I - speaker)
        if ((candidate.number==Number.SINGULAR && dict.firstPersonPronouns.contains(antString))
                && CorefRules.antecedentIsMentionSpeaker(document, candidate, m, dict)) {
          features.incrementCount("B-DISCOURSE-I-SPEAKER");
        }
        // Can be iffy if more than two speakers... but still should be okay most of the time
        if (dict.secondPersonPronouns.contains(mString)
            && dict.secondPersonPronouns.contains(antString)
            && CorefRules.entitySameSpeaker(document, m, candidate)) {
          features.incrementCount("B-DISCOURSE-BOTH-YOU");
        }
        // previous I - you or previous you - I in two person conversation
        if (((m.person==Person.I && candidate.person==Person.YOU
            || (m.person==Person.YOU && candidate.person==Person.I))
            && (m.headWord.get(CoreAnnotations.UtteranceAnnotation.class)-candidate.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1)
            && document.docType==DocType.CONVERSATION)) {
          features.incrementCount("B-DISCOURSE-I-YOU");
        }
        if (dict.reflexivePronouns.contains(m.headString) && CorefRules.entitySubjectObject(m, candidate)){
          features.incrementCount("B-DISCOURSE-REFLEXIVE");
        }
        if(m.person==Person.I && candidate.person==Person.I && !CorefRules.entitySameSpeaker(document, m, candidate)) {
          features.incrementCount("B-DISCOURSE-I-I-DIFFSPEAKER");
        }
        if(m.person==Person.YOU && candidate.person==Person.YOU && !CorefRules.entitySameSpeaker(document, m, candidate)) {
          features.incrementCount("B-DISCOURSE-YOU-YOU-DIFFSPEAKER");
        }
        if(m.person==Person.WE && candidate.person==Person.WE && !CorefRules.entitySameSpeaker(document, m, candidate)) {
          features.incrementCount("B-DISCOURSE-WE-WE-DIFFSPEAKER");
        }
      }

      ////////////////////////////////////////////////////////////////////////////////
      ///////    POS features                                             ////////////
      ////////////////////////////////////////////////////////////////////////////////
      if(HybridCorefProperties.usePOSFeatures(props, sievename)) {
        features.incrementCount("B-LEXICAL-M-HEADPOS-"+m.headWord.tag());
        features.incrementCount("B-LEXICAL-A-HEADPOS-"+candidate.headWord.tag());
        features.incrementCount("B-LEXICAL-M-FIRSTPOS-"+mFirst.tag());
        features.incrementCount("B-LEXICAL-A-FIRSTPOS-"+aFirst.tag());
        features.incrementCount("B-LEXICAL-M-LASTPOS-"+mLast.tag());
        features.incrementCount("B-LEXICAL-A-LASTPOS-"+aLast.tag());

        features.incrementCount("B-LEXICAL-M-PRECEDINGPOS-"+ ((mPreceding==null)? "NULL" : mPreceding.tag()) );
        features.incrementCount("B-LEXICAL-A-PRECEDINGPOS-"+ ((aPreceding==null)? "NULL" : aPreceding.tag()) );
        features.incrementCount("B-LEXICAL-M-FOLLOWINGPOS-"+ ((mFollowing==null)? "NULL" : mFollowing.tag()) );
        features.incrementCount("B-LEXICAL-A-FOLLOWINGPOS-"+ ((aFollowing==null)? "NULL" : aFollowing.tag()) );
      }

      ////////////////////////////////////////////////////////////////////////////////
      ///////    lexical features                                         ////////////
      ////////////////////////////////////////////////////////////////////////////////
      if(HybridCorefProperties.useLexicalFeatures(props, sievename)) {

        features.incrementCount("B-LEXICAL-M-HEADWORD-"+m.headString.toLowerCase());
        features.incrementCount("B-LEXICAL-A-HEADWORD-"+candidate.headString.toLowerCase());
        features.incrementCount("B-LEXICAL-M-FIRSTWORD-"+mFirst.word().toLowerCase());
        features.incrementCount("B-LEXICAL-A-FIRSTWORD-"+aFirst.word().toLowerCase());
        features.incrementCount("B-LEXICAL-M-LASTWORD-"+mLast.word().toLowerCase());
        features.incrementCount("B-LEXICAL-A-LASTWORD-"+aLast.word().toLowerCase());

        features.incrementCount("B-LEXICAL-M-PRECEDINGWORD-"+ ((mPreceding==null)? "NULL" : mPreceding.word().toLowerCase()) );
        features.incrementCount("B-LEXICAL-A-PRECEDINGWORD-"+ ((aPreceding==null)? "NULL" : aPreceding.word().toLowerCase()) );
        features.incrementCount("B-LEXICAL-M-FOLLOWINGWORD-"+ ((mFollowing==null)? "NULL" : mFollowing.word().toLowerCase()) );
        features.incrementCount("B-LEXICAL-A-FOLLOWINGWORD-"+ ((aFollowing==null)? "NULL" : aFollowing.word().toLowerCase()) );

        //extra headword, modifiers lexical features
        for(String mHead : mC.heads) {
          if(!aC.heads.contains(mHead)) features.incrementCount("B-LEXICAL-MC-EXTRAHEAD-"+mHead);
        }
        for(String mWord : mC.words) {
          if(!aC.words.contains(mWord)) features.incrementCount("B-LEXICAL-MC-EXTRAWORD-"+mWord);
        }
      }

      ////////////////////////////////////////////////////////////////////////////////
      ///////    word vector features                                     ////////////
      ////////////////////////////////////////////////////////////////////////////////

      // cosine
      if(HybridCorefProperties.useWordEmbedding(props, sievename)) {
        // dimension
        int dim = dict.vectors.entrySet().iterator().next().getValue().length;

        // distance between headword
        float[] mV = dict.vectors.get(m.headString.toLowerCase());
        float[] aV = dict.vectors.get(candidate.headString.toLowerCase());
        if(mV!=null && aV!=null) {
          features.incrementCount("WORDVECTOR-DIFF-HEADWORD", cosine(mV, aV));
        }

        mV = dict.vectors.get(mFirst.word().toLowerCase());
        aV = dict.vectors.get(aFirst.word().toLowerCase());
        if(mV!=null && aV!=null) {
          features.incrementCount("WORDVECTOR-DIFF-FIRSTWORD", cosine(mV, aV));
        }

        mV = dict.vectors.get(mLast.word().toLowerCase());
        aV = dict.vectors.get(aLast.word().toLowerCase());
        if(mV!=null && aV!=null) {
          features.incrementCount("WORDVECTOR-DIFF-LASTWORD", cosine(mV, aV));
        }

        if(mPreceding!=null && aPreceding!=null) {
          mV = dict.vectors.get(mPreceding.word().toLowerCase());
          aV = dict.vectors.get(aPreceding.word().toLowerCase());
          if(mV!=null && aV!=null) {
            features.incrementCount("WORDVECTOR-DIFF-PRECEDINGWORD", cosine(mV, aV));
          }
        }
        if(mFollowing!=null && aFollowing!=null) {
          mV = dict.vectors.get(mFollowing.word().toLowerCase());
          aV = dict.vectors.get(aFollowing.word().toLowerCase());
          if(mV!=null && aV!=null) {
            features.incrementCount("WORDVECTOR-DIFF-FOLLOWINGWORD", cosine(mV, aV));
          }
        }

        float[] aggreM = new float[dim];
        float[] aggreA = new float[dim];

        for(CoreLabel cl : m.originalSpan) {
          float[] v = dict.vectors.get(cl.word().toLowerCase());
          if(v==null) continue;
          ArrayMath.pairwiseAddInPlace(aggreM, v);
        }
        for(CoreLabel cl : candidate.originalSpan) {
          float[] v = dict.vectors.get(cl.word().toLowerCase());
          if(v==null) continue;
          ArrayMath.pairwiseAddInPlace(aggreA, v);
        }
        if(ArrayMath.L2Norm(aggreM)!=0 && ArrayMath.L2Norm(aggreA)!=0) {
          features.incrementCount("WORDVECTOR-AGGREGATE-DIFF", cosine(aggreM, aggreA));
        }

        int cnt = 0;
        double dist = 0;
        for(CoreLabel mcl : m.originalSpan) {
          for(CoreLabel acl : candidate.originalSpan) {
            mV = dict.vectors.get(mcl.word().toLowerCase());
            aV = dict.vectors.get(acl.word().toLowerCase());
            if(mV==null || aV==null) continue;
            cnt++;
            dist += cosine(mV, aV);
          }
        }
        features.incrementCount("WORDVECTOR-AVG-DIFF", dist/cnt);
      }

      return new RVFDatum<>(features, label);
    } catch (Exception e) {
      log.info("Datum Extraction failed in Sieve.java while processing document: "+document.docInfo.get("DOC_ID")+" part: "+document.docInfo.get("DOC_PART"));
      throw new RuntimeException(e);
    }
  }

  /**
   * Cosine similarity of two vectors, computed as their plain inner product.
   * No normalization is performed here, so the value is a true cosine only
   * when both inputs already have unit length.
   *
   * @param normalizedVector1 first vector, assumed to be pre-normalized
   * @param normalizedVector2 second vector, assumed to be pre-normalized
   * @return the inner product of the two vectors
   */
  private static double cosine(float[] normalizedVector1, float[] normalizedVector2) {
    return ArrayMath.innerProduct(normalizedVector1, normalizedVector2);
  }
  /**
   * Counts list separators inside a mention's original span.
   *
   * <p>Every token equal to {@code ","} adds one to the count, and every
   * {@code "and"} / {@code "or"} token (matched case-insensitively) adds one
   * unless the token right before it is a comma. Scanning starts at the
   * second token, so the first token of the span is only ever used as the
   * "previous token" for the lookback and is never counted itself.
   *
   * @param m the mention whose {@code originalSpan} tokens are scanned
   * @return the number of separators found; 0 for spans of fewer than two tokens
   */
  public static int numEntitiesInList(Mention m) {
    List<CoreLabel> span = m.originalSpan;
    int separators = 0;

    for (int idx = 1; idx < span.size(); idx++) {
      String current = span.get(idx).word();

      if (current.equals(",")) {
        separators++;
      }

      boolean isConjunction = current.equalsIgnoreCase("and") || current.equalsIgnoreCase("or");
      // A conjunction right after a comma ("a, b, and c") was already accounted for by that comma.
      if (isConjunction && !span.get(idx - 1).word().equals(",")) {
        separators++;
      }
    }

    return separators;
  }
}