FeatureExtractor.java example

Explorer
Stanford-NLP-master
- CoreNLP-master
package edu.stanford.nlp.coref.statistical;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;

import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.CorefRules;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Dictionaries.Number;
import edu.stanford.nlp.coref.data.Dictionaries.Person;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Document.DocType;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

/**
 * A class for featurizing mention pairs and individual mentions.
 *
 * @author Kevin Clark
 */
public class FeatureExtractor {

  private static final int MIN_WORD_COUNT = 20;
  private static final int BIN_EXACT = 10;
  private static final double BIN_EXPONENT = 1.5;
  private static final Map<Integer, String> SINGLETON_FEATURES = new HashMap<>();
  static {
    SINGLETON_FEATURES.put(2, "animacy");
    SINGLETON_FEATURES.put(3, "person-coarse");
    SINGLETON_FEATURES.put(4, "number");
    SINGLETON_FEATURES.put(5, "position");
    SINGLETON_FEATURES.put(6, "relation");
    SINGLETON_FEATURES.put(7, "quantification");
    SINGLETON_FEATURES.put(8, "modifiers");
    SINGLETON_FEATURES.put(9, "negation");
    SINGLETON_FEATURES.put(10, "modal");
    SINGLETON_FEATURES.put(11, "attitude");
    SINGLETON_FEATURES.put(12, "coordination");
  }

  private final Dictionaries dictionaries;
  private final Set<String> vocabulary;
  private final Compressor<String> compressor;
  private final boolean useConstituencyParse;
  private final boolean useDocSource;

  public FeatureExtractor(Properties props, Dictionaries dictionaries,
      Compressor<String> compressor) {
    this(props, dictionaries, compressor, StatisticalCorefTrainer.wordCountsFile);
  }

  public FeatureExtractor(Properties props, Dictionaries dictionaries,
      Compressor<String> compressor, String wordCountsPath) {
    this(props, dictionaries, compressor, loadVocabulary(wordCountsPath));
  }

  public FeatureExtractor(Properties props, Dictionaries dictionaries,
      Compressor<String> compressor, Set<String> vocabulary) {
    this.dictionaries = dictionaries;
    this.compressor = compressor;
    this.vocabulary = vocabulary;
    this.useDocSource = CorefProperties.conll(props);
    this.useConstituencyParse = CorefProperties.useConstituencyParse(props);
  }

  private static Set<String> loadVocabulary(String wordCountsPath) {
    Set<String> vocabulary = new HashSet<>();
    try {
      Counter<String> counts = IOUtils.readObjectFromURLOrClasspathOrFileSystem(wordCountsPath);
      for (Map.Entry<String, Double> e : counts.entrySet()) {
        if (e.getValue() > MIN_WORD_COUNT) {
          vocabulary.add(e.getKey());
        }
      }
    } catch (Exception e) {
      throw new RuntimeException("Error loading word counts", e);
    }
    return vocabulary;
  }

  public DocumentExamples extract(int id, Document document,
      Map<Pair<Integer, Integer>, Boolean> labeledPairs) {
    return extract(id, document, labeledPairs, compressor);
  }

  public DocumentExamples extract(int id, Document document,
      Map<Pair<Integer, Integer>, Boolean> labeledPairs, Compressor<String> compressor) {
    List<Mention> mentionsList = CorefUtils.getSortedMentions(document);
    Map<Integer, List<Mention>> mentionsByHeadIndex = new HashMap<>();
    for (Mention m : mentionsList) {
      List<Mention> withIndex = mentionsByHeadIndex.get(m.headIndex);
      if (withIndex == null) {
        withIndex = new ArrayList<>();
        mentionsByHeadIndex.put(m.headIndex, withIndex);
      }
      withIndex.add(m);
    }

    Map<Integer, Mention> mentions = document.predictedMentionsByID;
    List<Example> examples = new ArrayList<>();
    Set<Integer> mentionsToExtract = new HashSet<>();
    for (Map.Entry<Pair<Integer, Integer>, Boolean> pair : labeledPairs.entrySet()) {
        Mention m1 = mentions.get(pair.getKey().first);
        Mention m2 = mentions.get(pair.getKey().second);
        mentionsToExtract.add(m1.mentionID);
        mentionsToExtract.add(m2.mentionID);
        CompressedFeatureVector features =
            compressor.compress(getFeatures(document, m1, m2));
        examples.add(new Example(id, m1, m2, pair.getValue() ? 1.0 : 0.0, features));
    }

    Map<Integer, CompressedFeatureVector> mentionFeatures = new HashMap<>();
    for (int mentionID : mentionsToExtract) {
        mentionFeatures.put(mentionID, compressor.compress(getFeatures(document,
            document.predictedMentionsByID.get(mentionID), mentionsByHeadIndex)));
    }

    return new DocumentExamples(id, examples, mentionFeatures);
  }

  private Counter<String> getFeatures(Document doc, Mention m,
      Map<Integer, List<Mention>> mentionsByHeadIndex) {
    Counter<String> features = new ClassicCounter<>();

    // type features
    features.incrementCount("mention-type=" + m.mentionType);
    features.incrementCount("gender=" + m.gender);
    features.incrementCount("person-fine=" + m.person);
    features.incrementCount("head-ne-type=" + m.nerString);
    List<String> singletonFeatures = m.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
      if (e.getKey() < singletonFeatures.size()) {
        features.incrementCount(e.getValue() + "=" + singletonFeatures.get(e.getKey()));
      }
    }

    // length and location features
    addNumeric(features, "mention-length", m.spanToString().length());
    addNumeric(features, "mention-words", m.originalSpan.size());
    addNumeric(features, "sentence-words", m.sentenceWords.size());
    features.incrementCount("sentence-words=" + bin(m.sentenceWords.size()));
    features.incrementCount("mention-position", m.mentionNum
        / (double) doc.predictedMentions.size());
    features.incrementCount("sentence-position", m.sentNum / (double) doc.numSentences);

    // lexical features
    CoreLabel firstWord = firstWord(m);
    CoreLabel lastWord = lastWord(m);
    CoreLabel headWord = headWord(m);
    CoreLabel prevWord = prevWord(m);
    CoreLabel nextWord = nextWord(m);
    CoreLabel prevprevWord = prevprevWord(m);
    CoreLabel nextnextWord = nextnextWord(m);
    String headPOS = getPOS(headWord);
    String firstPOS = getPOS(firstWord);
    String lastPOS = getPOS(lastWord);
    String prevPOS = getPOS(prevWord);
    String nextPOS = getPOS(nextWord);
    String prevprevPOS = getPOS(prevprevWord);
    String nextnextPOS = getPOS(nextnextWord);
    features.incrementCount("first-word=" + wordIndicator(firstWord, firstPOS));
    features.incrementCount("last-word=" + wordIndicator(lastWord, lastPOS));
    features.incrementCount("head-word=" + wordIndicator(headWord, headPOS));
    features.incrementCount("next-word=" + wordIndicator(nextWord, nextPOS));
    features.incrementCount("prev-word=" + wordIndicator(prevWord, prevPOS));
    features.incrementCount("next-bigram="
        + wordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
    features.incrementCount("prev-bigram="
        + wordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
    features.incrementCount("next-pos=" + nextPOS);
    features.incrementCount("prev-pos=" + prevPOS);
    features.incrementCount("first-pos=" + firstPOS);
    features.incrementCount("last-pos=" + lastPOS);
    features.incrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
    features.incrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
    addDependencyFeatures(features, "parent", getDependencyParent(m), true);
    addFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
    addFeature(features, "is-generic", m.originalSpan.size() == 1 && firstPOS.equals("NNS"));

    // syntax features
    IndexedWord w = m.headIndexedWord;
    String depPath = "";
    int depth = 0;
    while (w != null) {
      SemanticGraphEdge e = getDependencyParent(m, w);
      depth++;
      if (depth <= 3 && e != null) {
        depPath += (depPath.isEmpty() ? "" : "_") + e.getRelation().toString();
        features.incrementCount("dep-path=" + depPath);
        w = e.getSource();
      } else {
         w = null;
      }
    }
    if (useConstituencyParse) {
      int fullEmbeddingLevel = headEmbeddingLevel(m.contextParseTree, m.headIndex);
      int mentionEmbeddingLevel = headEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
      if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1) {
        features.incrementCount("mention-embedding-level="
            + bin(fullEmbeddingLevel - mentionEmbeddingLevel));
        features.incrementCount("head-embedding-level="
            + bin(mentionEmbeddingLevel));
      } else {
        features.incrementCount("undetermined-embedding-level");
      }
      features.incrementCount("num-embedded-nps=" + bin(numEmbeddedNps(m.mentionSubTree)));
      String syntaxPath = "";
      Tree tree = m.contextParseTree;
      Tree head = tree.getLeaves().get(m.headIndex).ancestor(1, tree);
      depth = 0;
      for (Tree node : tree.pathNodeToNode(head, tree)) {
        syntaxPath += node.value() + "-";
        features.incrementCount("syntax-path=" + syntaxPath);
        depth++;
        if (depth >= 4 || node.value().equals("S")) {
          break;
        }
      }
    }

    // mention containment features
    addFeature(features, "contained-in-other-mention",
        mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m.insideIn(m2)));
    addFeature(features, "contains-other-mention",
        mentionsByHeadIndex.get(m.headIndex).stream().anyMatch(m2 -> m != m2 && m2.insideIn(m)));

    // features from dcoref rules
    addFeature(features, "bare-plural", m.originalSpan.size() == 1 && headPOS.equals("NNS"));
    addFeature(features, "quantifier-start", dictionaries.quantifiers.contains(firstWord.word().toLowerCase()));
    addFeature(features, "negative-start", firstWord.word().toLowerCase().matches("none|no|nothing|not"));
    addFeature(features, "partitive", RuleBasedCorefMentionFinder.partitiveRule(m, m.sentenceWords, dictionaries));
    addFeature(features, "adjectival-demonym", dictionaries.isAdjectivalDemonym(m.spanToString()));
    if(doc.docType != DocType.ARTICLE && m.person == Person.YOU
        && nextWord != null && nextWord.word().equalsIgnoreCase("know")) {
           features.incrementCount("generic-you");
    }

    return features;
  }

  private Counter<String> getFeatures(Document doc, Mention m1, Mention m2) {
    assert(m1.appearEarlierThan(m2));
    Counter<String> features = new ClassicCounter<>();

    // global features
    features.incrementCount("bias");
    if (useDocSource) {
      features.incrementCount("doc-type=" + doc.docType);
      if(doc.docInfo != null && doc.docInfo.containsKey("DOC_ID")) {
        features.incrementCount("doc-source=" + doc.docInfo.get("DOC_ID").split("/")[1]);
      }
    }

    // singleton feature conjunctions
    List<String> singletonFeatures1 = m1.getSingletonFeatures(dictionaries);
    List<String> singletonFeatures2 = m2.getSingletonFeatures(dictionaries);
    for (Map.Entry<Integer, String> e : SINGLETON_FEATURES.entrySet()) {
      if (e.getKey() < singletonFeatures1.size() && e.getKey() < singletonFeatures2.size()) {
        features.incrementCount(e.getValue() + "=" + singletonFeatures1.get(e.getKey()) + "_" +
                singletonFeatures2.get(e.getKey()));
      }
    }
    SemanticGraphEdge p1 = getDependencyParent(m1);
    SemanticGraphEdge p2 = getDependencyParent(m2);
    features.incrementCount("dep-relations=" + (p1 == null ? "null" : p1.getRelation()) + "_"
        + (p2 == null ? "null" : p2.getRelation()));
    features.incrementCount("roles=" + getRole(m1) + "_" + getRole(m2));
    CoreLabel headCL1 = headWord(m1);
    CoreLabel headCL2 = headWord(m2);
    String headPOS1 = getPOS(headCL1);
    String headPOS2 = getPOS(headCL2);
    features.incrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
    features.incrementCount("head-words=" + wordIndicator("h_" + headCL1.word().toLowerCase()
            + "_" + headCL2.word().toLowerCase(), headPOS1 + "_" + headPOS2));

    // agreement features
    addFeature(features, "animacies-agree", m2.animaciesAgree(m1));
    addFeature(features, "attributes-agree", m2.attributesAgree(m1, dictionaries));
    addFeature(features, "entity-types-agree", m2.entityTypesAgree(m1, dictionaries));
    addFeature(features, "numbers-agree", m2.numbersAgree(m1));
    addFeature(features, "genders-agree", m2.gendersAgree(m1));
    addFeature(features, "ner-strings-equal", m1.nerString.equals(m2.nerString));

    // string matching features
    addFeature(features, "antecedent-head-in-anaphor", headContainedIn(m1, m2));
    addFeature(features, "anaphor-head-in-antecedent", headContainedIn(m2, m1));
    if (m1.mentionType != MentionType.PRONOMINAL && m2.mentionType != MentionType.PRONOMINAL) {
      addFeature(features, "antecedent-in-anaphor",
          m2.spanToString().toLowerCase().contains(m1.spanToString().toLowerCase()));
      addFeature(features, "anaphor-in-antecedent",
          m1.spanToString().toLowerCase().contains(m2.spanToString().toLowerCase()));
      addFeature(features, "heads-equal", m1.headString.equalsIgnoreCase(m2.headString));
      addFeature(features, "heads-agree", m2.headsAgree(m1));
      addFeature(features, "exact-match", m1.toString().trim().toLowerCase().equals(
          m2.toString().trim().toLowerCase()));
      addFeature(features, "partial-match", relaxedStringMatch(m1, m2));

      double editDistance = StringUtils.editDistance(m1.spanToString(), m2.spanToString()) /
         (double) (m1.spanToString().length() + m2.spanToString().length());
      features.incrementCount("edit-distance", editDistance);
      features.incrementCount("edit-distance=" + ((int)(editDistance * 10) / 10.0));

      double headEditDistance = StringUtils.editDistance(m1.headString, m2.headString) /
          (double) (m1.headString.length() + m2.headString.length());
      features.incrementCount("head-edit-distance", headEditDistance);
      features.incrementCount("head-edit-distance=" + ((int)(headEditDistance * 10) / 10.0));
    }

    // distance features
    addNumeric(features, "mention-distance", m2.mentionNum -  m1.mentionNum);
    addNumeric(features, "sentence-distance", m2.sentNum -  m1.sentNum);
    if (m2.sentNum == m1.sentNum) {
        addNumeric(features, "word-distance", m2.startIndex -  m1.endIndex);
        if (m1.endIndex > m2.startIndex) {
            features.incrementCount("spans-intersect");
        }
    }

    // setup for dcoref features
    Set<Mention> ms1 = new HashSet<>();
    ms1.add(m1);
    Set<Mention> ms2 = new HashSet<>();
    ms2.add(m2);
    Random r = new Random();
    CorefCluster c1 = new CorefCluster(20000 + r.nextInt(10000), ms1);
    CorefCluster c2 = new CorefCluster(10000 + r.nextInt(10000), ms2);
    String s2 = m2.lowercaseNormalizedSpanString();
    String s1 = m1.lowercaseNormalizedSpanString();

    // discourse dcoref features
    addFeature(features, "mention-speaker-PER0",
        m2.headWord.get(SpeakerAnnotation.class).equalsIgnoreCase("PER0"));
    addFeature(features, "antecedent-is-anaphor-speaker", CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    addFeature(features, "same-speaker", CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "person-disagree-same-speaker", CorefRules.entityPersonDisagree(doc, m2, m1, dictionaries)
        && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "antecedent-matches-anaphor-speaker",
        CorefRules.antecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
    addFeature(features, "discourse-you-PER0", m2.person == Person.YOU
        && doc.docType == DocType.ARTICLE
        && m2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0"));
    addFeature(features, "speaker-match-i-i", m2.number == Number.SINGULAR
        && dictionaries.firstPersonPronouns.contains(s1)
        && m1.number == Number.SINGULAR
        && dictionaries.firstPersonPronouns.contains(s2)
        && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "speaker-match-speaker-i", m2.number == Number.SINGULAR
        && dictionaries.firstPersonPronouns.contains(s2)
        && CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
    addFeature(features, "speaker-match-i-speaker", m1.number == Number.SINGULAR
        && dictionaries.firstPersonPronouns.contains(s1)
        && CorefRules.antecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
    addFeature(features, "speaker-match-you-you",
        dictionaries.secondPersonPronouns.contains(s1)
        && dictionaries.secondPersonPronouns.contains(s2)
        && CorefRules.entitySameSpeaker(doc, m2, m1));
    addFeature(features, "discourse-between-two-person", ((m2.person == Person.I
        && m1.person == Person.YOU
        || (m2.person == Person.YOU && m1.person == Person.I))
        && (m2.headWord.get(CoreAnnotations.UtteranceAnnotation.class)
                - m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class) == 1)
        && doc.docType == DocType.CONVERSATION));
    addFeature(features, "incompatible-not-match", m1.person != Person.I
        && m2.person != Person.I
        && (CorefRules.antecedentIsMentionSpeaker(doc, m1, m2, dictionaries)
                || CorefRules.antecedentIsMentionSpeaker(doc, m2, m1, dictionaries)));
    int utteranceDist = Math.abs(m1.headWord.get(CoreAnnotations.UtteranceAnnotation.class) -
        m2.headWord.get(CoreAnnotations.UtteranceAnnotation.class));
    if(doc.docType != DocType.ARTICLE && utteranceDist == 1
          && !CorefRules.entitySameSpeaker(doc, m2, m1)) {
      addFeature(features, "speaker-mismatch-i-i", m1.person == Person.I
          && m2.person == Person.I);
      addFeature(features, "speaker-mismatch-you-you", m1.person == Person.YOU
          && m2.person == Person.YOU);
      addFeature(features, "speaker-mismatch-we-we", m1.person == Person.WE
          && m2.person == Person.WE);
    }

    // other dcoref features
    String firstWord1 = firstWord(m1).word().toLowerCase();
    addFeature(features, "indefinite-article-np", (m1.appositions == null &&
        m1.predicateNominatives == null &&
        (firstWord1.equals("a") || firstWord1.equals("an"))));
    addFeature(features, "far-this", m2.lowercaseNormalizedSpanString().equals("this") &&
        Math.abs(m2.sentNum - m1.sentNum) > 3);
    addFeature(features, "per0-you-in-article", m2.person == Person.YOU &&
        doc.docType == DocType.ARTICLE &&
        m2.headWord.get(CoreAnnotations.SpeakerAnnotation.class).equals("PER0"));
    addFeature(features, "inside-in", m2.insideIn(m1) || m1.insideIn(m2));
    addFeature(features, "indefinite-determiners",
        dictionaries.indefinitePronouns.contains(m1.originalSpan.get(0).lemma())
            || dictionaries.indefinitePronouns.contains(m2.originalSpan.get(0).lemma()));

    addFeature(features, "entity-attributes-agree", CorefRules.entityAttributesAgree(c2, c1));
    addFeature(features, "entity-token-distance", CorefRules.entityTokenDistance(m2, m1));
    addFeature(features, "i-within-i", CorefRules.entityIWithinI(m2, m1, dictionaries));
    addFeature(features, "exact-string-match", CorefRules.entityExactStringMatch(c2, c1,dictionaries, doc.roleSet));
    addFeature(features, "entity-relaxed-heads-agree",
        CorefRules.entityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
    addFeature(features, "is-acronym", CorefRules.entityIsAcronym(doc, c2, c1));
    addFeature(features, "demonym", m2.isDemonym(m1, dictionaries));
    addFeature(features, "incompatible-modifier", CorefRules.entityHaveIncompatibleModifier(m2, m1));
    addFeature(features, "head-lemma-match", m1.headWord.lemma().equals(m2.headWord.lemma()));
    addFeature(features, "words-included", CorefRules.entityWordsIncluded(c2, c1, m2, m1));
    addFeature(features, "extra-proper-noun", CorefRules.entityHaveExtraProperNoun(m2, m1, new HashSet<>()));
    addFeature(features, "number-in-later-mentions", CorefRules.entityNumberInLaterMention(m2, m1));
    addFeature(features, "sentence-context-incompatible",
        CorefRules.sentenceContextIncompatible(m2, m1, dictionaries));

    // syntax features
    if (useConstituencyParse) {
      if (m1.sentNum == m2.sentNum) {
        int clauseCount = 0;
        Tree tree = m2.contextParseTree;
        Tree current = m2.mentionSubTree;
        while (true) {
          current = current.ancestor(1, tree);
          if (current.label().value().startsWith("S")) {
            clauseCount++;
          }
          if (current.dominates(m1.mentionSubTree)) {
            break;
          }
          if (current.label().value().equals("ROOT") || current.ancestor(1, tree) == null) {
            break;
          }
        }
        features.incrementCount("clause-count", clauseCount);
        features.incrementCount("clause-count=" + bin(clauseCount));
      }
      if (RuleBasedCorefMentionFinder.isPleonastic(m2, m2.contextParseTree)
          || RuleBasedCorefMentionFinder.isPleonastic(m1, m1.contextParseTree)) {
        features.incrementCount("pleonastic-it");
      }
      if (maximalNp(m1.mentionSubTree) == maximalNp(m2.mentionSubTree)) {
        features.incrementCount("same-maximal-np");
      }
      boolean m1Embedded = headEmbeddingLevel(
        m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
      boolean m2Embedded = headEmbeddingLevel(
        m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
      features.incrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
    }

    return features;
  }

  private static void addNumeric(Counter<String> features, String key, int value) {
    features.incrementCount(key + "=" + bin(value));
    features.incrementCount(key, value);
  }

  public static boolean relaxedStringMatch(Mention m1, Mention m2) {
    Set<String> propers = getPropers(m1);
    propers.retainAll(getPropers(m2));
    return !propers.isEmpty();
  }

  private static final Set<String> PROPERS = new HashSet<>();
  static {
    PROPERS.add("NN");
    PROPERS.add("NNS");
    PROPERS.add("NNP");
    PROPERS.add("NNPS");
  }
  private static Set<String> getPropers(Mention m) {
    Set<String> propers = new HashSet<>();
    for (int i = m.startIndex; i < m.endIndex; i++) {
      CoreLabel cl = m.sentenceWords.get(i);
      String POS = cl.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String word = cl.word().toLowerCase();
      if (PROPERS.contains(POS)) {
        propers.add(word);
      }
    }
    return propers;
  }

  private static void addFeature(Counter<String> features, String name, boolean value) {
    if (value) {
      features.incrementCount(name);
    }
  }

  private static String bin(int value) {
    return bin(value, BIN_EXACT, BIN_EXPONENT, Integer.MAX_VALUE);
  }

  private static String bin(int value, int binExact, double binExponent, int cap) {
    if (value < 0) {
      return "-" + bin(-value);
    }
    if (value > cap) {
      return cap + "+";
    }

    String bin = String.valueOf(value);
    if (value > binExact) {
      double start = Math.pow(binExponent, (int) (Math.log(value) / Math.log(binExponent)));
      bin = (int) start + "-" + (int) (start * binExponent);
    }
    return bin;
  }

  private static String getRole(Mention m) {
    if (m.isSubject) {
      return "subject";
    } else if (m.isDirectObject) {
      return "direct-object";
    } else if (m.isIndirectObject) {
      return "indirect-object";
    } else if (m.isPrepositionObject) {
      return "preposition-object";
    }
    return "unknown";
  }

  private static SemanticGraphEdge getDependencyParent(Mention m) {
    return getDependencyParent(m, m.headIndexedWord);
  }

  private static SemanticGraphEdge getDependencyParent(Mention m, IndexedWord w) {
    Iterator<SemanticGraphEdge> iterator = m.enhancedDependency.incomingEdgeIterator(w);
    return iterator.hasNext() ? iterator.next() : null;
  }

  private void addDependencyFeatures(Counter<String> features, String prefix,
      SemanticGraphEdge e, boolean addWord) {
    if (e == null) {
      features.incrementCount("no-" + prefix);
      return;
    }
    IndexedWord parent = e.getSource();
    String parentPOS = parent.tag();
    String parentWord = parent.word();
    String parentRelation = e.getRelation().toString();
    //String parentDir = e.getSource().beginPosition() < e.getTarget().beginPosition()
    //    ? "right" : "left";
    if (addWord) {
      features.incrementCount(prefix + "-word=" + wordIndicator(parentWord, parentPOS));
    }
    features.incrementCount(prefix + "-POS=" + parentPOS);
    features.incrementCount(prefix + "-relation=" + parentRelation);
    //features.incrementCount(prefix + "-direction=" + parentDir);
  }

  public Tree maximalNp(Tree mentionSubTree) {
    Tree maximalSubtree = mentionSubTree;
    for (Tree subtree : mentionSubTree.postOrderNodeList()) {
      if (!subtree.isLeaf() && !subtree.isPreTerminal()) {
        String label = ((CoreLabel) subtree.label())
            .get(CoreAnnotations.ValueAnnotation.class);
        if (label.equals("NP")) {
          maximalSubtree = subtree;
        }
      }
    }
    return maximalSubtree;
  }

  private int numEmbeddedNps(Tree mentionSubTree) {
    int embeddedNps = 0;
    for (Tree subtree : mentionSubTree.postOrderNodeList()) {
      if (!subtree.isLeaf() && !subtree.isPreTerminal()) {
        String label = ((CoreLabel) subtree.label())
            .get(CoreAnnotations.ValueAnnotation.class);
        if (label.equals("NP")) {
          embeddedNps++;
        }
      }
    }
    return embeddedNps;
  }

  private int headEmbeddingLevel(Tree tree, int headIndex) {
    int embeddingLevel = 0;
    try {
      Tree subtree = tree.getLeaves().get(headIndex);
      while (subtree != null) {
        String label = ((CoreLabel) subtree.label()).get(CoreAnnotations.ValueAnnotation.class);
        subtree = subtree.ancestor(1, tree);
        if (label.equals("NP")) {
          embeddingLevel++;
        }
      }
    } catch (Exception e) {
      return -1;
    }
    return embeddingLevel;
  }

  private static boolean headContainedIn(Mention m1, Mention m2) {
    String head = m1.headString;
    for (CoreLabel cl : m2.originalSpan) {
      if (head.equals(cl.word().toLowerCase())) {
        return true;
      }
    }
    return false;
  }

  private String wordIndicator(CoreLabel cl1, CoreLabel cl2, String POS) {
    String w1 = cl1 == null ? "NONE" : cl1.word().toLowerCase();
    String w2 = cl2 == null ? "NONE" : cl2.word().toLowerCase();
    return wordIndicator(w1 + "_" + w2, POS);
  }

  private String wordIndicator(CoreLabel cl, String POS) {
    if (cl == null) {
      return "NONE";
    }
    return wordIndicator(cl.word().toLowerCase(), POS);
  }

  private String wordIndicator(String word, String POS) {
    if (word == null) {
      return "NONE";
    }
    return vocabulary.contains(word) ? word : POS;
  }

  private static String getPOS(CoreLabel cl) {
    return cl == null ? "NONE" : cl.get(CoreAnnotations.PartOfSpeechAnnotation.class);
  }

  private static CoreLabel firstWord(Mention m) {
    return m.originalSpan.get(0);
  }

  private static CoreLabel headWord(Mention m) {
    return m.headWord;
  }

  private static CoreLabel lastWord(Mention m) {
    return m.originalSpan.get(m.originalSpan.size() - 1);
  }

  private static CoreLabel nextnextWord(Mention m) {
    return m.endIndex + 1 < m.sentenceWords.size() ? m.sentenceWords.get(m.endIndex + 1) : null;
  }

  private static CoreLabel nextWord(Mention m) {
    return m.endIndex < m.sentenceWords.size() ? m.sentenceWords.get(m.endIndex) : null;
  }

  private static CoreLabel prevWord(Mention m) {
    return m.startIndex > 0 ? m.sentenceWords.get(m.startIndex - 1) : null;
  }

  private static CoreLabel prevprevWord(Mention m) {
    return m.startIndex > 1 ? m.sentenceWords.get(m.startIndex - 2) : null;
  }
}