package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ie.ChineseMorphFeatureSets;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.stats.IntCounter;

import java.util.*;

/**
 * Extracts surface-form features for Chinese words: the word itself, its
 * characters, character bigrams, radicals, morphological affix classes,
 * a binned length feature, and a binned training-frequency feature.
 * Which feature families are active is controlled by {@link #setFeatureLevel(int)};
 * higher levels are cumulative (each level includes everything below it).
 *
 * @author Galen Andrew
 */
public class ChineseWordFeatureExtractor implements WordFeatureExtractor {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseWordFeatureExtractor.class);

  private static final long serialVersionUID = -4327267414095852504L;

  // Feature-family switches; set as a group by setFeatureLevel().
  boolean morpho;            // morphological affix-class features (needs cmfs)
  boolean chars;             // individual character features
  boolean rads;              // radical features
  boolean useLength;         // binned word-length feature
  boolean useFreq;           // binned training-frequency feature
  boolean bigrams;           // character bigram features
  boolean conjunctions;      // full pairwise conjunctions of morpho features
  boolean mildConjunctions;  // only prefix&&suffix conjunction per affix class

  /** When true, suppresses the word-identity and frequency features. */
  public boolean turnOffWordFeatures = false;

  // Counts word occurrences seen in train(); read by the frequency feature.
  private IntCounter<String> wordCounter;

  // Affix/singleton character sets, loaded lazily by loadFeatures().
  private ChineseMorphFeatureSets cmfs = null;

  private static final String featureDir = "gbfeatures";

  /**
   * Selects which feature families are active. Levels are cumulative:
   * -1 = length+freq only; 0 adds radicals; 1 adds morphology (and loads the
   * affix feature sets) with mild conjunctions; 2 adds characters; 3 adds
   * bigrams and full conjunctions. Unknown levels fall back to level 0.
   *
   * @param level the feature level, normally in [-1, 3]
   */
  public void setFeatureLevel(int level) {
    morpho = false;
    chars = false;
    rads = false;
    useLength = false;
    useFreq = false;
    bigrams = false;
    conjunctions = false;
    mildConjunctions = false;
    switch (level) {
      case 3:
        bigrams = true;
        conjunctions = true;
        // fall through: levels are cumulative
      case 2:
        chars = true;
        // fall through
      case 1:
        morpho = true;
        mildConjunctions = true;
        loadFeatures();
        // fall through
      case 0:
        rads = true;
        // fall through
      case -1:
        useLength = true;
        useFreq = true;
        break;

      default:
        log.info("Feature level " + level + " is not supported in ChineseWordFeatureExtractor.");
        log.info("Using level 0");
        setFeatureLevel(0);
    }
  }

  /**
   * Creates an extractor with the given feature level and an empty word counter.
   *
   * @param featureLevel see {@link #setFeatureLevel(int)}
   */
  public ChineseWordFeatureExtractor(int featureLevel) {
    wordCounter = new IntCounter<>();
    setFeatureLevel(featureLevel);
  }

  /** Trains on the tagged yields of the given trees with unit weight. */
  public void train(Collection<Tree> trees) {
    train(trees, 1.0);
  }

  /** Trains on the tagged yields of the given trees with the given weight. */
  public void train(Collection<Tree> trees, double weight) {
    for (Tree tree : trees) {
      train(tree, weight);
    }
  }

  /** Trains on the tagged yield of a single tree with the given weight. */
  public void train(Tree tree, double weight) {
    train(tree.taggedYield(), weight);
  }

  /**
   * Accumulates word counts for the frequency feature.
   *
   * @param sentence the tagged words to count
   * @param weight   the count increment per occurrence
   */
  public void train(List<TaggedWord> sentence, double weight) {
    for (TaggedWord word : sentence) {
      wordCounter.incrementCount(word.word(), weight);
    }
  }

  /** Lazily loads the morphological affix feature sets from {@code featureDir}. */
  private void loadFeatures() {
    if (cmfs != null) {
      return;
    }
    cmfs = new ChineseMorphFeatureSets(featureDir);
    log.info("Total affix features: " + cmfs.getAffixFeatures().size());
  }

  // Features that survived the last applyFeatureCountThreshold() call;
  // null means no thresholding is applied in makeFeatures().
  private Collection<String> threshedFeatures;

  /**
   * Counts features over the given words and retains only those occurring at
   * least {@code thresh} times; subsequent calls to {@link #makeFeatures(String)}
   * will drop features outside this retained set.
   *
   * @param data   the words to extract counting features from
   * @param thresh the minimum feature count to retain
   */
  public void applyFeatureCountThreshold(Collection<String> data, int thresh) {
    IntCounter<String> c = new IntCounter<>();
    for (String datum : data) {
      for (String feat : makeFeatures(datum)) {
        c.incrementCount(feat);
      }
    }
    threshedFeatures = c.keysAbove(thresh);
    log.info((c.size() - threshedFeatures.size()) + " word features removed due to thresholding.");
  }

  /**
   * Builds the feature list for a word according to the active feature
   * families, then filters by the thresholded feature set if one is in effect.
   *
   * @param word the word to featurize; assumed non-empty — TODO confirm callers guarantee this
   * @return the (mutable) list of feature strings
   */
  public Collection<String> makeFeatures(String word) {
    List<String> features = new ArrayList<>();
    if (morpho) {
      addMorphoFeatures(word, features);
    }
    if (!turnOffWordFeatures) {
      features.add(word + "-W"); // word-identity feature
    }
    if (rads) {
      addRadicalFeatures(word, features);
    }
    if (chars) {
      addCharFeatures(word, features);
    }
    if (useLength) {
      addLengthFeature(word, features);
    }
    if (useFreq && !turnOffWordFeatures) {
      addFrequencyFeature(word, features);
    }
    features.add("PR"); // prior feature, present for every word
    if (threshedFeatures != null) {
      features.removeIf(feat -> !threshedFeatures.contains(feat));
    }
    return features;
  }

  /**
   * Adds affix-class features: -1 for a singleton-character class match, -P/-S
   * for prefix/suffix class matches, plus conjunction features. Must only be
   * called once cmfs has been loaded (morpho implies feature level >= 1).
   */
  private void addMorphoFeatures(String word, List<String> features) {
    for (Map.Entry<String, Set<Character>> e : cmfs.getSingletonFeatures().entrySet()) {
      if (e.getValue().contains(word.charAt(0))) {
        features.add(e.getKey() + "-1");
      }
    }
    for (Map.Entry<String, Pair<Set<Character>, Set<Character>>> e : cmfs.getAffixFeatures().entrySet()) {
      boolean both = false;
      if (e.getValue().first().contains(word.charAt(0))) {
        features.add(e.getKey() + "-P");
        both = true;
      }
      if (e.getValue().second().contains(word.charAt(word.length() - 1))) {
        features.add(e.getKey() + "-S");
      } else {
        both = false;
      }
      // Mild conjunction: only the prefix&&suffix pairing within one affix
      // class; skipped when full conjunctions will be generated below.
      if (both && mildConjunctions && !conjunctions) {
        features.add(e.getKey() + "-PS");
      }
    }
    if (conjunctions) {
      // Full pairwise conjunctions over the morpho features gathered so far.
      // Snapshot the size first so newly added conjunctions are not conjoined.
      int max = features.size();
      for (int i = 1; i < max; i++) {
        String s1 = features.get(i);
        for (int j = 0; j < i; j++) {
          features.add(s1 + "&&" + features.get(j));
        }
      }
    }
  }

  /** Adds first (-FR), last (-LR), and every (-CR) character's radical. */
  private void addRadicalFeatures(String word, List<String> features) {
    features.add(RadicalMap.getRadical(word.charAt(0)) + "-FR");
    features.add(RadicalMap.getRadical(word.charAt(word.length() - 1)) + "-LR");
    for (int i = 0; i < word.length(); i++) {
      features.add(RadicalMap.getRadical(word.charAt(i)) + "-CR");
    }
  }

  /** Adds first/last/all character features, and bigram features if enabled. */
  private void addCharFeatures(String word, List<String> features) {
    features.add(word.charAt(0) + "-FC");
    features.add(word.charAt(word.length() - 1) + "-LC");
    for (int i = 0; i < word.length(); i++) {
      features.add(word.charAt(i) + "-CC");
    }
    if (bigrams && word.length() > 1) {
      features.add(word.substring(0, 2) + "-FB");
      features.add(word.substring(word.length() - 2) + "-LB");
      for (int i = 2; i <= word.length(); i++) {
        features.add(word.substring(i - 2, i) + "-CB");
      }
    }
  }

  /** Adds the binned word-length feature: exact below 5, then bins {5-7, 8+}. */
  private void addLengthFeature(String word, List<String> features) {
    int lengthBin = word.length();
    if (lengthBin >= 5) {
      lengthBin = (lengthBin >= 8) ? 8 : 5;
    }
    // Bug fix: previously emitted the raw word length, leaving lengthBin dead;
    // the binned value was clearly intended.
    features.add(lengthBin + "-L");
  }

  /** Adds the binned training-frequency feature (bins 0..5 by count). */
  private void addFrequencyFeature(String word, List<String> features) {
    int freq = wordCounter.getIntCount(word);
    int freqBin;
    if (freq <= 1) freqBin = 0;
    else if (freq <= 3) freqBin = 1;
    else if (freq <= 6) freqBin = 2;
    else if (freq <= 15) freqBin = 3;
    else if (freq <= 50) freqBin = 4;
    else freqBin = 5;
    features.add(freqBin + "-FQ");
  }

}