package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ie.ChineseMorphFeatureSets;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.stats.IntCounter;

import java.util.*;

/**
 * Extracts surface-form features for Chinese words: the word itself, its
 * characters, character bigrams, radicals, morphological affix classes,
 * a binned length feature, and a binned training-frequency feature.
 * Which feature families are active is controlled by {@link #setFeatureLevel(int)};
 * higher levels are cumulative (each level includes everything below it).
 *
 * @author Galen Andrew
 */
public class ChineseWordFeatureExtractor implements WordFeatureExtractor {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ChineseWordFeatureExtractor.class);

  private static final long serialVersionUID = -4327267414095852504L;

  // Feature-family switches; set as a group by setFeatureLevel().
  boolean morpho;            // morphological affix-class features (needs cmfs)
  boolean chars;             // individual character features
  boolean rads;              // radical features
  boolean useLength;         // binned word-length feature
  boolean useFreq;           // binned training-frequency feature
  boolean bigrams;           // character bigram features
  boolean conjunctions;      // full pairwise conjunctions of morpho features
  boolean mildConjunctions;  // only prefix&&suffix conjunction per affix class

  /** When true, suppresses the word-identity and frequency features. */
  public boolean turnOffWordFeatures = false;

  // Counts word occurrences seen in train(); read by the frequency feature.
  private IntCounter<String> wordCounter;

  // Affix/singleton character sets, loaded lazily by loadFeatures().
  private ChineseMorphFeatureSets cmfs = null;

  private static final String featureDir = "gbfeatures";

  /**
   * Selects which feature families are active. Levels are cumulative:
   * -1 = length+freq only; 0 adds radicals; 1 adds morphology (and loads the
   * affix feature sets) with mild conjunctions; 2 adds characters; 3 adds
   * bigrams and full conjunctions. Unknown levels fall back to level 0.
   *
   * @param level the feature level, normally in [-1, 3]
   */
  public void setFeatureLevel(int level) {
    morpho = false;
    chars = false;
    rads = false;
    useLength = false;
    useFreq = false;
    bigrams = false;
    conjunctions = false;
    mildConjunctions = false;
    switch (level) {
      case 3:
        bigrams = true;
        conjunctions = true;
        // fall through: levels are cumulative
      case 2:
        chars = true;
        // fall through
      case 1:
        morpho = true;
        mildConjunctions = true;
        loadFeatures();
        // fall through
      case 0:
        rads = true;
        // fall through
      case -1:
        useLength = true;
        useFreq = true;
        break;

      default:
        log.info("Feature level " + level + " is not supported in ChineseWordFeatureExtractor.");
        log.info("Using level 0");
        setFeatureLevel(0);
    }
  }

  /**
   * Creates an extractor with the given feature level and an empty word counter.
   *
   * @param featureLevel see {@link #setFeatureLevel(int)}
   */
  public ChineseWordFeatureExtractor(int featureLevel) {
    wordCounter = new IntCounter<>();
    setFeatureLevel(featureLevel);
  }

  /** Trains on the tagged yields of the given trees with unit weight. */
  public void train(Collection<Tree> trees) {
    train(trees, 1.0);
  }

  /** Trains on the tagged yields of the given trees with the given weight. */
  public void train(Collection<Tree> trees, double weight) {
    for (Tree tree : trees) {
      train(tree, weight);
    }
  }

  /** Trains on the tagged yield of a single tree with the given weight. */
  public void train(Tree tree, double weight) {
    train(tree.taggedYield(), weight);
  }

  /**
   * Accumulates word counts for the frequency feature.
   *
   * @param sentence the tagged words to count
   * @param weight   the count increment per occurrence
   */
  public void train(List<TaggedWord> sentence, double weight) {
    for (TaggedWord word : sentence) {
      wordCounter.incrementCount(word.word(), weight);
    }
  }

  /** Lazily loads the morphological affix feature sets from {@code featureDir}. */
  private void loadFeatures() {
    if (cmfs != null) {
      return;
    }
    cmfs = new ChineseMorphFeatureSets(featureDir);
    log.info("Total affix features: " + cmfs.getAffixFeatures().size());
  }

  // Features that survived the last applyFeatureCountThreshold() call;
  // null means no thresholding is applied in makeFeatures().
  private Collection<String> threshedFeatures;

  /**
   * Counts features over the given words and retains only those occurring at
   * least {@code thresh} times; subsequent calls to {@link #makeFeatures(String)}
   * will drop features outside this retained set.
   *
   * @param data   the words to extract counting features from
   * @param thresh the minimum feature count to retain
   */
  public void applyFeatureCountThreshold(Collection<String> data, int thresh) {
    IntCounter<String> c = new IntCounter<>();
    for (String datum : data) {
      for (String feat : makeFeatures(datum)) {
        c.incrementCount(feat);
      }
    }
    threshedFeatures = c.keysAbove(thresh);
    log.info((c.size() - threshedFeatures.size()) + " word features removed due to thresholding.");
  }

  /**
   * Builds the feature list for a word according to the active feature
   * families, then filters by the thresholded feature set if one is in effect.
   *
   * @param word the word to featurize; assumed non-empty — TODO confirm callers guarantee this
   * @return the (mutable) list of feature strings
   */
  public Collection<String> makeFeatures(String word) {
    List<String> features = new ArrayList<>();
    if (morpho) {
      addMorphoFeatures(word, features);
    }
    if (!turnOffWordFeatures) {
      features.add(word + "-W"); // word-identity feature
    }
    if (rads) {
      addRadicalFeatures(word, features);
    }
    if (chars) {
      addCharFeatures(word, features);
    }
    if (useLength) {
      addLengthFeature(word, features);
    }
    if (useFreq && !turnOffWordFeatures) {
      addFrequencyFeature(word, features);
    }
    features.add("PR"); // prior feature, present for every word
    if (threshedFeatures != null) {
      features.removeIf(feat -> !threshedFeatures.contains(feat));
    }
    return features;
  }

  /**
   * Adds affix-class features: -1 for a singleton-character class match, -P/-S
   * for prefix/suffix class matches, plus conjunction features. Must only be
   * called once cmfs has been loaded (morpho implies feature level >= 1).
   */
  private void addMorphoFeatures(String word, List<String> features) {
    for (Map.Entry<String, Set<Character>> e : cmfs.getSingletonFeatures().entrySet()) {
      if (e.getValue().contains(word.charAt(0))) {
        features.add(e.getKey() + "-1");
      }
    }
    for (Map.Entry<String, Pair<Set<Character>, Set<Character>>> e : cmfs.getAffixFeatures().entrySet()) {
      boolean both = false;
      if (e.getValue().first().contains(word.charAt(0))) {
        features.add(e.getKey() + "-P");
        both = true;
      }
      if (e.getValue().second().contains(word.charAt(word.length() - 1))) {
        features.add(e.getKey() + "-S");
      } else {
        both = false;
      }
      // Mild conjunction: only the prefix&&suffix pairing within one affix
      // class; skipped when full conjunctions will be generated below.
      if (both && mildConjunctions && !conjunctions) {
        features.add(e.getKey() + "-PS");
      }
    }
    if (conjunctions) {
      // Full pairwise conjunctions over the morpho features gathered so far.
      // Snapshot the size first so newly added conjunctions are not conjoined.
      int max = features.size();
      for (int i = 1; i < max; i++) {
        String s1 = features.get(i);
        for (int j = 0; j < i; j++) {
          features.add(s1 + "&&" + features.get(j));
        }
      }
    }
  }

  /** Adds first (-FR), last (-LR), and every (-CR) character's radical. */
  private void addRadicalFeatures(String word, List<String> features) {
    features.add(RadicalMap.getRadical(word.charAt(0)) + "-FR");
    features.add(RadicalMap.getRadical(word.charAt(word.length() - 1)) + "-LR");
    for (int i = 0; i < word.length(); i++) {
      features.add(RadicalMap.getRadical(word.charAt(i)) + "-CR");
    }
  }

  /** Adds first/last/all character features, and bigram features if enabled. */
  private void addCharFeatures(String word, List<String> features) {
    features.add(word.charAt(0) + "-FC");
    features.add(word.charAt(word.length() - 1) + "-LC");
    for (int i = 0; i < word.length(); i++) {
      features.add(word.charAt(i) + "-CC");
    }
    if (bigrams && word.length() > 1) {
      features.add(word.substring(0, 2) + "-FB");
      features.add(word.substring(word.length() - 2) + "-LB");
      for (int i = 2; i <= word.length(); i++) {
        features.add(word.substring(i - 2, i) + "-CB");
      }
    }
  }

  /** Adds the binned word-length feature: exact below 5, then bins {5-7, 8+}. */
  private void addLengthFeature(String word, List<String> features) {
    int lengthBin = word.length();
    if (lengthBin >= 5) {
      lengthBin = (lengthBin >= 8) ? 8 : 5;
    }
    // Bug fix: previously emitted the raw word length, leaving lengthBin dead;
    // the binned value was clearly intended.
    features.add(lengthBin + "-L");
  }

  /** Adds the binned training-frequency feature (bins 0..5 by count). */
  private void addFrequencyFeature(String word, List<String> features) {
    int freq = wordCounter.getIntCount(word);
    int freqBin;
    if (freq <= 1) freqBin = 0;
    else if (freq <= 3) freqBin = 1;
    else if (freq <= 6) freqBin = 2;
    else if (freq <= 15) freqBin = 3;
    else if (freq <= 50) freqBin = 4;
    else freqBin = 5;
    features.add(freqBin + "-FQ");
  }

}