package cc.mallet.pipe; import cc.mallet.types.*; import gnu.trove.*; import java.io.*; /** * Pruning low-count features can be a good way to save memory and computation. * However, in order to use Vectors2Vectors, you need to write out the unpruned * instance list, read it back into memory, collect statistics, create new * instances, and then write everything back out. * <p> * This class supports a simpler method that makes two passes over the data: * one to collect statistics and create an augmented "stop list", and a * second to actually create instances. */ public class FeatureDocFreqPipe extends Pipe { FeatureCounter counter; int numInstances; public FeatureDocFreqPipe() { super(new Alphabet(), null); counter = new FeatureCounter(this.getDataAlphabet()); numInstances = 0; } public FeatureDocFreqPipe(Alphabet dataAlphabet, Alphabet targetAlphabet) { super(dataAlphabet, targetAlphabet); counter = new FeatureCounter(dataAlphabet); numInstances = 0; } public Instance pipe(Instance instance) { TIntIntHashMap localCounter = new TIntIntHashMap(); if (instance.getData() instanceof FeatureSequence) { FeatureSequence features = (FeatureSequence) instance.getData(); for (int position = 0; position < features.size(); position++) { localCounter.adjustOrPutValue(features.getIndexAtPosition(position), 1, 1); } } else { throw new IllegalArgumentException("Looking for a FeatureSequence, found a " + instance.getData().getClass()); } for (int feature: localCounter.keys()) { counter.increment(feature); } numInstances++; return instance; } /** * Add all pruned words to the internal stoplist of a SimpleTokenizer. * * @param docFrequencyCutoff Remove words that occur in greater than this proportion of documents. 0.05 corresponds to IDF >= 3. */ public void addPrunedWordsToStoplist(SimpleTokenizer tokenizer, double docFrequencyCutoff) { Alphabet currentAlphabet = getDataAlphabet(); for (int feature = 0; feature < currentAlphabet.size(); feature++) { if ((double) counter.get(feature) / numInstances > docFrequencyCutoff) { tokenizer.stop((String) currentAlphabet.lookupObject(feature)); } } } static final long serialVersionUID = 1; }