package cc.mallet.pipe;
import cc.mallet.types.*;
import java.io.*;
/**
* Pruning low-count features can be a good way to save memory and computation.
* However, in order to use Vectors2Vectors, you need to write out the unpruned
* instance list, read it back into memory, collect statistics, create new
* instances, and then write everything back out.
* <p>
* This class supports a simpler method that makes two passes over the data:
* one to collect statistics and create an augmented "stop list", and a
* second to actually create instances.
*/
public class FeatureCountPipe extends Pipe {

    /** Per-feature occurrence counts, indexed by feature index in the data alphabet. */
    FeatureCounter counter;

    /**
     * Creates a counting pipe with a fresh, empty data alphabet and no
     * target alphabet.
     */
    public FeatureCountPipe() {
        super(new Alphabet(), null);
        counter = new FeatureCounter(this.getDataAlphabet());
    }

    /**
     * Creates a counting pipe over the given alphabets.
     *
     * @param dataAlphabet   alphabet whose features are counted
     * @param targetAlphabet target alphabet handed to the superclass
     */
    public FeatureCountPipe(Alphabet dataAlphabet, Alphabet targetAlphabet) {
        super(dataAlphabet, targetAlphabet);
        counter = new FeatureCounter(dataAlphabet);
    }

    /**
     * Counts every feature occurrence in the instance's data and passes the
     * instance through unchanged.
     *
     * @param instance an instance whose data is a {@link FeatureSequence}
     * @return the same instance, unmodified
     * @throws IllegalArgumentException if the instance data is null or is not
     *         a FeatureSequence
     */
    public Instance pipe(Instance instance) {
        Object data = instance.getData();

        if (!(data instanceof FeatureSequence)) {
            // Guard against null data so that the caller gets the intended
            // IllegalArgumentException rather than a NullPointerException
            // raised while building the message.
            throw new IllegalArgumentException("Looking for a FeatureSequence, found a " +
                    (data == null ? "null" : data.getClass().toString()));
        }

        FeatureSequence features = (FeatureSequence) data;
        for (int position = 0; position < features.size(); position++) {
            counter.increment(features.getIndexAtPosition(position));
        }

        return instance;
    }

    /**
     * Returns a new alphabet that contains only features at or above
     * the specified limit. Growth of the returned alphabet is stopped.
     *
     * @param minimumCount the smallest count a feature may have and still be kept
     * @return a new alphabet holding the surviving features, in their
     *         original relative order
     */
    public Alphabet getPrunedAlphabet(int minimumCount) {
        Alphabet currentAlphabet = getDataAlphabet();
        Alphabet prunedAlphabet = new Alphabet();

        // Fix the size up front: the data alphabet must not be modified here.
        int numFeatures = currentAlphabet.size();

        for (int feature = 0; feature < numFeatures; feature++) {
            if (counter.get(feature) >= minimumCount) {
                // Fetch the entry from the current alphabet by index, then
                // register it in the pruned alphabet. (The two lookups were
                // previously swapped, which never added anything to the
                // pruned alphabet.)
                prunedAlphabet.lookupIndex(currentAlphabet.lookupObject(feature));
            }
        }

        prunedAlphabet.stopGrowth();
        return prunedAlphabet;
    }

    /**
     * Writes a list of features that do not occur at or
     * above the specified cutoff to the pruned file, one per line.
     * This file can then be passed to a stopword filter as
     * "additional stopwords".
     *
     * @param prunedFile   the file to write to
     * @param minimumCount features with a count strictly below this are written
     * @throws IOException if the file cannot be opened or a write fails
     */
    public void writePrunedWords(File prunedFile, int minimumCount) throws IOException {
        Alphabet currentAlphabet = getDataAlphabet();

        PrintWriter out = new PrintWriter(prunedFile);
        try {
            for (int feature = 0; feature < currentAlphabet.size(); feature++) {
                if (counter.get(feature) < minimumCount) {
                    out.println(currentAlphabet.lookupObject(feature));
                }
            }

            // PrintWriter swallows IOExceptions; surface them to the caller.
            if (out.checkError()) {
                throw new IOException("Error writing pruned words to " + prunedFile);
            }
        }
        finally {
            // Always release the file handle, even if a lookup throws.
            out.close();
        }
    }

    /**
     * Add all pruned words to the internal stoplist of a SimpleTokenizer.
     * Entries of the data alphabet are cast to String.
     *
     * @param tokenizer    the tokenizer whose stoplist is extended
     * @param minimumCount features with a count strictly below this are stopped
     */
    public void addPrunedWordsToStoplist(SimpleTokenizer tokenizer, int minimumCount) {
        Alphabet currentAlphabet = getDataAlphabet();

        for (int feature = 0; feature < currentAlphabet.size(); feature++) {
            if (counter.get(feature) < minimumCount) {
                tokenizer.stop((String) currentAlphabet.lookupObject(feature));
            }
        }
    }

    /**
     * List the most common words, for addition to a stop file.
     * Writes at most <code>totalWords</code> entries, one per line.
     *
     * @param commonFile the file to write to
     * @param totalWords the maximum number of words to write
     * @throws IOException if the file cannot be opened or a write fails
     */
    public void writeCommonWords(File commonFile, int totalWords) throws IOException {
        Alphabet currentAlphabet = getDataAlphabet();
        int numTypes = currentAlphabet.size();

        IDSorter[] sortedWords = new IDSorter[numTypes];
        for (int type = 0; type < numTypes; type++) {
            sortedWords[type] = new IDSorter(type, counter.get(type));
        }
        // NOTE(review): relies on IDSorter's natural ordering placing the
        // highest counts first -- confirm against IDSorter.compareTo.
        java.util.Arrays.sort(sortedWords);

        // Never try to write more words than the alphabet contains.
        int max = Math.min(totalWords, numTypes);

        PrintWriter out = new PrintWriter(commonFile);
        try {
            for (int rank = 0; rank < max; rank++) {
                int type = sortedWords[rank].getID();
                out.println(currentAlphabet.lookupObject(type));
            }

            // PrintWriter swallows IOExceptions; surface them to the caller.
            if (out.checkError()) {
                throw new IOException("Error writing common words to " + commonFile);
            }
        }
        finally {
            // Always release the file handle, even if a lookup throws.
            out.close();
        }
    }

    static final long serialVersionUID = 1;
}