package cc.mallet.pipe;

import cc.mallet.types.*;

import java.io.*;

/**
 * Pruning low-count features can be a good way to save memory and computation.
 * However, in order to use Vectors2Vectors, you need to write out the unpruned
 * instance list, read it back into memory, collect statistics, create new
 * instances, and then write everything back out.
 * <p>
 * This class supports a simpler method that makes two passes over the data:
 * one to collect statistics and create an augmented "stop list", and a
 * second to actually create instances.
 */
public class FeatureCountPipe extends Pipe {

	// Per-feature occurrence counts, keyed by index into the data alphabet.
	FeatureCounter counter;

	/** Creates a pipe with a fresh, growable data alphabet and no target alphabet. */
	public FeatureCountPipe() {
		super(new Alphabet(), null);
		counter = new FeatureCounter(this.getDataAlphabet());
	}

	/**
	 * Creates a pipe that counts features against an existing alphabet pair.
	 *
	 * @param dataAlphabet   alphabet whose feature indices will be counted
	 * @param targetAlphabet label alphabet, passed through to the superclass (may be null)
	 */
	public FeatureCountPipe(Alphabet dataAlphabet, Alphabet targetAlphabet) {
		super(dataAlphabet, targetAlphabet);
		counter = new FeatureCounter(dataAlphabet);
	}

	/**
	 * Tallies every feature occurrence in the instance's data and passes the
	 * instance through unchanged.
	 *
	 * @param instance an instance whose data must be a {@link FeatureSequence}
	 * @return the same instance, unmodified
	 * @throws IllegalArgumentException if the instance data is not a FeatureSequence
	 */
	@Override
	public Instance pipe(Instance instance) {
		if (instance.getData() instanceof FeatureSequence) {
			FeatureSequence features = (FeatureSequence) instance.getData();
			for (int position = 0; position < features.size(); position++) {
				counter.increment(features.getIndexAtPosition(position));
			}
		}
		else {
			throw new IllegalArgumentException("Looking for a FeatureSequence, found a " +
											   instance.getData().getClass());
		}

		return instance;
	}

	/**
	 * Returns a new alphabet that contains only features at or above
	 * the specified limit.
	 *
	 * @param minimumCount features with a count below this value are dropped
	 * @return a new, growth-stopped alphabet containing only the surviving features
	 */
	public Alphabet getPrunedAlphabet(int minimumCount) {

		Alphabet currentAlphabet = getDataAlphabet();
		Alphabet prunedAlphabet = new Alphabet();

		for (int feature = 0; feature < currentAlphabet.size(); feature++) {
			if (counter.get(feature) >= minimumCount) {
				// BUG FIX: the original swapped the two calls --
				// prunedAlphabet.lookupObject(currentAlphabet.lookupIndex(feature)) --
				// which added boxed Integer indices to the *current* alphabet and
				// queried an index in the still-empty pruned alphabet. The correct
				// direction is: fetch the entry for this index, then add it to the
				// pruned alphabet.
				prunedAlphabet.lookupIndex(currentAlphabet.lookupObject(feature));
			}
		}

		prunedAlphabet.stopGrowth();
		return prunedAlphabet;
	}

	/**
	 * Writes a list of features that do not occur at or
	 * above the specified cutoff to the pruned file, one per line.
	 * This file can then be passed to a stopword filter as
	 * "additional stopwords".
	 *
	 * @param prunedFile   destination file (overwritten)
	 * @param minimumCount features with a count below this value are written out
	 * @throws IOException if the file cannot be opened for writing
	 */
	public void writePrunedWords(File prunedFile, int minimumCount) throws IOException {

		PrintWriter out = new PrintWriter(prunedFile);
		// Close in finally so the file handle is released even if a lookup throws.
		try {
			Alphabet currentAlphabet = getDataAlphabet();

			for (int feature = 0; feature < currentAlphabet.size(); feature++) {
				if (counter.get(feature) < minimumCount) {
					out.println(currentAlphabet.lookupObject(feature));
				}
			}
		} finally {
			out.close();
		}
	}

	/**
	 * Add all pruned words to the internal stoplist of a SimpleTokenizer.
	 *
	 * @param tokenizer    tokenizer whose stoplist will be augmented
	 * @param minimumCount features with a count below this value are stopped
	 */
	public void addPrunedWordsToStoplist(SimpleTokenizer tokenizer, int minimumCount) {
		Alphabet currentAlphabet = getDataAlphabet();

		for (int feature = 0; feature < currentAlphabet.size(); feature++) {
			if (counter.get(feature) < minimumCount) {
				tokenizer.stop((String) currentAlphabet.lookupObject(feature));
			}
		}
	}

	/**
	 * List the most common words, for addition to a stop file.
	 *
	 * @param commonFile destination file (overwritten), one word per line,
	 *                   most frequent first
	 * @param totalWords maximum number of words to write; clamped to the
	 *                   alphabet size
	 * @throws IOException if the file cannot be opened for writing
	 */
	public void writeCommonWords(File commonFile, int totalWords) throws IOException {

		PrintWriter out = new PrintWriter(commonFile);
		// Close in finally so the file handle is released even if sorting throws.
		try {
			Alphabet currentAlphabet = getDataAlphabet();

			// IDSorter's natural ordering puts larger weights first, so after
			// sorting, index 0 is the most frequent feature.
			IDSorter[] sortedWords = new IDSorter[currentAlphabet.size()];

			for (int type = 0; type < currentAlphabet.size(); type++) {
				sortedWords[type] = new IDSorter(type, counter.get(type));
			}
			java.util.Arrays.sort(sortedWords);

			int max = Math.min(totalWords, currentAlphabet.size());

			for (int rank = 0; rank < max; rank++) {
				int type = sortedWords[rank].getID();
				out.println(currentAlphabet.lookupObject(type));
			}
		} finally {
			out.close();
		}
	}

	static final long serialVersionUID = 1;

}