/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Takes a list of directory names as arguments (each directory
should contain all the text files for one class), performs a random train/test split,
trains a classifier, and outputs accuracy on the training and testing sets.
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.classify.examples;
import java.io.*;
import cc.mallet.classify.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
/**
 * Example command-line program that trains and evaluates a document classifier.
 *
 * <p>Each command-line argument names a directory holding the text files of one
 * class. The files are converted to feature vectors, divided into two equal
 * parts (training and testing), a naive Bayes classifier is trained on the
 * first part, and its accuracy on both parts is printed to standard output.
 */
public class DocumentClassifier
{
    /**
     * Runs the train/test experiment.
     *
     * @param args one directory name per class; at least one is required.
     *             If none is given, a usage message is written to standard
     *             error and the JVM exits with status 1.
     */
    static public void main (String[] args)
    {
        // Without any directory there is nothing to train on; fail early with
        // a readable message instead of feeding an empty data set to the trainer.
        if (args == null || args.length == 0) {
            System.err.println ("Usage: java " + DocumentClassifier.class.getName ()
                                + " <class-directory> [<class-directory> ...]");
            System.exit (1);
            return;
        }

        // Create Java File objects for each of the arguments
        File[] directories = new File[args.length];
        for (int i = 0; i < args.length; i++) {
            directories[i] = new File (args[i]);
        }

        // Create the pipeline that will take as input {data = File, target = String for classname}
        // and turn them into {data = FeatureVector, target = Label}
        Pipe instancePipe = new SerialPipes (new Pipe[] {
            new Target2Label (),                                // Target String -> class label
            new Input2CharSequence (),                          // Data File -> String containing contents
            new CharSubsequence (CharSubsequence.SKIP_HEADER),  // Remove UseNet or email header
            new CharSequence2TokenSequence (),                  // Data String -> TokenSequence
            new TokenSequenceLowercase (),                      // TokenSequence words lowercased
            new TokenSequenceRemoveStopwords (),                // Remove stopwords from sequence
            new TokenSequence2FeatureSequence (),               // Replace each Token with a feature index
            new FeatureSequence2FeatureVector (),               // Collapse word order into a "feature vector"
            new PrintInputAndTarget (),                         // NOTE(review): presumably echoes each instance for inspection - confirm
        });

        // Create an empty list of the training instances
        InstanceList ilist = new InstanceList (instancePipe);

        // Add all the files in the directories to the list of instances.
        // The Instance that goes into the beginning of the instancePipe
        // will have a File in the "data" slot, and a string from args[] in the "target" slot.
        ilist.addThruPipe (new FileIterator (directories, FileIterator.STARTING_DIRECTORIES));

        // Make a test/train split; ilists[0] will be for training; ilists[1] will be for testing
        InstanceList[] ilists = ilist.split (new double[] {.5, .5});

        // Create a classifier trainer, and use it to create a classifier
        ClassifierTrainer naiveBayesTrainer = new NaiveBayesTrainer ();
        Classifier classifier = naiveBayesTrainer.train (ilists[0]);

        // Report accuracy on the data the classifier was trained on and on the held-out half
        System.out.println ("The training accuracy is "+ classifier.getAccuracy (ilists[0]));
        System.out.println ("The testing accuracy is "+ classifier.getAccuracy (ilists[1]));
    }
}