/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.classify;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

import cc.mallet.classify.Classifier;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.AlphabetCarrying;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Labeling;
import cc.mallet.types.Multinomial;

/**
 * Class used to generate a NaiveBayes classifier from a set of training data.
 * In a Bayes classifier,
 *     p(Classification|Data) = p(Data|Classification)p(Classification)/p(Data)
 * <p>
 * To compute the likelihood: <br>
 *     p(Data|Classification) = p(d1, d2, ..., dn | Classification) <br>
 * Naive Bayes makes the assumption that all of the data are conditionally
 * independent given the Classification: <br>
 *     p(d1, d2, ..., dn | Classification) = p(d1|Classification)p(d2|Classification)...
 * <p>
 * As with other classifiers in MALLET, NaiveBayes is implemented as two classes:
 * a trainer and a classifier.  The NaiveBayesTrainer produces estimates of the
 * various p(dn|Classification) and constructs this class with those estimates.
 * <p>
 * A call to train() or trainIncremental() produces a
 * {@link cc.mallet.classify.NaiveBayes} classifier that can
 * be used to classify instances.  A call to trainIncremental() does not throw
 * away the internal state of the trainer; subsequent calls to trainIncremental()
 * train by extending the previous training set.
 * <p>
 * A NaiveBayesTrainer can be persisted using serialization.
 *
 * @see NaiveBayes
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public class NaiveBayesTrainer extends ClassifierTrainer<NaiveBayes>
    implements ClassifierTrainer.ByInstanceIncrements<NaiveBayes>, Boostable, AlphabetCarrying, Serializable
{
  // These function as default selections for the kind of Estimator used
  Multinomial.Estimator featureEstimator = new Multinomial.LaplaceEstimator();
  Multinomial.Estimator priorEstimator = new Multinomial.LaplaceEstimator();

  // Added to support incremental training.
  // These are the counts formed after NaiveBayes training.  Note that
  // these are *not* the estimates passed to the NaiveBayes classifier;
  // rather, the estimates are formed from these counts.
  // We could break these five fields out into an inner class.
  Multinomial.Estimator[] me;
  Multinomial.Estimator pe;

  double docLengthNormalization = -1;  // A value of -1 means don't do any document length normalization

  NaiveBayes classifier;

  // If this style of incremental training is successful, the following members
  // should probably be moved up into IncrementalClassifierTrainer
  Pipe instancePipe;        // Needed to construct a new classifier
  Alphabet dataAlphabet;    // Extracted from InstanceList.  Must be the same for all calls to trainIncremental()
  Alphabet targetAlphabet;  // Extracted from InstanceList.  Must be the same for all calls to trainIncremental()
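  // Illustrative usage sketch (an assumption-laden example, not part of the
  // original source).  Assumes an InstanceList `training` whose instances carry
  // a FeatureVector in the data slot and a Labeling in the target slot, and a
  // test Instance `testInstance` produced by the same Pipe:
  //
  //   NaiveBayesTrainer trainer = new NaiveBayesTrainer();
  //   NaiveBayes nb = trainer.train(training);
  //   Classification c = nb.classify(testInstance);
  //   System.out.println(c.getLabeling().getBestLabel());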
  public NaiveBayesTrainer (NaiveBayes initialClassifier) {
    if (initialClassifier != null) {
      this.instancePipe = initialClassifier.getInstancePipe();
      this.dataAlphabet = initialClassifier.getAlphabet();
      this.targetAlphabet = initialClassifier.getLabelAlphabet();
      this.classifier = initialClassifier;
    }
  }

  public NaiveBayesTrainer (Pipe instancePipe) {
    this.instancePipe = instancePipe;
    this.dataAlphabet = instancePipe.getDataAlphabet();
    this.targetAlphabet = instancePipe.getTargetAlphabet();
  }

  public NaiveBayesTrainer () {
  }

  public NaiveBayes getClassifier () {
    return classifier;
  }

  public NaiveBayesTrainer setDocLengthNormalization (double d) {
    docLengthNormalization = d;
    return this;
  }

  public double getDocLengthNormalization () {
    return docLengthNormalization;
  }

  /**
   * Get the Multinomial.Estimator instance used to specify the type of estimator
   * for features.
   *
   * @return estimator to be cloned on next call to train() or first call
   *         to trainIncremental()
   */
  public Multinomial.Estimator getFeatureMultinomialEstimator () {
    return featureEstimator;
  }

  /**
   * Set the Multinomial.Estimator used for features.  The estimator
   * is internally cloned, and the clone is used to maintain the counts
   * that will be used to generate probability estimates
   * the next time train() or an initial trainIncremental() is run.
   * Defaults to a Multinomial.LaplaceEstimator().
   *
   * @param me to be cloned on next call to train() or first call
   *           to trainIncremental()
   */
  public NaiveBayesTrainer setFeatureMultinomialEstimator (Multinomial.Estimator me) {
    if (instancePipe != null)
      throw new IllegalStateException ("Can't set after trainIncremental() is called");
    featureEstimator = me;
    return this;
  }

  /**
   * Get the Multinomial.Estimator instance used to specify the type of estimator
   * for priors.
   *
   * @return estimator to be cloned on next call to train() or first call
   *         to trainIncremental()
   */
  public Multinomial.Estimator getPriorMultinomialEstimator () {
    return priorEstimator;
  }

  /**
   * Set the Multinomial.Estimator used for priors.  The estimator
   * is internally cloned, and the clone is used to maintain the counts
   * that will be used to generate probability estimates
   * the next time train() or an initial trainIncremental() is run.
   * Defaults to a Multinomial.LaplaceEstimator().
   *
   * @param me to be cloned on next call to train() or first call
   *           to trainIncremental()
   */
  public NaiveBayesTrainer setPriorMultinomialEstimator (Multinomial.Estimator me) {
    if (instancePipe != null)
      throw new IllegalStateException ("Can't set after trainIncremental() is called");
    priorEstimator = me;
    return this;
  }
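  // Illustrative configuration sketch (not part of the original source).
  // Smoothing and document-length normalization must be chosen before the first
  // call to train()/trainIncremental(), since the estimators are cloned at that
  // point.  `training` is an assumed InstanceList; Multinomial.MLEstimator is
  // MALLET's unsmoothed maximum-likelihood estimator:
  //
  //   NaiveBayesTrainer trainer = new NaiveBayesTrainer()
  //       .setFeatureMultinomialEstimator(new Multinomial.MLEstimator())
  //       .setPriorMultinomialEstimator(new Multinomial.LaplaceEstimator())
  //       .setDocLengthNormalization(20);  // treat every document as 20 words
  //   NaiveBayes nb = trainer.train(training);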
  /**
   * Create a NaiveBayes classifier from a set of training data.
   * The trainer uses counts of each feature in an instance's feature vector
   * to provide an estimate of p(Labeling | feature).  The internal state
   * of the trainer is discarded at the start of each call to train(); each
   * call to train() is completely independent of any other.
   *
   * @param trainingList The InstanceList to be used to train the classifier.
   *        Within each instance the data slot is an instance of FeatureVector and the
   *        target slot is an instance of Labeling.
   * @return The NaiveBayes classifier as trained on the trainingList
   */
  public NaiveBayes train (InstanceList trainingList)
  {
    // Forget all the previous sufficient-statistics counts
    me = null;
    pe = null;
    // Train a new classifier based on this data
    this.classifier = trainIncremental (trainingList);
    return classifier;
  }

  /**
   * Create a NaiveBayes classifier from a set of training data and the
   * previous state of the trainer.  Subsequent calls to trainIncremental()
   * add to the state of the trainer.  An incremental training session
   * should consist only of calls to trainIncremental() and have no
   * calls to train().
   *
   * @param trainingInstancesToAdd The InstanceList to be used to train the classifier.
   *        Within each instance the data slot is an instance of FeatureVector and the
   *        target slot is an instance of Labeling.
   * @return The NaiveBayes classifier as trained on trainingInstancesToAdd and on all
   *         instances previously passed to trainIncremental()
   */
  public NaiveBayes trainIncremental (InstanceList trainingInstancesToAdd)
  {
    // Initialize and check instance variables as necessary...
    setup (trainingInstancesToAdd, null);

    // Incrementally add the counts of this new training data
    for (Instance instance : trainingInstancesToAdd)
      incorporateOneInstance (instance, trainingInstancesToAdd.getInstanceWeight(instance));

    // Estimate multinomials, and return a new naive Bayes classifier.
    // Note that, unlike MaxEnt, NaiveBayes is immutable, so we create a new one each time.
    classifier = new NaiveBayes (instancePipe, pe.estimate(), estimateFeatureMultinomials());
    return classifier;
  }

  public NaiveBayes trainIncremental (Instance instance)
  {
    setup (null, instance);

    // Incrementally add the counts of this new training instance
    incorporateOneInstance (instance, 1.0);
    if (instancePipe == null)
      instancePipe = new Noop (dataAlphabet, targetAlphabet);
    classifier = new NaiveBayes (instancePipe, pe.estimate(), estimateFeatureMultinomials());
    return classifier;
  }
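  // Illustrative incremental-training sketch (not part of the original source).
  // `batch1`, `batch2`, and `newInstance` are assumed to come from the same Pipe,
  // so their alphabets match:
  //
  //   NaiveBayesTrainer trainer = new NaiveBayesTrainer();
  //   NaiveBayes nb1 = trainer.trainIncremental(batch1);      // counts from batch1
  //   NaiveBayes nb2 = trainer.trainIncremental(batch2);      // counts from batch1 + batch2
  //   NaiveBayes nb3 = trainer.trainIncremental(newInstance); // adds a single instance
  //
  // Each call returns a new immutable NaiveBayes built from the accumulated counts.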
  private void setup (InstanceList instances, Instance instance)
  {
    assert (instances != null || instance != null);
    if (instance == null && instances != null)
      instance = instances.get(0);

    // Initialize the alphabets
    if (dataAlphabet == null) {
      this.dataAlphabet = instance.getDataAlphabet();
      this.targetAlphabet = instance.getTargetAlphabet();
    } else if (!Alphabet.alphabetsMatch(instance, this))
      // Make sure the alphabets match
      throw new IllegalArgumentException ("Training set alphabets do not match those of NaiveBayesTrainer.");

    // Initialize or check the instancePipe
    if (instances != null) {
      if (instancePipe == null)
        instancePipe = instances.getPipe();
      else if (instancePipe != instances.getPipe())
        // Make sure the pipes match.  Is this really necessary?
        // I don't think so, but it could be confusing to have each returned classifier have a different pipe?  -akm 1/08
        throw new IllegalArgumentException ("Training set pipe does not match that of NaiveBayesTrainer.");
    }

    if (me == null) {
      int numLabels = targetAlphabet.size();
      me = new Multinomial.Estimator[numLabels];
      for (int i = 0; i < numLabels; i++) {
        me[i] = (Multinomial.Estimator) featureEstimator.clone();
        me[i].setAlphabet(dataAlphabet);
      }
      pe = (Multinomial.Estimator) priorEstimator.clone();
    }

    if (targetAlphabet.size() > me.length) {
      // The target alphabet grew; increase the size of our multinomial array
      int targetAlphabetSize = targetAlphabet.size();
      // Copy over old values
      Multinomial.Estimator[] newMe = new Multinomial.Estimator[targetAlphabetSize];
      System.arraycopy (me, 0, newMe, 0, me.length);
      // Initialize the newly expanded space
      for (int i = me.length; i < targetAlphabetSize; i++) {
        Multinomial.Estimator mest = (Multinomial.Estimator) featureEstimator.clone();
        mest.setAlphabet (dataAlphabet);
        newMe[i] = mest;
      }
      me = newMe;
    }
  }

  private void incorporateOneInstance (Instance instance, double instanceWeight)
  {
    Labeling labeling = instance.getLabeling ();
    if (labeling == null) return;  // Handle unlabeled instances by skipping them
    FeatureVector fv = (FeatureVector) instance.getData ();
    double oneNorm = fv.oneNorm();
    if (oneNorm <= 0) return;  // Skip instances that have no features present
    if (docLengthNormalization > 0)
      // Make the document have counts that sum to docLengthNormalization;
      // i.e., if 20, it would be as if the document had 20 words.
      instanceWeight *= docLengthNormalization / oneNorm;
    assert (instanceWeight > 0 && !Double.isInfinite(instanceWeight));
    for (int lpos = 0; lpos < labeling.numLocations(); lpos++) {
      int li = labeling.indexAtLocation (lpos);
      double labelWeight = labeling.valueAtLocation (lpos);
      if (labelWeight == 0) continue;
      //System.out.println ("NaiveBayesTrainer me.increment "+ labelWeight * instanceWeight);
      me[li].increment (fv, labelWeight * instanceWeight);
      // This relies on labelWeight summing to 1 over all labels
      pe.increment (li, labelWeight * instanceWeight);
    }
  }

  private Multinomial[] estimateFeatureMultinomials ()
  {
    int numLabels = targetAlphabet.size();
    Multinomial[] m = new Multinomial[numLabels];
    for (int li = 0; li < numLabels; li++) {
      //me[li].print ();  // debugging
      m[li] = me[li].estimate();
    }
    return m;
  }

  public String toString ()
  {
    return "NaiveBayesTrainer";
  }

  // AlphabetCarrying interface
  public boolean alphabetsMatch (AlphabetCarrying object) {
    return Alphabet.alphabetsMatch (this, object);
  }

  public Alphabet getAlphabet () {
    return dataAlphabet;
  }

  public Alphabet[] getAlphabets () {
    return new Alphabet[] { dataAlphabet, targetAlphabet };
  }
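  // Illustrative persistence sketch (not part of the original source): a trainer
  // in the middle of an incremental session can be saved and restored with
  // standard Java serialization.  "trainer.ser" is an assumed file name:
  //
  //   ObjectOutputStream oos = new ObjectOutputStream (new FileOutputStream ("trainer.ser"));
  //   oos.writeObject (trainer);
  //   oos.close();
  //   ObjectInputStream ois = new ObjectInputStream (new FileInputStream ("trainer.ser"));
  //   NaiveBayesTrainer restored = (NaiveBayesTrainer) ois.readObject();
  //   ois.close();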
  // Serialization
  // serialVersionUID is overridden to prevent innocuous changes in this
  // class from making the serialization mechanism think the external
  // format has changed.

  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 1;

  private void writeObject (ObjectOutputStream out) throws IOException
  {
    out.writeInt (CURRENT_SERIAL_VERSION);
    // Default selections for the kind of Estimator used
    out.writeObject (featureEstimator);
    out.writeObject (priorEstimator);
    // These are the counts formed after NaiveBayes training
    out.writeObject (me);
    out.writeObject (pe);
    // Pipe and alphabets
    out.writeObject (instancePipe);
    out.writeObject (dataAlphabet);
    out.writeObject (targetAlphabet);
  }

  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    int version = in.readInt();
    if (version != CURRENT_SERIAL_VERSION)
      throw new ClassNotFoundException ("Mismatched NaiveBayesTrainer versions: wanted " +
                                        CURRENT_SERIAL_VERSION + ", got " + version);
    // Default selections for the kind of Estimator used
    featureEstimator = (Multinomial.Estimator) in.readObject();
    priorEstimator = (Multinomial.Estimator) in.readObject();
    // These are the counts formed after NaiveBayes training
    me = (Multinomial.Estimator[]) in.readObject();
    pe = (Multinomial.Estimator) in.readObject();
    // Pipe and alphabets
    instancePipe = (Pipe) in.readObject();
    dataAlphabet = (Alphabet) in.readObject();
    targetAlphabet = (Alphabet) in.readObject();
  }

  public static class Factory extends ClassifierTrainer.Factory<NaiveBayesTrainer>
  {
    Multinomial.Estimator featureEstimator = new Multinomial.LaplaceEstimator();
    Multinomial.Estimator priorEstimator = new Multinomial.LaplaceEstimator();
    double docLengthNormalization = -1;

    public NaiveBayesTrainer newClassifierTrainer (Classifier initialClassifier) {
      NaiveBayesTrainer trainer = new NaiveBayesTrainer ((NaiveBayes) initialClassifier);
      // Apply this factory's configured estimators and normalization to the new
      // trainer.  (Assign the fields directly, since the setters refuse changes
      // once the trainer already carries a pipe from an initial classifier.)
      trainer.featureEstimator = featureEstimator;
      trainer.priorEstimator = priorEstimator;
      trainer.docLengthNormalization = docLengthNormalization;
      return trainer;
    }

    public NaiveBayesTrainer.Factory setDocLengthNormalization (double docLengthNormalization) {
      this.docLengthNormalization = docLengthNormalization;
      return this;
    }

    public NaiveBayesTrainer.Factory setFeatureMultinomialEstimator (Multinomial.Estimator featureEstimator) {
      this.featureEstimator = featureEstimator;
      return this;
    }

    public NaiveBayesTrainer.Factory setPriorMultinomialEstimator (Multinomial.Estimator priorEstimator) {
      this.priorEstimator = priorEstimator;
      return this;
    }
  }
}
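// Illustrative Factory sketch (not part of the original source): the Factory is
// useful when code needs to create several identically configured trainers,
// e.g. one per cross-validation fold:
//
//   NaiveBayesTrainer.Factory factory =
//       new NaiveBayesTrainer.Factory().setDocLengthNormalization(20);
//   NaiveBayesTrainer trainer = factory.newClassifierTrainer(null);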