MCMaxEntTrainer.java example

Explorer
topic-modeling-master
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org.  For further
information, see the file `LICENSE' included with this distribution. */





package cc.mallet.classify;


import java.util.logging.*;
import java.util.*;
import java.io.*;

import cc.mallet.classify.Classifier;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.Optimizer;
import cc.mallet.optimize.tests.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labeling;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Vector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;

// Does not currently handle instances that are labeled with distributions
// instead of a single label.
/**
 * The trainer for a Maximum Entropy classifier.
 @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

public class MCMaxEntTrainer extends ClassifierTrainer<MCMaxEnt> implements Boostable, Serializable //implements CommandOption.ListProviding
{
	private static Logger logger = MalletLogger.getLogger(MCMaxEntTrainer.class.getName());
	private static Logger progressLogger = MalletProgressMessageLogger.getLogger(MCMaxEntTrainer.class.getName()+"-pl");

	int numGetValueCalls = 0;
	int numGetValueGradientCalls = 0;
	int numIterations = 10;

	public static final String EXP_GAIN = "exp";
	public static final String GRADIENT_GAIN = "grad";
	public static final String INFORMATION_GAIN = "info";

	// xxx Why does TestMaximizable fail when this variance is very small?
	static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = .1;  // note used to be 1
	static final double DEFAULT_HYPERBOLIC_PRIOR_SLOPE = 0.2;
	static final double DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS = 10.0;
	static final Class DEFAULT_MAXIMIZER_CLASS = LimitedMemoryBFGS.class;

	// CPAL
	boolean usingMultiConditionalTraining = true;
	boolean usingHyperbolicPrior = false;
	double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
	double hyperbolicPriorSlope = DEFAULT_HYPERBOLIC_PRIOR_SLOPE;
	double hyperbolicPriorSharpness = DEFAULT_HYPERBOLIC_PRIOR_SHARPNESS;
	Class maximizerClass = DEFAULT_MAXIMIZER_CLASS;
	double generativeWeighting = 1.0;
	MaximizableTrainer mt;
	MCMaxEnt initialClassifier;

	// CPAL
	static CommandOption.Boolean usingMultiConditionalTrainingOption =
	    new CommandOption.Boolean (MCMaxEntTrainer.class, "useMCTraining", "true|false", true, true,
	                               "Use MultiConditional Training", null);
	static CommandOption.Boolean usingHyperbolicPriorOption =
	    new CommandOption.Boolean (MCMaxEntTrainer.class, "useHyperbolicPrior", "true|false", false, false,
	                               "Use hyperbolic (close to L1 penalty) prior over parameters", null);
	static CommandOption.Double gaussianPriorVarianceOption =
	    new CommandOption.Double (MCMaxEntTrainer.class, "gaussianPriorVariance", "FLOAT", true, 10.0,
	                              "Variance of the gaussian prior over parameters", null);
	static CommandOption.Double hyperbolicPriorSlopeOption =
	    new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSlope", "FLOAT", true, 0.2,
	                              "Slope of the (L1 penalty) hyperbolic prior over parameters", null);
	static CommandOption.Double hyperbolicPriorSharpnessOption =
	    new CommandOption.Double (MCMaxEntTrainer.class, "hyperbolicPriorSharpness", "FLOAT", true, 10.0,
	                              "Sharpness of the (L1 penalty) hyperbolic prior over parameters", null);

	static final CommandOption.List commandOptions =
	    new CommandOption.List (
	        "MCMaximum Entropy Classifier",
	        new CommandOption[] {
		        usingHyperbolicPriorOption,
		        gaussianPriorVarianceOption,
		        hyperbolicPriorSlopeOption,
		        hyperbolicPriorSharpnessOption,
		        usingMultiConditionalTrainingOption,   // CPAL
	        });

	public static CommandOption.List getCommandOptionList ()
	{
		return commandOptions;
	}

	/*
	public MCMaxEntTrainer(Maximizer.ByGradient maximizer)
	{
	this.maximizerByGradient = maximizer;
	this.usingHyperbolicPrior = false;
	}
	*/

	public MCMaxEntTrainer (CommandOption.List col)
	{
		this.usingHyperbolicPrior = usingHyperbolicPriorOption.value;
		this.gaussianPriorVariance = gaussianPriorVarianceOption.value;
		this.hyperbolicPriorSlope = hyperbolicPriorSlopeOption.value;
		this.hyperbolicPriorSharpness = hyperbolicPriorSharpnessOption.value;
		this.usingMultiConditionalTraining = usingMultiConditionalTrainingOption.value;
	}
	
	public MCMaxEntTrainer (MCMaxEnt initialClassifier) {
		this.initialClassifier = initialClassifier;
	}

	public MCMaxEntTrainer ()
	{
		this (false);
	}

	public MCMaxEntTrainer (boolean useHyperbolicPrior)
	{
		this.usingHyperbolicPrior = useHyperbolicPrior;
	}

	/** Constructs a trainer with a parameter to avoid overtraining.  1.0 is
	 * usually a reasonable default value. */
	public MCMaxEntTrainer (double gaussianPriorVariance)
	{
		this.usingHyperbolicPrior = false;
		this.gaussianPriorVariance = gaussianPriorVariance;
	}

	// CPAL - added this to do MultiConditionalTraining
	public MCMaxEntTrainer (double gaussianPriorVariance, boolean useMultiConditionalTraining )
	{
		this.usingHyperbolicPrior = false;
		this.usingMultiConditionalTraining = useMultiConditionalTraining;
		this.gaussianPriorVariance = gaussianPriorVariance;
	}

	public MCMaxEntTrainer (double hyperbolicPriorSlope,
	                        double hyperbolicPriorSharpness)
	{
		this.usingHyperbolicPrior = true;
		this.hyperbolicPriorSlope = hyperbolicPriorSlope;
		this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;
	}

	public Optimizable.ByGradientValue getMaximizableTrainer (InstanceList ilist)
	{
		if (ilist == null)
			return new MaximizableTrainer ();
		return new MaximizableTrainer (ilist, null);
	}

	/**
	 * Specifies the maximum number of iterations to run during a single call
	 * to <code>train</code> or <code>trainWithFeatureInduction</code>.  Not
	 * currently functional.
	 * @return This trainer
	 */
	// XXX Since we maximize before using numIterations, this doesn't work.
	// Is that a bug?  If so, should the default numIterations be higher?
	public MCMaxEntTrainer setNumIterations (int i)
	{
		numIterations = i;
		return this;
	}

	public MCMaxEntTrainer setUseHyperbolicPrior (boolean useHyperbolicPrior)
	{
		this.usingHyperbolicPrior = useHyperbolicPrior;
		return this;
	}

	/**
	 * Sets a parameter to prevent overtraining.  A smaller variance for the prior
	 * means that feature weights are expected to hover closer to 0, so extra
	 * evidence is required to set a higher weight.
	 * @return This trainer
	 */
	public MCMaxEntTrainer setGaussianPriorVariance (double gaussianPriorVariance)
	{
		this.usingHyperbolicPrior = false;
		this.gaussianPriorVariance = gaussianPriorVariance;
		return this;
	}

	public MCMaxEntTrainer setHyperbolicPriorSlope(double hyperbolicPriorSlope)
	{
		this.usingHyperbolicPrior = true;
		this.hyperbolicPriorSlope = hyperbolicPriorSlope;

		return this;
	}

	public MCMaxEntTrainer setHyperbolicPriorSharpness (double hyperbolicPriorSharpness)
	{
		this.usingHyperbolicPrior = true;
		this.hyperbolicPriorSharpness = hyperbolicPriorSharpness;

		return this;
	}
	
	public MCMaxEnt getClassifier () {
		return mt.getClassifier();
	}


	public MCMaxEnt train (InstanceList trainingSet)
	{
		logger.fine ("trainingSet.size() = "+trainingSet.size());
		mt = new MaximizableTrainer (trainingSet, (MCMaxEnt)initialClassifier);
		Optimizer maximizer = new LimitedMemoryBFGS(mt);
		// CPAL - change the tolerance for large vocab experiments
		((LimitedMemoryBFGS)maximizer).setTolerance(.00001);    // std is .0001;
		maximizer.optimize (); // XXX given the loop below, this seems wrong.

		logger.info("MCMaxEnt ngetValueCalls:"+getValueCalls()+"\nMCMaxEnt ngetValueGradientCalls:"+getValueGradientCalls());
//		boolean converged;
//
//	 	for (int i = 0; i < numIterations; i++) {
//			converged = maximizer.maximize (mt, 1);
//			if (converged)
//			 	break;
//			else if (evaluator != null)
//			 	if (!evaluator.evaluate (mt.getClassifier(), converged, i, mt.getValue(),
//				 												 trainingSet, validationSet, testSet))
//				 	break;
//		}
//		TestMaximizable.testValueAndGradient (mt);
		progressLogger.info("\n"); //  progess messages are on one line; move on.
		return mt.getClassifier ();
	}


	/**
	 * <p>Like the other version of <code>trainWithFeatureInduction</code>, but
	 * allows some default options to be changed.</p>
	 *
	 * @param maxent An initial partially-trained classifier (default <code>null</code>).
	 * This classifier may be modified during training.
	 * @param gainName The estimate of gain (log-likelihood increase) we want our chosen
	 * features to maximize.
	 * Should be one of <code>MaxEntTrainer.EXP_GAIN</code>,
	 * <code>MaxEntTrainer.GRADIENT_GAIN</code>, or
	 * <code>MaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
	 *
	 * @return The trained <code>MaxEnt</code> classifier
	 */
	/*
	public Classifier trainWithFeatureInduction (InstanceList trainingData,
	                                             InstanceList validationData,
	                                             InstanceList testingData,
	                                             ClassifierEvaluating evaluator,
	                                             MCMaxEnt maxent,

	                                             int totalIterations,
	                                             int numIterationsBetweenFeatureInductions,
	                                             int numFeatureInductions,
	                                             int numFeaturesPerFeatureInduction,
	                                             String gainName) {

		// XXX This ought to be a parameter, except that setting it to true can
		// crash training ("Jump too small").
		boolean saveParametersDuringFI = false;

		Alphabet inputAlphabet = trainingData.getDataAlphabet();
		Alphabet outputAlphabet = trainingData.getTargetAlphabet();

		if (maxent == null)
			maxent = new MCMaxEnt(trainingData.getPipe(),
			                      new double[(1+inputAlphabet.size()) * outputAlphabet.size()]);

		int trainingIteration = 0;
		int numLabels = outputAlphabet.size();

		// Initialize feature selection
		FeatureSelection globalFS = trainingData.getFeatureSelection();
		if (globalFS == null) {
			// Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
			globalFS = new FeatureSelection (trainingData.getDataAlphabet());
			trainingData.setFeatureSelection (globalFS);
		}
		if (validationData != null) validationData.setFeatureSelection (globalFS);
		if (testingData != null) testingData.setFeatureSelection (globalFS);
		maxent = new MCMaxEnt(maxent.getInstancePipe(), maxent.getParameters(), globalFS);

		// Run feature induction
		for (int featureInductionIteration = 0;
		     featureInductionIteration < numFeatureInductions;
		     featureInductionIteration++) {

			// Print out some feature information
			logger.info ("Feature induction iteration "+featureInductionIteration);

			// Train the model a little bit.  We don't care whether it converges; we
			// execute all feature induction iterations no matter what.
			if (featureInductionIteration != 0) {
				// Don't train until we have added some features
				setNumIterations(numIterationsBetweenFeatureInductions);
				maxent = (MCMaxEnt)this.train (trainingData, validationData, testingData, evaluator,
				                               maxent);
			}
			trainingIteration += numIterationsBetweenFeatureInductions;

			logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
			             " features over "+numLabels+" labels.");

			// Create the list of error tokens
			InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
			                                                trainingData.getTargetAlphabet());

			// This errorInstances.featureSelection will get examined by FeatureInducer,
			// so it can know how to add "new" singleton features
			errorInstances.setFeatureSelection (globalFS);
			List errorLabelVectors = new ArrayList();    // these are length-1 vectors
			for (int i = 0; i < trainingData.size(); i++) {
				Instance instance = trainingData.get(i);
				FeatureVector inputVector = (FeatureVector) instance.getData();
				Label trueLabel = (Label) instance.getTarget();

				// Having trained using just the current features, see how we classify
				// the training data now.
				Classification classification = maxent.classify(instance);
				if (!classification.bestLabelIsCorrect()) {
					errorInstances.add(inputVector, trueLabel, null, null);
					errorLabelVectors.add(classification.getLabelVector());
				}
			}
			logger.info ("Error instance list size = "+errorInstances.size());
			int s = errorLabelVectors.size();

			LabelVector[] lvs = new LabelVector[s];
			for (int i = 0; i < s; i++) {
				lvs[i] = (LabelVector)errorLabelVectors.get(i);
			}

			RankedFeatureVector.Factory gainFactory = null;
			if (gainName.equals (EXP_GAIN))
				gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
			else if (gainName.equals(GRADIENT_GAIN))
				gainFactory =	new GradientGain.Factory (lvs);
			else if (gainName.equals(INFORMATION_GAIN))
				gainFactory =	new InfoGain.Factory ();
			else
				throw new IllegalArgumentException("Unsupported gain name: "+gainName);

			FeatureInducer klfi =
			    new FeatureInducer (gainFactory,
			                        errorInstances,
			                        numFeaturesPerFeatureInduction,
			                        2*numFeaturesPerFeatureInduction,
			                        2*numFeaturesPerFeatureInduction);

			// Note that this adds features globally, but not on a per-transition basis
			klfi.induceFeaturesFor (trainingData, false, false);
			if (testingData != null) klfi.induceFeaturesFor (testingData, false, false);
			logger.info ("MCMaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
			klfi = null;

			double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];

			// XXX (Executing this block often causes an error during training; I don't know why.)
			if (saveParametersDuringFI) {
				// Keep current parameter values
				// XXX This relies on the implementation detail that the most recent features
				// added to an Alphabet get the highest indices.

				// Count parameters per output label
				int oldParamCount = maxent.parameters.length / outputAlphabet.size();
				int newParamCount = 1+inputAlphabet.size();
				// Copy params into the proper locations
				for (int i=0; i<outputAlphabet.size(); i++) {
					System.arraycopy(maxent.parameters, i*oldParamCount,
					                 newParameters, i*newParamCount,
					                 oldParamCount);
				}
				for (int i=0; i<oldParamCount; i++)
					if (maxent.parameters[i] != newParameters[i]) {
						System.out.println(maxent.parameters[i]+" "+newParameters[i]);
						System.exit(0);
					}
			}

			maxent.parameters = newParameters;
			maxent.defaultFeatureIndex = inputAlphabet.size();
		}

		// Finished feature induction
		logger.info("Ended with "+globalFS.cardinality()+" features.");
		setNumIterations(totalIterations - trainingIteration);
		return this.train (trainingData, validationData, testingData,
		                   evaluator, maxent);
	}
	*/

	// XXX Should these really be public?  Why?
	/** Counts how many times this trainer has computed the gradient of the
	 * log probability of training labels. */
	public int getValueGradientCalls() {return numGetValueGradientCalls;}
	/** Counts how many times this trainer has computed the
	 * log probability of training labels. */
	public int getValueCalls() {return numGetValueCalls;}
//	public int getIterations() {return maximizerByGradient.getIterations();}

	public String toString()
	{
		return "MCMaxEntTrainer"
		//	+ "("+maximizerClass.getName()+") "
		       + ",numIterations=" + numIterations
		       + (usingHyperbolicPrior
		          ? (",hyperbolicPriorSlope="+hyperbolicPriorSlope+
		             ",hyperbolicPriorSharpness="+hyperbolicPriorSharpness)
		          : (",gaussianPriorVariance="+gaussianPriorVariance));
	}



	// A private inner class that wraps up a MCMaxEnt classifier and its training data.
	// The result is a maximize.Maximizable function.
	private class MaximizableTrainer implements Optimizable.ByGradientValue
	{
		double[] parameters, constraints, cachedGradient;
		MCMaxEnt theClassifier;
		InstanceList trainingList;
		// The expectations are (temporarily) stored in the cachedGradient
		double cachedValue;
		boolean cachedValueStale;
		boolean cachedGradientStale;
		int numLabels;
		int numFeatures;
		int defaultFeatureIndex;						// just for clarity
		FeatureSelection featureSelection;
		FeatureSelection[] perLabelFeatureSelection;

		public MaximizableTrainer (){}

		public MaximizableTrainer (InstanceList ilist, MCMaxEnt initialClassifier)
		{
			this.trainingList = ilist;
			Alphabet fd = ilist.getDataAlphabet();
			LabelAlphabet ld = (LabelAlphabet) ilist.getTargetAlphabet();
			// Don't fd.stopGrowth, because someone might want to do feature induction
			ld.stopGrowth();
			// Add one feature for the "default feature".
			this.numLabels = ld.size();
			this.numFeatures = fd.size() + 1;
			this.defaultFeatureIndex = numFeatures-1;
			this.parameters = new double [numLabels * numFeatures];
			this.constraints = new double [numLabels * numFeatures];
			this.cachedGradient = new double [numLabels * numFeatures];
			Arrays.fill (parameters, 0.0);
			Arrays.fill (constraints, 0.0);
			Arrays.fill (cachedGradient, 0.0);
			this.featureSelection = ilist.getFeatureSelection();
			this.perLabelFeatureSelection = ilist.getPerLabelFeatureSelection();
			// Add the default feature index to the selection
			if (featureSelection != null)
				featureSelection.add (defaultFeatureIndex);
			if (perLabelFeatureSelection != null)
				for (int i = 0; i < perLabelFeatureSelection.length; i++)
					perLabelFeatureSelection[i].add (defaultFeatureIndex);
			// xxx Later change this to allow both to be set, but select which one to use by a boolean flag?
			assert (featureSelection == null || perLabelFeatureSelection == null);
			if (initialClassifier != null) {

				this.theClassifier = initialClassifier;
				this.parameters = theClassifier.parameters;
				this.featureSelection = theClassifier.featureSelection;
				this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
				this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
				assert (initialClassifier.getInstancePipe() == ilist.getPipe());
			}
			else if (this.theClassifier == null) {
				this.theClassifier = new MCMaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
			}
			cachedValueStale = true;
			cachedGradientStale = true;

			// Initialize the constraints
			logger.fine("Number of instances in training list = " + trainingList.size());
			for (Instance inst : trainingList) {
				double instanceWeight = trainingList.getInstanceWeight(inst);
				Labeling labeling = inst.getLabeling ();
				//logger.fine ("Instance "+ii+" labeling="+labeling);
				FeatureVector fv = (FeatureVector) inst.getData ();
				Alphabet fdict = fv.getAlphabet();
				assert (fv.getAlphabet() == fd);
				int li = labeling.getBestIndex();
				// The "2*" below is because there is one copy for the p(y|x)and another for the p(x|y).
				MatrixOps.rowPlusEquals (constraints, numFeatures, li, fv, 2*instanceWeight);
				// For the default feature, whose weight is 1.0
				assert(!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
				assert(!Double.isNaN(li)) : "bestIndex is NaN";
				boolean hasNaN = false;
				for(int i = 0; i < fv.numLocations(); i++) {
					if(Double.isNaN(fv.valueAtLocation(i))) {
						logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
						hasNaN = true;
					}
				}
				if(hasNaN)
					logger.info("NaN in instance: " + inst.getName());
        // Only p(y|x) uses the default feature; p(x|y) doesn't use it.  The default feature value is 1.0.
        constraints[li*numFeatures + defaultFeatureIndex] += instanceWeight;
			}
			//TestMaximizable.testValueAndGradientCurrentParameters (this);
		}

		public MCMaxEnt getClassifier () { return theClassifier; }

		public double getParameter (int index) {
			return parameters[index];
		}

		public void setParameter (int index, double v) {
			cachedValueStale = true;
			cachedGradientStale = true;
			parameters[index] = v;
		}

		public int getNumParameters() {
			return parameters.length;
		}

		public void getParameters (double[] buff) {
			if (buff == null || buff.length != parameters.length)
				buff = new double [parameters.length];
			System.arraycopy (parameters, 0, buff, 0, parameters.length);
		}

		public void setParameters (double [] buff) {
			assert (buff != null);
			cachedValueStale = true;
			cachedGradientStale = true;
			if (buff.length != parameters.length)
				parameters = new double[buff.length];
			System.arraycopy (buff, 0, parameters, 0, buff.length);
		}


		// log probability of the training labels
		public double getValue ()
		{
			if (cachedValueStale) {
				numGetValueCalls++;
				cachedValue = 0;
				// We'll store the expectation values in "cachedGradient" for now
				cachedGradientStale = true;
				java.util.Arrays.fill (cachedGradient, 0.0);
				// Incorporate likelihood of data
				double[] scores = new double[trainingList.getTargetAlphabet().size()];
				double value = 0.0;
				//System.out.println("I Now "+inputAlphabet.size()+" regular features.");
				Iterator<Instance> iter = trainingList.iterator();
				//int ii = 0;

				// Normalize the parameters to be per-class multinomials
				double probs[][] = new double[scores.length][numFeatures];
				double lprobs[][] = new double[scores.length][numFeatures];

				for (int si = 0; si < scores.length; si++) {
					double sum = 0, max = MatrixOps.max (parameters);
					for (int fi = 0; fi < numFeatures; fi++) {
            // TODO Strongly consider some smoothing here.  What happens when all parameters are zero?
            // Oh, this should be no problem, because exp(0) == 1.
            probs[si][fi] = Math.exp(parameters[si*numFeatures+fi] - max);
						sum += probs[si][fi];
					}
          assert (sum > 0);
          for (int fi = 0; fi < numFeatures; fi++) {
						probs[si][fi] /= sum;
						lprobs[si][fi] = Math.log(probs[si][fi]);
					}
				}

				while (iter.hasNext()) {
					Instance instance = iter.next();
					double instanceWeight = trainingList.getInstanceWeight(instance);
					Labeling labeling = instance.getLabeling ();
					//System.out.println("L Now "+inputAlphabet.size()+" regular features.");

					this.theClassifier.getClassificationScores (instance, scores);
					FeatureVector fv = (FeatureVector) instance.getData ();
					int li = labeling.getBestIndex();
					value = - (instanceWeight * Math.log (scores[li]));
					if(Double.isNaN(value)) {
						logger.fine ("MCMaxEntTrainer: Instance " + instance.getName() +
						             "has NaN value. log(scores)= " + Math.log(scores[li]) +
						             " scores = " + scores[li] +
						             " has instance weight = " + instanceWeight);

					}
					if (Double.isInfinite(value)) {
						logger.warning ("Instance "+instance.getSource() + " has infinite value; skipping value and gradient");
						cachedValue -= value;
						cachedValueStale = false;
						return -value;
//						continue;
					}
					cachedValue += value;
					// CPAL - this is a loop over classes and their scores
					//      - we compute the gradient by taking the dot product of the feature value
					//        and the probability of the class
					for (int si = 0; si < scores.length; si++) {
						if (scores[si] == 0) continue;
						assert (!Double.isInfinite(scores[si]));
						// CPAL - accumulating the current classifiers expectation of the feature
						// vector counts for this class label
						// Current classifier has expectation over class label, not over feature vector
						MatrixOps.rowPlusEquals (cachedGradient, numFeatures,
						                         si, fv, -instanceWeight * scores[si]);
						cachedGradient[numFeatures*si + defaultFeatureIndex] += (-instanceWeight * scores[si]);
					}

					// CPAL - if we wish to do multiconditional training we need another term for this accumulated
					//        expectation
					if (usingMultiConditionalTraining) {
						// need something analogous to this
						// this.theClassifier.getClassificationScores (instance, scores);
						// this.theClassifier.getFeatureDistributions (instance,
						// Note: li is the "label" for this instance

						// Get the sum of the feature vector
						// which is the number of counts for the document if we use that as input
						double Ncounts = MatrixOps.sum(fv);

						// CPAL - get the additional term for the value of our - log probability
						//      - this computation amounts to the dot product of the feature vector and the probability vector
						cachedValue -= (instanceWeight * fv.dotProduct(lprobs[li]));

						// CPAL - get the model expectation over features for the given class
						for (int fi = 0; fi < numFeatures; fi++) {

							//if(parameters[numFeatures*li + fi] != 0) {
							// MatrixOps.rowPlusEquals(cachedGradient, numFeatures,li,fv,))
							cachedGradient[numFeatures*li + fi] += (-instanceWeight * Ncounts * probs[li][fi]);
							//    }
						}

					}
				}
				//logger.info ("-Expectations:"); cachedGradient.print();
				// Incorporate prior on parameters
				if (usingHyperbolicPrior) {
					for (int li = 0; li < numLabels; li++)
						for (int fi = 0; fi < numFeatures; fi++)
							cachedValue += (hyperbolicPriorSlope / hyperbolicPriorSharpness
							                * Math.log (Maths.cosh (hyperbolicPriorSharpness * parameters[li *numFeatures + fi])));
				} else {
					for (int li = 0; li < numLabels; li++)
						for (int fi = 0; fi < numFeatures; fi++) {
							double param = parameters[li*numFeatures + fi];
							cachedValue += param * param / (2 * gaussianPriorVariance);
						}
				}
				cachedValue *= -1.0; // MAXIMIZE, NOT MINIMIZE
				cachedValueStale = false;
				progressLogger.info ("Value (loglikelihood) = "+cachedValue);
			}
			return cachedValue;
		}

		// CPAL first get value, then gradient

		public void getValueGradient (double [] buffer)
		{
			// Gradient is (constraint - expectation - parameters/gaussianPriorVariance)
			if (cachedGradientStale) {
				numGetValueGradientCalls++;
				if (cachedValueStale)
				// This will fill in the cachedGradient with the "-expectation"
					getValue ();
				// cachedGradient contains the negative expectations
				// expectations are model expectations and constraints are
				// empirical expectations
				MatrixOps.plusEquals (cachedGradient, constraints);
				// CPAL - we need a second copy of the constraints
				//      - actually, we only want this for the feature values
				//      - I've moved this up into getValue
				//if (usingMultiConditionalTraining){
				//    MatrixOps.plusEquals(cachedGradient, constraints);
				//}
				// Incorporate prior on parameters
				if (usingHyperbolicPrior) {
					throw new UnsupportedOperationException ("Hyperbolic prior not yet implemented.");
				}
				else {
					MatrixOps.plusEquals (cachedGradient, parameters,
					                      -1.0 / gaussianPriorVariance);
				}

				// A parameter may be set to -infinity by an external user.
				// We set gradient to 0 because the parameter's value can
				// never change anyway and it will mess up future calculations
				// on the matrix, such as norm().
				MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);
				// Set to zero all the gradient dimensions that are not among the selected features
				if (perLabelFeatureSelection == null) {
					for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
						MatrixOps.rowSetAll (cachedGradient, numFeatures,
						                     labelIndex, 0.0, featureSelection, false);
				} else {
					for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
						MatrixOps.rowSetAll (cachedGradient, numFeatures,
						                     labelIndex, 0.0,
						                     perLabelFeatureSelection[labelIndex], false);
				}
				cachedGradientStale = false;
			}
			assert (buffer != null && buffer.length == parameters.length);
			System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
		}

		public double sumNegLogProb (double a, double b)
		{
			if (a == Double.POSITIVE_INFINITY && b == Double.POSITIVE_INFINITY)
				return Double.POSITIVE_INFINITY;
			else if (a > b)
				return b - Math.log (1 + Math.exp(b-a));
			else
				return a - Math.log (1 + Math.exp(a-b));
		}

	}

}