package cc.mallet.topics;

/** This class implements the value and gradient functions for
 *   Dirichlet-multinomial Regression. See Guimaraes and Lindrooth
 *   for a general introduction to DMR,
 *   and Mimno and McCallum (UAI, 2008) for an application to
 *   multinomial mixture models.
 */

import cc.mallet.optimize.Optimizable;
import cc.mallet.classify.MaxEnt;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Instance;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Dirichlet;
import cc.mallet.types.MatrixOps;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;

import java.util.logging.*;
import java.text.NumberFormat;
import java.text.DecimalFormat;

public class DMROptimizable implements Optimizable.ByGradientValue {

    private static Logger logger = MalletLogger.getLogger(DMROptimizable.class.getName());
    private static Logger progressLogger = MalletProgressMessageLogger.getLogger(DMROptimizable.class.getName() + "-pl");

    MaxEnt classifier;
    InstanceList trainingList;

    int numGetValueCalls = 0;
    int numGetValueGradientCalls = 0;
    int numIterations = Integer.MAX_VALUE;

    NumberFormat formatter = null;

    static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1;
    static final double DEFAULT_LARGE_GAUSSIAN_PRIOR_VARIANCE = 100;
    static final double DEFAULT_GAUSSIAN_PRIOR_MEAN = 0.0;

    double gaussianPriorMean = DEFAULT_GAUSSIAN_PRIOR_MEAN;
    double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;

    // Allowing the default feature (the base level) to
    //  fluctuate more freely than the feature parameters leads
    //  to much better results.
    double defaultFeatureGaussianPriorVariance = DEFAULT_LARGE_GAUSSIAN_PRIOR_VARIANCE;

    double[] parameters;
    double[] cachedGradient;
    double cachedValue;
    boolean cachedValueStale;
    boolean cachedGradientStale;

    int numLabels;
    int numFeatures;
    int defaultFeatureIndex;

    public DMROptimizable () {}

    public DMROptimizable (InstanceList instances, MaxEnt initialClassifier) {

        this.trainingList = instances;
        Alphabet alphabet = instances.getDataAlphabet();
        Alphabet labelAlphabet = instances.getTargetAlphabet();

        this.numLabels = labelAlphabet.size();

        // Add one feature for the "default feature".
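        // The parameters form a flattened (numLabels x numFeatures) matrix,
        //  indexed as parameters[label * numFeatures + feature], with the
        //  last column reserved for the intercept. For example, with 2 topics
        //  and 3 observed features there are 2 * (3 + 1) = 8 parameters, and
        //  the intercept for topic 1 sits at index 1 * 4 + 3 = 7.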
        this.numFeatures = alphabet.size() + 1; // add a spot for the intercept term

        this.defaultFeatureIndex = numFeatures - 1;

        this.parameters = new double [numLabels * numFeatures];
        this.cachedGradient = new double [numLabels * numFeatures];

        if (initialClassifier != null) {
            this.classifier = initialClassifier;
            this.parameters = classifier.getParameters();
            this.defaultFeatureIndex = classifier.getDefaultFeatureIndex();
            assert (initialClassifier.getInstancePipe() == instances.getPipe());
        }
        else if (this.classifier == null) {
            this.classifier = new MaxEnt (instances.getPipe(), parameters);
        }

        formatter = new DecimalFormat("0.###E0");

        cachedValueStale = true;
        cachedGradientStale = true;

        // Check the training data for NaN feature values,
        //  which would silently corrupt the optimization.
        logger.fine("Number of instances in training list = " + trainingList.size());

        for (Instance instance : trainingList) {
            FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
            if (multinomialValues == null) { continue; }

            FeatureVector features = (FeatureVector) instance.getData();
            assert (features.getAlphabet() == alphabet);

            boolean hasNaN = false;

            for (int i = 0; i < features.numLocations(); i++) {
                if (Double.isNaN(features.valueAtLocation(i))) {
                    logger.info("NaN for feature " + alphabet.lookupObject(features.indexAtLocation(i)).toString());
                    hasNaN = true;
                }
            }

            if (hasNaN) {
                logger.info("NaN in instance: " + instance.getName());
            }
        }
    }

    /** Set the variance for the default features (aka intercept terms), generally
     *   larger than the variance for the regular features.
     */
    public void setInterceptGaussianPriorVariance(double sigmaSquared) {
        this.defaultFeatureGaussianPriorVariance = sigmaSquared;
    }

    /** Set the variance for regular (non-default) features, generally
     *   smaller than the variance for the default features.
     */
    public void setRegularGaussianPriorVariance(double sigmaSquared) {
        this.gaussianPriorVariance = sigmaSquared;
    }

    public MaxEnt getClassifier () { return classifier; }

    public double getParameter (int index) {
        return parameters[index];
    }

    public void setParameter (int index, double v) {
        cachedValueStale = true;
        cachedGradientStale = true;
        parameters[index] = v;
    }

    public int getNumParameters() { return parameters.length; }

    public void getParameters (double[] buff) {
        // Note: reassigning a null or mis-sized buffer here only rebinds the
        //  local reference; callers must pass a correctly sized array to
        //  actually receive the parameters.
        if (buff == null || buff.length != parameters.length) {
            buff = new double [parameters.length];
        }
        System.arraycopy (parameters, 0, buff, 0, parameters.length);
    }

    public void setParameters (double [] buff) {
        assert (buff != null);
        cachedValueStale = true;
        cachedGradientStale = true;
        if (buff.length != parameters.length) {
            parameters = new double[buff.length];
        }
        System.arraycopy (buff, 0, parameters, 0, buff.length);
    }
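    // For reference (following Mimno and McCallum, 2008): with
    //  alpha_t = exp(beta_t . x) for each topic t, observed counts n_t,
    //  and N = sum_t n_t, the Dirichlet-multinomial log likelihood of one
    //  instance is
    //
    //    sum_t [ logGamma(alpha_t + n_t) - logGamma(alpha_t) ]
    //      - [ logGamma(sum_t alpha_t + N) - logGamma(sum_t alpha_t) ],
    //
    //  which is what the loop below accumulates, before adding the
    //  Gaussian log prior on the parameters.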
    /** The log probability of the observed count vectors given the features. */
    public double getValue () {

        if (! cachedValueStale) { return cachedValue; }

        numGetValueCalls++;

        cachedValue = 0;

        // Incorporate likelihood of data
        double[] scores = new double[ trainingList.getTargetAlphabet().size() ];
        double value = 0.0;

        int instanceIndex = 0;

        for (Instance instance: trainingList) {

            FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
            if (multinomialValues == null) { continue; }

            // Get the predicted probability of each class
            //  under the current model parameters
            this.classifier.getUnnormalizedClassificationScores(instance, scores);

            double sumScores = 0.0;

            // Exponentiate the scores
            for (int i=0; i<scores.length; i++) {
                // Due to underflow, it's very likely that some of these scores will be 0.0.
                scores[i] = Math.exp(scores[i]);
                sumScores += scores[i];
            }

            FeatureVector features = (FeatureVector) instance.getData();

            // This is really an int, but since FeatureVectors are defined as
            //  doubles, avoid casting.
            double totalLength = 0;

            for (int i = 0; i < multinomialValues.numLocations(); i++) {
                int label = multinomialValues.indexAtLocation(i);
                double count = multinomialValues.valueAtLocation(i);
                value += (Dirichlet.logGammaStirling(scores[label] + count) -
                          Dirichlet.logGammaStirling(scores[label]));
                totalLength += count;
            }

            value -= (Dirichlet.logGammaStirling(sumScores + totalLength) -
                      Dirichlet.logGammaStirling(sumScores));

            // Error checking:
            if (Double.isNaN(value)) {
                logger.fine ("DCMMaxEntTrainer: Instance " + instance.getName() + " has NaN value.");

                for (int label: multinomialValues.getIndices()) {
                    logger.fine ("log(scores)= " + Math.log(scores[label]) +
                                 " scores = " + scores[label]);
                }
            }

            if (Double.isInfinite(value)) {
                logger.warning ("Instance " + instance.getSource() +
                                " has infinite value; skipping value and gradient");
                cachedValue -= value;
                cachedValueStale = false;
                return -value;
            }

            cachedValue += value;
            instanceIndex++;
        }

        // Incorporate prior on parameters

        double prior = 0;

        // The log of a Gaussian prior is  -(x - mu)^2 / (2 sigma^2)
        for (int label = 0; label < numLabels; label++) {
            for (int feature = 0; feature < numFeatures - 1; feature++) {
                double param = parameters[label*numFeatures + feature];
                prior -= (param - gaussianPriorMean) * (param - gaussianPriorMean) /
                    (2 * gaussianPriorVariance);
            }

            double param = parameters[label*numFeatures + defaultFeatureIndex];
            prior -= (param - gaussianPriorMean) * (param - gaussianPriorMean) /
                (2 * defaultFeatureGaussianPriorVariance);
        }

        double labelProbability = cachedValue;
        cachedValue += prior;
        cachedValueStale = false;

        progressLogger.info ("Value (likelihood=" + formatter.format(labelProbability) +
                             " prior=" + formatter.format(prior) +
                             ") = " + formatter.format(cachedValue));

        return cachedValue;
    }
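    // For reference: differentiating the likelihood above with respect to the
    //  parameter for feature f and topic t (writing Psi for the digamma
    //  function, alpha_t = exp(beta_t . x), and A = sum_t alpha_t) gives,
    //  per instance,
    //
    //    x_f * alpha_t * [ Psi(alpha_t + n_t) - Psi(alpha_t)
    //                      - Psi(A + N) + Psi(A) ].
    //
    //  The method below accumulates exactly these terms, using the identity
    //  Psi(a + n) - Psi(a) = sum_{i=0}^{n-1} 1 / (a + i) for small integer
    //  counts, which is cheaper and more accurate than two digamma calls.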
    public void getValueGradient (double [] buffer) {

        MatrixOps.setAll (cachedGradient, 0.0);

        // Incorporate likelihood of data
        double[] scores = new double[ trainingList.getTargetAlphabet().size() ];

        int instanceIndex = 0;

        for (Instance instance: trainingList) {

            FeatureVector multinomialValues = (FeatureVector) instance.getTarget();
            if (multinomialValues == null) { continue; }

            // Get the predicted probability of each class
            //  under the current model parameters
            this.classifier.getUnnormalizedClassificationScores(instance, scores);

            double sumScores = 0.0;

            // Exponentiate the scores
            for (int i=0; i<scores.length; i++) {
                // Due to underflow, it's very likely that some of these scores will be 0.0.
                scores[i] = Math.exp(scores[i]);
                sumScores += scores[i];
            }

            FeatureVector features = (FeatureVector) instance.getData();

            double totalLength = 0;

            for (double count : multinomialValues.getValues()) {
                totalLength += count;
            }

            double digammaDifferenceForSums =
                Dirichlet.digamma(sumScores + totalLength) -
                Dirichlet.digamma(sumScores);

            for (int loc = 0; loc < features.numLocations(); loc++) {

                int index = features.indexAtLocation(loc);
                double value = features.valueAtLocation(loc);

                if (value == 0.0) { continue; }

                // In a FeatureVector, there's no easy way to say "do you know
                //  about this id?" so I've broken this into two for loops,
                //  one for all labels, the other for just the non-zero ones.

                for (int label=0; label<numLabels; label++) {
                    cachedGradient[label * numFeatures + index] -=
                        value * scores[label] * digammaDifferenceForSums;
                }

                for (int labelLoc = 0; labelLoc < multinomialValues.numLocations(); labelLoc++) {

                    int label = multinomialValues.indexAtLocation(labelLoc);
                    double count = multinomialValues.valueAtLocation(labelLoc);
                    double diff = 0.0;

                    if (count < 20) {
                        // Sum the telescoping series directly for small counts.
                        for (int i=0; i < count; i++) {
                            diff += 1 / (scores[label] + i);
                        }
                    }
                    else {
                        diff = Dirichlet.digamma(scores[label] + count) -
                            Dirichlet.digamma(scores[label]);
                    }

                    cachedGradient[label * numFeatures + index] +=
                        value * scores[label] * diff;
                }
            }

            // Now add the default feature

            for (int label=0; label<numLabels; label++) {
                cachedGradient[label * numFeatures + defaultFeatureIndex] -=
                    scores[label] * digammaDifferenceForSums;
            }

            for (int labelLoc = 0; labelLoc < multinomialValues.numLocations(); labelLoc++) {

                int label = multinomialValues.indexAtLocation(labelLoc);
                double count = multinomialValues.valueAtLocation(labelLoc);
                double diff = 0.0;

                if (count < 20) {
                    for (int i=0; i < count; i++) {
                        diff += 1 / (scores[label] + i);
                    }
                }
                else {
                    diff = Dirichlet.digamma(scores[label] + count) -
                        Dirichlet.digamma(scores[label]);
                }

                cachedGradient[label * numFeatures + defaultFeatureIndex] +=
                    scores[label] * diff;
            }
        }

        numGetValueGradientCalls++;

        // Incorporate the gradient of the Gaussian log prior:
        //  -(param - mean) / sigma^2
        for (int label = 0; label < numLabels; label++) {
            for (int feature = 0; feature < numFeatures - 1; feature++) {
                double param = parameters[label*numFeatures + feature];
                cachedGradient[label * numFeatures + feature] -=
                    (param - gaussianPriorMean) / gaussianPriorVariance;
            }

            double param = parameters[label*numFeatures + defaultFeatureIndex];
            cachedGradient[label * numFeatures + defaultFeatureIndex] -=
                (param - gaussianPriorMean) / defaultFeatureGaussianPriorVariance;
        }

        // A parameter may be set to -infinity by an external user.
        // We set gradient to 0 because the parameter's value can
        // never change anyway and it will mess up future calculations
        // on the matrix, such as norm().
        MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);

        assert (buffer != null && buffer.length == parameters.length);
        System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
    }
}
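/* A minimal usage sketch (the surrounding setup is assumed rather than
 *  shown: "instances" is a hypothetical InstanceList whose data are
 *  FeatureVectors of document-level covariates and whose targets are
 *  FeatureVectors of per-topic counts):
 *
 *    DMROptimizable optimizable = new DMROptimizable(instances, null);
 *    optimizable.setRegularGaussianPriorVariance(0.5);
 *    optimizable.setInterceptGaussianPriorVariance(100.0);
 *
 *    Optimizer optimizer = new cc.mallet.optimize.LimitedMemoryBFGS(optimizable);
 *    optimizer.optimize();  // may throw if a line search fails near convergence
 *
 *    MaxEnt dmrParameters = optimizable.getClassifier();
 */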