/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   The "gain" obtained by adding a feature to a conditional exponential model.
   Based on the joint exponential model in Della Pietra, Della Pietra & Lafferty, 1997.
   We smooth using a Gaussian prior.
   Note that we use Math.log(), not log-base-2, so the units are not "bits".

   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package cc.mallet.types;

import java.util.logging.*;
import java.io.*;

import cc.mallet.classify.Classification;
import cc.mallet.util.MalletLogger;

public class ExpGain extends RankedFeatureVector
{
  private static Logger logger = MalletLogger.getLogger(ExpGain.class.getName());

  // ExpGain of a feature, f, is defined in terms of MaxEnt-type feature+class "Feature"s, F,
  //   F = f,c
  // ExpGain of a Feature, F, is
  //   G(F) = KL(p~[C]||q[C]) - KL(p~[C]||q_F[C])
  // where p~[] is the empirical distribution, according to the true class label distribution,
  // and q[] is the distribution from the (imperfect) classifier,
  // and q_F[] is the distribution from the (imperfect) classifier with F added
  //   and F's weight adjusted (but none of the other weights adjusted).
  // ExpGain of a feature, f, is
  //   G(f) = sum_c G(f,c)
  // It would be more accurate to return a gain number for each "feature/class" combination,
  // but here we simply return the gain(feature) = \sum_{class} gain(feature,class).

  // xxx Not ever used.  Remove them.
  boolean usingHyperbolicPrior = false;
  double hyperbolicSlope = 0.2;
  double hyperbolicSharpness = 10.0;

  private static double[] calcExpGains (InstanceList ilist, LabelVector[] classifications,
                                        double gaussianPriorVariance)
  {
    int numInstances = ilist.size();
    int numClasses = ilist.getTargetAlphabet().size();
    int numFeatures = ilist.getDataAlphabet().size();
    assert (ilist.size() > 0);
    // Notation from Della Pietra & Lafferty 1997, p.4
    // "p~"
    double[][] p = new double[numClasses][numFeatures];
    // "q"
    double[][] q = new double[numClasses][numFeatures];
    // "alpha", the weight of the new feature
    double[][] alphas = new double[numClasses][numFeatures];
    int fli;    // feature location index
    double flv; // feature location value
    logger.info ("Starting klgains, #instances="+numInstances);
    double trueLabelWeightSum = 0;
    double modelLabelWeightSum = 0;
    // Calculate p~[f] and q[f]
    for (int i = 0; i < numInstances; i++) {
      assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      //double instanceWeight = ilist.getInstanceWeight(i);
      // The code below relies on labelWeights summing to 1 over all labels!
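      // Accumulate, for every feature present in this instance, the empirical expectation
      // p~[label][feature] (from the true labeling) and the model expectation
      // q[label][feature] (from the classifier's predicted label distribution).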
      double perInstanceModelLabelWeight = 0;
      for (int li = 0; li < numClasses; li++) {
        double trueLabelWeight = labeling.value (li);
        double modelLabelWeight = classifications[i].value(li);
        trueLabelWeightSum += trueLabelWeight;
        modelLabelWeightSum += modelLabelWeight;
        perInstanceModelLabelWeight += modelLabelWeight;
        //if (i < 500) System.out.println ("i="+i+" li="+li+" true="+trueLabelWeight+" model="+modelLabelWeight);
        if (trueLabelWeight == 0 && modelLabelWeight == 0)
          continue;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          assert (fv.valueAtLocation(fl) == 1.0);
          // xxx Note that we are not attending to instanceWeight here!
          //p[li][fli] += trueLabelWeight * instanceWeight / (numInstances+1);
          //q[li][fli] += modelLabelWeight * instanceWeight / (numInstances+1);
          p[li][fli] += trueLabelWeight;
          q[li][fli] += modelLabelWeight;
        }
      }
      assert (Math.abs (perInstanceModelLabelWeight - 1.0) < 0.001);
    }
    assert (Math.abs (trueLabelWeightSum/numInstances - 1.0) < 0.001)
      : "trueLabelWeightSum should be 1.0, it was "+trueLabelWeightSum;
    assert (Math.abs (modelLabelWeightSum/numInstances - 1.0) < 0.001)
      : "modelLabelWeightSum should be 1.0, it was "+modelLabelWeightSum;

    /*
    double psum = 0;
    double qsum = 0;
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        psum += p[i][j];
        qsum += q[i][j];
      }
    assert (Math.abs(psum - 1.0) < 0.0001) : "psum not 1.0! psum="+psum+" qsum="+qsum;
    assert (Math.abs(qsum - 1.0) < 0.0001) : "qsum not 1.0! psum="+psum+" qsum="+qsum;
    */

    // Determine the alphas.
    // We can't do it in closed form as in the Della Pietra paper, because here we have
    // a conditional MaxEnt model, so we do it by Newton-Raphson...
    // ...initializing by the broken, inappropriate joint-case closed-form solution:
    //for (int i = 0; i < numClasses; i++)
    //  for (int j = 0; j < numFeatures; j++)
    //    alphas[i][j] = Math.log ( (p[i][j]*(1.0-q[i][j])) / (q[i][j]*(1.0-p[i][j])) );

    double[][] dalphas = new double[numClasses][numFeatures];        // first derivative
    double[][] alphaChangeOld = new double[numClasses][numFeatures]; // change in alpha, last iteration
    double[][] alphaMax = new double[numClasses][numFeatures];       // upper bisection bound on alpha
    double[][] alphaMin = new double[numClasses][numFeatures];       // lower bisection bound on alpha
    double[][] ddalphas = new double[numClasses][numFeatures];       // second derivative
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        alphaMax[i][j] = Double.POSITIVE_INFINITY;
        alphaMin[i][j] = Double.NEGATIVE_INFINITY;
      }
    double maxAlphachange = 0;
    double maxDalpha = 99;
    int maxNewtonSteps = 50; // xxx Change to more?
    // alphas[][] are initialized to zero
    for (int newton = 0; maxDalpha > 1.0E-8 && newton < maxNewtonSteps; newton++) {
      //System.out.println ("Newton iteration "+newton);
      // The hyperbolic-prior branch is disabled; both branches currently apply the
      // Gaussian-prior terms to the derivatives.
      if (false /*usingHyperbolicPrior*/) {
        for (int i = 0; i < numClasses; i++)
          for (int j = 0; j < numFeatures; j++) {
            dalphas[i][j] = p[i][j] - (alphas[i][j] / gaussianPriorVariance);
            ddalphas[i][j] = -1 / gaussianPriorVariance;
          }
      } else {
        // Gaussian prior
        for (int i = 0; i < numClasses; i++)
          for (int j = 0; j < numFeatures; j++) {
            dalphas[i][j] = p[i][j] - (alphas[i][j] / gaussianPriorVariance);
            ddalphas[i][j] = -1 / gaussianPriorVariance;
          }
      }
      for (int i = 0; i < ilist.size(); i++) {
        assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
        Instance inst = ilist.get(i);
        Labeling labeling = inst.getLabeling ();
        FeatureVector fv = (FeatureVector) inst.getData ();
        // xxx This assumes binary-valued features.  What about "tied" weights?
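        // For each feature present in this instance, subtract the alpha-adjusted model
        // probability of each label from dalpha, and subtract the corresponding Bernoulli
        // variance from ddalpha; summed over instances this yields the first and second
        // derivatives of the gain used in the Newton step below.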
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          for (int li = 0; li < numClasses; li++) {
            double modelLabelWeight = classifications[i].value(li);
            double expalpha = Math.exp (alphas[li][fli]);
            double numerator = modelLabelWeight * expalpha;
            double denominator = numerator + (1.0 - modelLabelWeight);
            dalphas[li][fli] -= numerator / denominator;
            ddalphas[li][fli] += ((numerator*numerator) / (denominator*denominator)
                                  - (numerator/denominator));
          }
        }
      }
      // We now have the first and second derivatives for this Newton step.
      // Run tests on the alphas and their derivatives, and do a Newton step.
      double alphachange, newalpha, oldalpha;
      maxAlphachange = maxDalpha = 0;
      for (int i = 0; i < numClasses; i++)
        for (int j = 0; j < numFeatures; j++) {
          alphachange = - (dalphas[i][j] / ddalphas[i][j]);
          if (p[i][j] == 0 && q[i][j] == 0)
            continue;
          else if (false && (i*numFeatures+j) % (numClasses*numFeatures/2000) == 0
                   || Double.isNaN(alphas[i][j])
                   || Double.isNaN(alphachange))
            // Print just a sampling of them...
            logger.info ("alpha["+i+"]["+j+"]="+alphas[i][j]+
                         " p="+p[i][j]+
                         " q="+q[i][j]+
                         " dalpha="+dalphas[i][j]+
                         " ddalpha="+ddalphas[i][j]+
                         " alphachange="+alphachange+
                         " min="+alphaMin[i][j]+
                         " max="+alphaMax[i][j]);
          if (Double.isNaN(alphas[i][j]) || Double.isNaN(dalphas[i][j]) || Double.isNaN(ddalphas[i][j])
              || Double.isInfinite(alphas[i][j]) || Double.isInfinite(dalphas[i][j]) || Double.isInfinite(ddalphas[i][j]))
            alphachange = 0;
          // assert (!Double.isNaN(alphas[i][j]));
          // assert (!Double.isNaN(dalphas[i][j]));
          // assert (!Double.isNaN(ddalphas[i][j]));
          oldalpha = alphas[i][j];
          // xxx assert (ddalphas[i][j] <= 0);
          //assert (Math.abs(alphachange) < 100.0) : alphachange; // xxx arbitrary?
          // Trying to prevent a cycle
          if (Math.abs(alphachange + alphaChangeOld[i][j]) / Math.abs(alphachange) < 0.01)
            newalpha = alphas[i][j] + alphachange / 2;
          else
            newalpha = alphas[i][j] + alphachange;
          if (alphachange < 0 && alphaMax[i][j] > alphas[i][j]) {
            //System.out.println ("Updating alphaMax["+i+"]["+j+"] = "+alphas[i][j]);
            alphaMax[i][j] = alphas[i][j];
          }
          if (alphachange > 0 && alphaMin[i][j] < alphas[i][j]) {
            //System.out.println ("Updating alphaMin["+i+"]["+j+"] = "+alphas[i][j]);
            alphaMin[i][j] = alphas[i][j];
          }
          if (newalpha <= alphaMax[i][j] && newalpha >= alphaMin[i][j])
            // Newton wants to jump to a point inside the boundaries; let it
            alphas[i][j] = newalpha;
          else {
            // Newton wants to jump to a point outside the boundaries; bisect instead
            assert (alphaMax[i][j] != Double.POSITIVE_INFINITY);
            assert (alphaMin[i][j] != Double.NEGATIVE_INFINITY);
            alphas[i][j] = alphaMin[i][j] + (alphaMax[i][j] - alphaMin[i][j]) / 2;
            //System.out.println ("Newton tried to exceed bounds; bisecting. dalphas["+i+"]["+j+"]="+dalphas[i][j]+" alphaMin="+alphaMin[i][j]+" alphaMax="+alphaMax[i][j]);
          }
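          // Record the step actually taken (for logging) and the largest remaining first
          // derivative, which the outer loop uses as its convergence test.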
          alphachange = alphas[i][j] - oldalpha;
          if (Math.abs(alphachange) > maxAlphachange)
            maxAlphachange = Math.abs (alphachange);
          if (Math.abs (dalphas[i][j]) > maxDalpha)
            maxDalpha = Math.abs (dalphas[i][j]);
          alphaChangeOld[i][j] = alphachange;
        }
      logger.info ("After "+newton+" Newton iterations, maximum alphachange="+maxAlphachange+
                   " dalpha="+maxDalpha);
    }

    // Allow some memory to be freed
    //q = null;
    ddalphas = dalphas = alphaChangeOld = alphaMin = alphaMax = null;

    // "q[e^{\alpha g}]", p.4
    //System.out.println ("Calculating qeag...");
    // Note that we are using a Gaussian prior, so we don't multiply by (1/numInstances)
    double[][] qeag = new double[numClasses][numFeatures];
    for (int i = 0; i < ilist.size(); i++) {
      assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      int fvMaxLocation = fv.numLocations()-1;
      for (int li = 0; li < numClasses; li++) {
        double modelLabelWeight = classifications[i].value(li);
        // The following line is now done beforehand, outside the loop over instances:
        // for (int fi = 0; fi < numFeatures; fi++) qeag[li][fi] += modelLabelWeight; // * 1.0;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          // When the value of this feature "g" is zero, a value of 1.0 should be included
          // in the expectation; we'll actually add all these at the end (pre-"assuming"
          // that all features have value zero).  Here we subtract the "assumed"
          // modelLabelWeight, and put in the true value based on the non-zero-valued feature "g".
          qeag[li][fli] += Math.log (modelLabelWeight * Math.exp (alphas[li][fli]) + (1-modelLabelWeight));
        }
      }
    }

    //System.out.println ("Calculating klgain values...");
    double[] klgains = new double[numFeatures];
    double klgainIncr, alpha;
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        assert (!Double.isInfinite(alphas[i][j]));
        alpha = alphas[i][j];
        if (alpha == 0)
          continue;
        klgainIncr = (alpha * p[i][j]) - qeag[i][j] - (alpha*alpha/(2*gaussianPriorVariance));
        if (klgainIncr < 0) {
          if (false)
            logger.info ("WARNING: klgainIncr["+i+"]["+j+"]="+klgainIncr+
                         " alpha="+alphas[i][j]+
                         " feature="+ilist.getDataAlphabet().lookupObject(j)+
                         " class="+ilist.getTargetAlphabet().lookupObject(i));
        } else
          klgains[j] += klgainIncr;
      }

    if (false) {
      logger.info ("klgains.length="+klgains.length);
      for (int j = 0; j < numFeatures; j++) {
        if (j % (numFeatures/100) == 0) {
          for (int i = 0; i < numClasses; i++) {
            logger.info ("c="+i+" p["+ilist.getDataAlphabet().lookupObject(j)+"] = "+p[i][j]);
            logger.info ("c="+i+" q["+ilist.getDataAlphabet().lookupObject(j)+"] = "+q[i][j]);
            logger.info ("c="+i+" alphas["+ilist.getDataAlphabet().lookupObject(j)+"] = "+alphas[i][j]);
            logger.info ("c="+i+" qeag["+ilist.getDataAlphabet().lookupObject(j)+"] = "+qeag[i][j]);
          }
          logger.info ("klgains["+ilist.getDataAlphabet().lookupObject(j)+"] = "+klgains[j]);
        }
      }
    }

    return klgains;
  }

  public ExpGain (InstanceList ilist, LabelVector[] classifications, double gaussianPriorVariance)
  {
    super (ilist.getDataAlphabet(), calcExpGains (ilist, classifications, gaussianPriorVariance));
  }

  private static LabelVector[] getLabelVectorsFromClassifications (Classification[] c)
  {
    LabelVector[] ret = new LabelVector[c.length];
    for (int i = 0; i < c.length; i++)
      ret[i] = c[i].getLabelVector();
    return ret;
  }
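  /** Convenience constructor: converts each Classification (e.g. a classifier's output
      for one instance) to its LabelVector, then computes gains exactly as the
      LabelVector[] constructor does. */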
  public ExpGain (InstanceList ilist, Classification[] classifications, double gaussianPriorVariance)
  {
    super (ilist.getDataAlphabet(),
           calcExpGains (ilist, getLabelVectorsFromClassifications(classifications), gaussianPriorVariance));
  }

  public static class Factory implements RankedFeatureVector.Factory
  {
    LabelVector[] classifications;
    double gaussianPriorVariance = 10.0;

    public Factory (LabelVector[] classifications)
    {
      this.classifications = classifications;
    }

    public Factory (LabelVector[] classifications, double gaussianPriorVariance)
    {
      this.classifications = classifications;
      this.gaussianPriorVariance = gaussianPriorVariance;
    }

    public RankedFeatureVector newRankedFeatureVector (InstanceList ilist)
    {
      assert (ilist.getTargetAlphabet() == classifications[0].getAlphabet());
      return new ExpGain (ilist, classifications, gaussianPriorVariance);
    }

    // Serialization

    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 0;

    private void writeObject (ObjectOutputStream out) throws IOException
    {
      out.writeInt (CURRENT_SERIAL_VERSION);
      out.writeInt (classifications.length);
      for (int i = 0; i < classifications.length; i++)
        out.writeObject (classifications[i]);
    }

    private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
    {
      int version = in.readInt ();
      int n = in.readInt ();
      this.classifications = new LabelVector[n];
      for (int i = 0; i < n; i++)
        this.classifications[i] = (LabelVector) in.readObject ();
    }
  }
}
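/* A minimal usage sketch, not part of the original MALLET sources: rank the features of a
   labeled InstanceList by ExpGain, using the label distributions produced by a freshly
   trained MaxEnt classifier.  The trainer choice, the prior variance of 10.0, the top-20
   cutoff, and this helper class itself are illustrative assumptions, not part of the
   ExpGain API. */
class ExpGainUsageSketch
{
  static void printTopFeaturesByExpGain (InstanceList trainingData)
  {
    // Train a classifier; its per-instance output supplies the model distribution q.
    cc.mallet.classify.Classifier classifier =
      new cc.mallet.classify.MaxEntTrainer().train (trainingData);
    // Classify the same instances to get one Classification (a label distribution) per instance.
    Classification[] classifications =
      classifier.classify (trainingData).toArray (new Classification[0]);
    // Gains are summed over classes and ranked from highest to lowest by RankedFeatureVector.
    ExpGain expGain = new ExpGain (trainingData, classifications, 10.0);
    for (int rank = 0; rank < 20 && rank < expGain.numLocations(); rank++)
      System.out.println (expGain.getObjectAtRank(rank) + "\t" + expGain.getValueAtRank(rank));
  }
}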