/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   The "gain" obtained by adding a feature to an exponential model.
   From Della Pietra, Della Pietra & Lafferty, 1997.

   What is the *right* way to smooth p[] and q[] so we don't get zeros
   (and therefore zeros in alpha[] and NaN in klgain[])?  I think it
   would be to put the prior over parameters into G_q(\alpha,g).
   Right now I'm simply doing a little m-estimate smoothing of p[] and q[].

   Note that we use Math.log(), not log-base-2, so the units are not "bits".

   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package cc.mallet.types;

import java.util.logging.*;

import cc.mallet.classify.Classification;
import cc.mallet.util.MalletLogger;

public class KLGain extends RankedFeatureVector
{
  private static Logger logger = MalletLogger.getLogger(KLGain.class.getName());

  // KLGain of a feature, f, is defined in terms of MaxEnt-type feature+class "Feature"s, F,
  //   F = f,c
  // KLGain of a Feature, F, is
  //   G(F) = KL(p~[C]||q[C]) - KL(p~[C]||q_F[C])
  // where p~[] is the empirical distribution, according to the true class label distribution,
  // q[] is the distribution from the (imperfect) classifier,
  // and q_F[] is the distribution from the (imperfect) classifier with F added
  // and F's weight adjusted (but none of the other weights adjusted).
  // KLGain of a feature, f, is then
  //   G(f) = sum_c G(f,c)

  private static double[] calcKLGains (InstanceList ilist, LabelVector[] classifications)
  {
    int numInstances = ilist.size();
    int numClasses = ilist.getTargetAlphabet().size();
    int numFeatures = ilist.getDataAlphabet().size();
    assert (ilist.size() > 0);
    // Notation from Della Pietra & Lafferty 1997, p.4
    // "p~", the empirical (feature, class) distribution
    double[][] p = new double[numClasses][numFeatures];
    // "q", the model's (feature, class) distribution
    double[][] q = new double[numClasses][numFeatures];
    // "alpha", the weight of the new feature
    double[][] alphas = new double[numClasses][numFeatures];
    int fli; // feature location index
    logger.info ("Starting klgains, #instances="+numInstances);
    double trueLabelWeightSum = 0;
    double modelLabelWeightSum = 0;
    // Actually pretty lame smoothing based on ghost-counts
    final boolean doingSmoothing = true;
    double numInExpectation = doingSmoothing ? (numInstances+1.0) : (numInstances);
    // Attempt some ad hoc smoothing; remove the "+1.0" in the line above if not doing smoothing
    if (doingSmoothing) {
      for (int i = 0; i < numClasses; i++)
        for (int j = 0; j < numFeatures; j++) {
          p[i][j] = q[i][j] = 1.0/(numInExpectation*numFeatures*numClasses);
          trueLabelWeightSum += p[i][j];
          modelLabelWeightSum += q[i][j];
        }
    }
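    // The nested loops below accumulate the (smoothed) expectations that
    // Della Pietra, Della Pietra & Lafferty (1997) write as p~[g] and q[g]:
    // for each (class, feature) pair, they sum the true label weight and the
    // model's predicted label weight over every instance in which the feature
    // fires, each divided by numInExpectation so that p[][] and q[][] sum to 1.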
    for (int i = 0; i < numInstances; i++) {
      assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      //double instanceWeight = ilist.getInstanceWeight(i);
      // The code below relies on labelWeights summing to 1 over all labels!
      for (int li = 0; li < numClasses; li++) {
        double trueLabelWeight = labeling.value (li) / numInExpectation;
        double modelLabelWeight = classifications[i].value(li) / numInExpectation;
        trueLabelWeightSum += trueLabelWeight;
        modelLabelWeightSum += modelLabelWeight;
        //if (i < 500) System.out.println ("i="+i+" li="+li+" true="+trueLabelWeight+" model="+modelLabelWeight);
        if (trueLabelWeight == 0 && modelLabelWeight == 0)
          continue;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          assert (fv.valueAtLocation(fl) == 1.0);
          //p[li][fli] += trueLabelWeight * instanceWeight / (numInstances+1);
          //q[li][fli] += modelLabelWeight * instanceWeight / (numInstances+1);
          p[li][fli] += trueLabelWeight;
          q[li][fli] += modelLabelWeight;
        }
      }
    }
    assert (Math.abs (trueLabelWeightSum - 1.0) < 0.001)
      : "trueLabelWeightSum should be 1.0, it was "+trueLabelWeightSum;
    assert (Math.abs (modelLabelWeightSum - 1.0) < 0.001)
      : "modelLabelWeightSum should be 1.0, it was "+modelLabelWeightSum;

    /*
    double psum = 0;
    double qsum = 0;
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        psum += p[i][j];
        qsum += q[i][j];
      }
    assert (Math.abs(psum - 1.0) < 0.0001) : "psum not 1.0! psum="+psum+" qsum="+qsum;
    assert (Math.abs(qsum - 1.0) < 0.0001) : "qsum not 1.0! psum="+psum+" qsum="+qsum;
    */

    // Closed-form \alpha for each candidate (binary-valued) Feature
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++)
        alphas[i][j] = Math.log ( (p[i][j]*(1.0-q[i][j])) / (q[i][j]*(1.0-p[i][j])) );
    //q = null;

    // "q[e^{\alpha g}]", p.4
    //System.out.println ("Calculating qeag...");
    double[][] qeag = new double[numClasses][numFeatures];
    modelLabelWeightSum = 0;
    for (int i = 0; i < ilist.size(); i++) {
      assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      for (int li = 0; li < numClasses; li++) {
        // q(\omega) = (classifications[i].value(li) / numInstances)
        double modelLabelWeight = classifications[i].value(li) / numInstances;
        modelLabelWeightSum += modelLabelWeight;
        // The following is now done once, outside the loop over instances (see below):
        // for (int fi = 0; fi < numFeatures; fi++) qeag[li][fi] += modelLabelWeight; // * 1.0;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          qeag[li][fli] += (modelLabelWeight * Math.exp (alphas[li][fli])) - modelLabelWeight;
        }
      }
    }
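    // At this point qeag[][] holds only the correction terms
    // q(x,c) * (e^{alpha} - 1) for (feature, class) pairs whose feature actually
    // fires in an instance.  The loop below adds the baseline mass
    // sum_{x,c} q(x) q(c|x) * e^{alpha * 0} = modelLabelWeightSum (~= 1.0)
    // to every entry, completing the expectation "q[e^{\alpha g}]" of p.4.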
    for (int li = 0; li < numClasses; li++)
      for (int fi = 0; fi < numFeatures; fi++)
        // Assume that feature "fi" does not occur in "fv" and thus has value 0.
        // exp(alpha * 0) == 1.0
        // This factoring is possible because all features have value 1.0
        qeag[li][fi] += modelLabelWeightSum; // * 1.0;

    //System.out.println ("Calculating klgain values...");
    // Approximate gain G(alpha,g) = alpha * p~[g] - log q[e^{alpha g}], summed over classes,
    // keeping only Features with positive, finite alpha.
    double[] klgains = new double[numFeatures];
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++)
        if (alphas[i][j] > 0 && !Double.isInfinite(alphas[i][j]))
          klgains[j] += (alphas[i][j] * p[i][j]) - Math.log (qeag[i][j]);
          //klgains[j] += Math.abs(alphas[i][j] * p[i][j]);
          //klgains[j] += Math.abs(alphas[i][j]);

    if (true) {
      logger.info ("klgains.length="+klgains.length);
      // Log diagnostics for roughly one feature in every hundred.
      // (Math.max guards against division by zero when numFeatures < 100.)
      int logInterval = Math.max (1, numFeatures/100);
      for (int j = 0; j < numFeatures; j++) {
        if (j % logInterval == 0) {
          for (int i = 0; i < numClasses; i++) {
            logger.info ("c="+i+" p["+ilist.getDataAlphabet().lookupObject(j)+"] = "+p[i][j]);
            logger.info ("c="+i+" q["+ilist.getDataAlphabet().lookupObject(j)+"] = "+q[i][j]);
            logger.info ("c="+i+" alphas["+ilist.getDataAlphabet().lookupObject(j)+"] = "+alphas[i][j]);
            logger.info ("c="+i+" qeag["+ilist.getDataAlphabet().lookupObject(j)+"] = "+qeag[i][j]);
          }
          logger.info ("klgains["+ilist.getDataAlphabet().lookupObject(j)+"] = "+klgains[j]);
        }
      }
    }
    return klgains;
  }

  public KLGain (InstanceList ilist, LabelVector[] classifications)
  {
    super (ilist.getDataAlphabet(), calcKLGains (ilist, classifications));
  }

  private static LabelVector[] getLabelVectorsFromClassifications (Classification[] c)
  {
    LabelVector[] ret = new LabelVector[c.length];
    for (int i = 0; i < c.length; i++)
      ret[i] = c[i].getLabelVector();
    return ret;
  }

  public KLGain (InstanceList ilist, Classification[] classifications)
  {
    super (ilist.getDataAlphabet(),
           calcKLGains (ilist, getLabelVectorsFromClassifications(classifications)));
  }

}
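// Usage sketch (not part of the original class): a minimal example of how KLGain
// might be used to rank features.  It assumes you already have a labeled
// InstanceList `ilist` and a Classification[] `classifications` produced by running
// a trained MALLET classifier over the same instances; those variable names are
// hypothetical.  getObjectAtRank() and getValueAtRank() are inherited from
// RankedFeatureVector, and numLocations() from SparseVector.
//
//   KLGain klgain = new KLGain (ilist, classifications);
//   int numToPrint = Math.min (20, klgain.numLocations());
//   for (int rank = 0; rank < numToPrint; rank++)
//     System.out.println (klgain.getObjectAtRank(rank) + " = " + klgain.getValueAtRank(rank));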