/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   Information gain of the absence/presence of each feature.  Note that
   we aren't attending to the feature's value, and MALLET doesn't
   currently have any support at all for categorical features.

   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package cc.mallet.types;

public class InfoGain extends RankedFeatureVector
{
  // xxx This is DISGUSTINGLY non-thread-safe.
  static double staticBaseEntropy;
  static LabelVector staticBaseLabelDistribution;

  // xxx Yuck.  Figure out how to remove this.
  // Not strictly part of a list of feature info gains, but convenient and
  // efficient for ml.classify.DecisionTree
  double baseEntropy;
  LabelVector baseLabelDistribution;

  private static double[] calcInfoGains (InstanceList ilist)
  {
    final double log2 = Math.log(2);
    int numInstances = ilist.size();
    int numClasses = ilist.getTargetAlphabet().size();
    int numFeatures = ilist.getDataAlphabet().size();
    double[] infogains = new double[numFeatures];
    double[][] targetFeatureCount = new double[numClasses][numFeatures];
    double[] featureCountSum = new double[numFeatures];
    double[] targetCount = new double[numClasses];
    double targetCountSum = 0;
    double flv;   // feature location value
    int fli;      // feature location index
    double count;

    // Populate targetFeatureCount, et al
    for (int i = 0; i < ilist.size(); i++) {
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      double instanceWeight = ilist.getInstanceWeight(i);
      // The code below relies on labelWeights summing to 1 over all labels!
      double labelWeightSum = 0;
      for (int ll = 0; ll < labeling.numLocations(); ll++) {
        int li = labeling.indexAtLocation (ll);
        double labelWeight = labeling.valueAtLocation (ll);
        labelWeightSum += labelWeight;
        if (labelWeight == 0) continue;
        count = labelWeight * instanceWeight;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          // xxx Is this right?  What should we do about negative values?
          // Whatever is decided here should also go in DecisionTree.split()
          if (fv.valueAtLocation(fl) > 0) {
            targetFeatureCount[li][fli] += count;
            featureCountSum[fli] += count;
          }
        }
        targetCount[li] += count;
        targetCountSum += count;
      }
      assert (Math.abs (labelWeightSum - 1.0) < 0.0001);
    }

    if (targetCountSum == 0) {
      staticBaseEntropy = 0.0;  // xxx Should this instead be infinite?
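      // Degenerate case: the instance list carries no label mass at all, so
      // every feature's gain stays at its initialized value of 0 and the raw
      // (all-zero) counts are reported as the base label distribution.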
      staticBaseLabelDistribution = new LabelVector ((LabelAlphabet)ilist.getTargetAlphabet(), targetCount);
      return infogains;
    }
    assert (targetCountSum > 0) : targetCountSum;

    double p;
    double[] classDistribution = new double[numClasses];

    // Calculate the overall entropy of the labels, ignoring the features
    staticBaseEntropy = 0;
    //System.out.print ("targetCount "); Vector.print (targetCount);
    //System.out.println ("targetCountSum = "+targetCountSum);
    for (int li = 0; li < numClasses; li++) {
      p = targetCount[li]/targetCountSum;
      classDistribution[li] = p;
      assert (p <= 1.0) : p;
      if (p != 0)
        staticBaseEntropy -= p * Math.log(p) / log2;
    }
    staticBaseLabelDistribution = new LabelVector ((LabelAlphabet)ilist.getTargetAlphabet(), classDistribution);
    //System.out.println ("Total class entropy = "+staticBaseEntropy);

    // Calculate the InfoGain of each feature:
    //   infogain(f) = baseEntropy
    //                 - p(f present) * entropy(labels | f present)
    //                 - p(f absent)  * entropy(labels | f absent)
    for (int fi = 0; fi < numFeatures; fi++) {
      double featurePresentEntropy = 0;
      double norm = featureCountSum[fi];
      if (norm > 0) {
        for (int li = 0; li < numClasses; li++) {
          p = targetFeatureCount[li][fi]/norm;
          assert (p <= 1.00000001) : p;
          if (p != 0)
            featurePresentEntropy -= p * Math.log(p) / log2;
        }
      }
      assert (!Double.isNaN(featurePresentEntropy)) : fi;
      norm = targetCountSum - featureCountSum[fi];
      double featureAbsentEntropy = 0;
      if (norm > 0) {
        for (int li = 0; li < numClasses; li++) {
          p = (targetCount[li]-targetFeatureCount[li][fi])/norm;
          assert (p <= 1.00000001) : p;
          if (p != 0)
            featureAbsentEntropy -= p * Math.log(p) / log2;
        }
      }
      assert (!Double.isNaN(featureAbsentEntropy)) : fi;
      //Alphabet dictionary = ilist.getDataAlphabet();
      //System.out.println ("Feature="+dictionary.lookupSymbol(fi)+" presentWeight="
      //                    +(featureCountSum[fi]/targetCountSum)+" absentWeight="
      //                    +((targetCountSum-featureCountSum[fi])/targetCountSum)+" presentEntropy="
      //                    +featurePresentEntropy+" absentEntropy="
      //                    +featureAbsentEntropy);
      infogains[fi] = (staticBaseEntropy
                       - (featureCountSum[fi]/targetCountSum) * featurePresentEntropy
                       - ((targetCountSum-featureCountSum[fi])/targetCountSum) * featureAbsentEntropy);
      assert (!Double.isNaN(infogains[fi])) : fi;
    }
    return infogains;
  }

  public InfoGain (InstanceList ilist)
  {
    super (ilist.getDataAlphabet(), calcInfoGains (ilist));
    baseEntropy = staticBaseEntropy;
    baseLabelDistribution = staticBaseLabelDistribution;
  }

  public InfoGain (Alphabet vocab, double[] infogains)
  {
    super (vocab, infogains);
  }

  public double getBaseEntropy ()
  {
    return baseEntropy;
  }

  public LabelVector getBaseLabelDistribution ()
  {
    return baseLabelDistribution;
  }

  public static class Factory implements RankedFeatureVector.Factory
  {
    public Factory ()
    {
    }

    public RankedFeatureVector newRankedFeatureVector (InstanceList ilist)
    {
      return new InfoGain (ilist);
    }
  }
}
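
// A minimal usage sketch (not part of the original source), assuming a
// populated InstanceList `ilist` and the getObjectAtRank()/getValueAtRank()
// accessors inherited from RankedFeatureVector:
//
//   InfoGain ig = new InfoGain (ilist);
//   System.out.println ("Base entropy = " + ig.getBaseEntropy());
//   for (int rank = 0; rank < 10 && rank < ig.numLocations(); rank++)
//     System.out.println (ig.getObjectAtRank(rank) + " " + ig.getValueAtRank(rank));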