/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
Information gain of the absence/presence of each feature.
Note that we aren't attending to the feature's value, and MALLET doesn't currently
have any support at all for categorical features.
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.types;
/**
 * Ranked vector of per-feature information gains, computed from the
 * presence/absence of each feature with respect to the target labels.
 *
 * <p>A feature counts as "present" in an instance when its value is strictly
 * greater than zero; the magnitude of the value is otherwise ignored.
 * Entropies are measured in bits (log base 2).
 *
 * <p>The computation itself uses only local state, so instances built
 * concurrently do not influence each other's results.
 */
public class InfoGain extends RankedFeatureVector
{
	// Historical hand-off channel between calcInfoGains() and the constructor.
	// These are still written (last writer wins) in case other code in this
	// package reads them, but this class no longer reads them, so instances
	// built concurrently cannot corrupt each other.
	static double staticBaseEntropy;
	static LabelVector staticBaseLabelDistribution;

	// xxx Yuck. Figure out how to remove this.
	// Not strictly part of a list of feature info gains, but convenient and efficient
	// for ml.classify.DecisionTree
	double baseEntropy;
	LabelVector baseLabelDistribution;

	/**
	 * Immutable bundle of everything computed in one pass over an InstanceList,
	 * so the results can reach the constructor without going through static state.
	 */
	private static final class Stats
	{
		final double[] infogains;
		final double baseEntropy;
		final LabelVector baseLabelDistribution;

		Stats (double[] infogains, double baseEntropy, LabelVector baseLabelDistribution)
		{
			this.infogains = infogains;
			this.baseEntropy = baseEntropy;
			this.baseLabelDistribution = baseLabelDistribution;
		}
	}

	/**
	 * Computes the information gain of every feature in the data alphabet of
	 * <code>ilist</code>, together with the entropy and distribution of the labels.
	 *
	 * <p>Each (instance, label) pair contributes
	 * <code>labelWeight * instanceWeight</code> to the counts. For a feature f the
	 * result is <code>H(C) - P(f) H(C|f) - P(not f) H(C|not f)</code>.
	 *
	 * @param ilist the instances; each instance's data is cast to FeatureVector
	 * @return the info gains (indexed by feature), base entropy and base label
	 *         distribution. When the total weighted count is zero, all info gains
	 *         and the base entropy are 0.
	 */
	private static Stats calcStats (InstanceList ilist)
	{
		final double log2 = Math.log(2);
		int numInstances = ilist.size();
		int numClasses = ilist.getTargetAlphabet().size();
		int numFeatures = ilist.getDataAlphabet().size();
		double[] infogains = new double[numFeatures];
		double[][] targetFeatureCount = new double[numClasses][numFeatures];
		double[] featureCountSum = new double[numFeatures];
		double[] targetCount = new double[numClasses];
		double targetCountSum = 0;

		// Populate targetFeatureCount, et al
		for (int i = 0; i < numInstances; i++) {
			Instance inst = ilist.get(i);
			Labeling labeling = inst.getLabeling ();
			FeatureVector fv = (FeatureVector) inst.getData ();
			double instanceWeight = ilist.getInstanceWeight(i);
			// The code below relies on labelWeights summing to 1 over all labels!
			double labelWeightSum = 0;
			for (int ll = 0; ll < labeling.numLocations(); ll++) {
				int li = labeling.indexAtLocation (ll);
				double labelWeight = labeling.valueAtLocation (ll);
				labelWeightSum += labelWeight;
				if (labelWeight == 0) continue;
				double count = labelWeight * instanceWeight;
				for (int fl = 0; fl < fv.numLocations(); fl++) {
					int fli = fv.indexAtLocation(fl);
					// xxx Is this right?  What should we do about negative values?
					// Whatever is decided here should also go in DecisionTree.split()
					if (fv.valueAtLocation(fl) > 0) {
						targetFeatureCount[li][fli] += count;
						featureCountSum[fli] += count;
					}
				}
				targetCount[li] += count;
				targetCountSum += count;
			}
			assert (Math.abs (labelWeightSum - 1.0) < 0.0001);
		}

		if (targetCountSum == 0) {
			// No weighted mass at all: every info gain stays 0.
			// xxx Should the base entropy instead be infinite?
			LabelVector emptyDistribution =
				new LabelVector ((LabelAlphabet)ilist.getTargetAlphabet(), targetCount);
			staticBaseEntropy = 0.0;
			staticBaseLabelDistribution = emptyDistribution;
			return new Stats (infogains, 0.0, emptyDistribution);
		}
		assert (targetCountSum > 0) : targetCountSum;

		// Calculate the overall entropy of the labels, ignoring the features
		double[] classDistribution = new double[numClasses];
		double baseEntropy = 0;
		for (int li = 0; li < numClasses; li++) {
			double p = targetCount[li]/targetCountSum;
			classDistribution[li] = p;
			assert (p <= 1.0) : p;
			if (p != 0)
				baseEntropy -= p * Math.log(p) / log2;
		}
		LabelVector baseLabelDistribution =
			new LabelVector ((LabelAlphabet)ilist.getTargetAlphabet(), classDistribution);
		staticBaseEntropy = baseEntropy;
		staticBaseLabelDistribution = baseLabelDistribution;

		// Calculate the InfoGain of each feature
		for (int fi = 0; fi < numFeatures; fi++) {
			// Entropy of the labels among instances where the feature is present.
			double featurePresentEntropy = 0;
			double norm = featureCountSum[fi];
			if (norm > 0) {
				for (int li = 0; li < numClasses; li++) {
					double p = targetFeatureCount[li][fi]/norm;
					assert (p <= 1.00000001) : p;
					if (p != 0)
						featurePresentEntropy -= p * Math.log(p) / log2;
				}
			}
			assert (!Double.isNaN(featurePresentEntropy)) : fi;

			// Entropy of the labels among instances where the feature is absent.
			norm = targetCountSum-featureCountSum[fi];
			double featureAbsentEntropy = 0;
			if (norm > 0) {
				for (int li = 0; li < numClasses; li++) {
					double p = (targetCount[li]-targetFeatureCount[li][fi])/norm;
					assert (p <= 1.00000001) : p;
					if (p != 0)
						featureAbsentEntropy -= p * Math.log(p) / log2;
				}
			}
			assert (!Double.isNaN(featureAbsentEntropy)) : fi;

			// Uses the local baseEntropy (not the static) so that a concurrent
			// computation cannot change the value mid-loop.
			infogains[fi] = (baseEntropy
				- (featureCountSum[fi]/targetCountSum) * featurePresentEntropy
				- ((targetCountSum-featureCountSum[fi])/targetCountSum) * featureAbsentEntropy);
			assert (!Double.isNaN(infogains[fi])) : fi;
		}
		return new Stats (infogains, baseEntropy, baseLabelDistribution);
	}

	/**
	 * Builds the ranking from already-computed statistics; exists so the public
	 * constructor can compute everything once before calling <code>super</code>.
	 */
	private InfoGain (InstanceList ilist, Stats stats)
	{
		super (ilist.getDataAlphabet(), stats.infogains);
		baseEntropy = stats.baseEntropy;
		baseLabelDistribution = stats.baseLabelDistribution;
	}

	/**
	 * Computes the information gain of each feature over the given instances.
	 *
	 * @param ilist the instances to compute information gain from
	 */
	public InfoGain (InstanceList ilist)
	{
		this (ilist, calcStats (ilist));
	}

	/**
	 * Wraps precomputed information gains. The base entropy and base label
	 * distribution are not set by this constructor and remain 0.0 and
	 * <code>null</code> respectively.
	 *
	 * @param vocab the feature alphabet the values are indexed by
	 * @param infogains one information-gain value per feature
	 */
	public InfoGain (Alphabet vocab, double[] infogains)
	{
		super (vocab, infogains);
	}

	/**
	 * @return the entropy (in bits) of the label distribution, ignoring features;
	 *         0.0 if this instance was built from precomputed values
	 */
	public double getBaseEntropy ()
	{
		return baseEntropy;
	}

	/**
	 * @return the label distribution of the instances, ignoring features;
	 *         <code>null</code> if this instance was built from precomputed values
	 */
	public LabelVector getBaseLabelDistribution ()
	{
		return baseLabelDistribution;
	}

	/** Factory that ranks features by information gain. */
	public static class Factory implements RankedFeatureVector.Factory
	{
		public Factory ()
		{
		}

		/**
		 * @param ilist the instances to compute information gain from
		 * @return a new InfoGain computed over <code>ilist</code>
		 */
		public RankedFeatureVector newRankedFeatureVector (InstanceList ilist)
		{
			return new InfoGain (ilist);
		}
	}
}