/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   The "gain" obtained by adding a feature to a conditional exponential model.
   Based on the joint exponential model in Della Pietra, Della Pietra & Lafferty, 1997.
   We smooth using a Gaussian prior.
   Note that we use Math.log(), not log-base-2, so the units are not "bits".

   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package cc.mallet.types;

import java.util.logging.*;
import java.io.*;

import cc.mallet.classify.Classification;
import cc.mallet.util.MalletLogger;

public class ExpGain extends RankedFeatureVector
{
  private static Logger logger = MalletLogger.getLogger(ExpGain.class.getName());

  // ExpGain of a feature, f, is defined in terms of MaxEnt-type feature+class "Feature"s, F,
  //   F = f,c
  // ExpGain of a Feature, F, is
  //   G(F) = KL(p~[C]||q[C]) - KL(p~[C]||q_F[C])
  // where p~[] is the empirical distribution, according to the true class label distribution,
  // and q[] is the distribution from the (imperfect) classifier,
  // and q_F[] is the distribution from the (imperfect) classifier with F added
  //   and F's weight adjusted (but none of the other weights adjusted).
  // ExpGain of a feature, f, is
  //   G(f) = sum_c G(f,c)
  // It would be more accurate to return a gain number for each "feature/class" combination,
  // but here we simply return the gain(feature) = \sum_{class} gain(feature,class).

  // xxx Not ever used.  Remove them.
  boolean usingHyperbolicPrior = false;
  double hyperbolicSlope = 0.2;
  double hyperbolicSharpness = 10.0;

  private static double[] calcExpGains (InstanceList ilist, LabelVector[] classifications,
                                        double gaussianPriorVariance)
  {
    int numInstances = ilist.size();
    int numClasses = ilist.getTargetAlphabet().size();
    int numFeatures = ilist.getDataAlphabet().size();
    assert (ilist.size() > 0);
    // Notation from Della Pietra & Lafferty 1997, p.4
    // "p~"
    double[][] p = new double[numClasses][numFeatures];
    // "q"
    double[][] q = new double[numClasses][numFeatures];
    // "alpha", the weight of the new feature
    double[][] alphas = new double[numClasses][numFeatures];
    int fli;    // feature location index
    double flv; // feature location value
    logger.info ("Starting klgains, #instances="+numInstances);
    double trueLabelWeightSum = 0;
    double modelLabelWeightSum = 0;
    // Calculate p~[f] and q[f]
    for (int i = 0; i < numInstances; i++) {
      assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      //double instanceWeight = ilist.getInstanceWeight(i);
      // The code below relies on labelWeights summing to 1 over all labels!
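      // Accumulate, for every feature present in this instance, the empirical expectation
      // p~[label][feature] (from the true labeling) and the model expectation
      // q[label][feature] (from the classifier's predicted label distribution).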
      double perInstanceModelLabelWeight = 0;
      for (int li = 0; li < numClasses; li++) {
        double trueLabelWeight = labeling.value (li);
        double modelLabelWeight = classifications[i].value(li);
        trueLabelWeightSum += trueLabelWeight;
        modelLabelWeightSum += modelLabelWeight;
        perInstanceModelLabelWeight += modelLabelWeight;
        //if (i < 500) System.out.println ("i="+i+" li="+li+" true="+trueLabelWeight+" model="+modelLabelWeight);
        if (trueLabelWeight == 0 && modelLabelWeight == 0)
          continue;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          assert (fv.valueAtLocation(fl) == 1.0);
          // xxx Note that we are not attending to instanceWeight here!
          //p[li][fli] += trueLabelWeight * instanceWeight / (numInstances+1);
          //q[li][fli] += modelLabelWeight * instanceWeight / (numInstances+1);
          p[li][fli] += trueLabelWeight;
          q[li][fli] += modelLabelWeight;
        }
      }
      assert (Math.abs (perInstanceModelLabelWeight - 1.0) < 0.001);
    }
    assert (Math.abs (trueLabelWeightSum/numInstances - 1.0) < 0.001)
      : "trueLabelWeightSum should be 1.0, it was "+trueLabelWeightSum;
    assert (Math.abs (modelLabelWeightSum/numInstances - 1.0) < 0.001)
      : "modelLabelWeightSum should be 1.0, it was "+modelLabelWeightSum;

    /*
    double psum = 0;
    double qsum = 0;
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        psum += p[i][j];
        qsum += q[i][j];
      }
    assert (Math.abs(psum - 1.0) < 0.0001) : "psum not 1.0! psum="+psum+" qsum="+qsum;
    assert (Math.abs(qsum - 1.0) < 0.0001) : "qsum not 1.0! psum="+psum+" qsum="+qsum;
    */

    // Determine the alphas.
    // We can't do it in closed form as in the Della Pietra paper, because here we have
    // a conditional MaxEnt model, so we do it by Newton-Raphson...
    // ...initializing by the broken, inappropriate joint-case closed-form solution:
    //for (int i = 0; i < numClasses; i++)
    //  for (int j = 0; j < numFeatures; j++)
    //    alphas[i][j] = Math.log ( (p[i][j]*(1.0-q[i][j])) / (q[i][j]*(1.0-p[i][j])) );

    double[][] dalphas = new double[numClasses][numFeatures];        // first derivative
    double[][] alphaChangeOld = new double[numClasses][numFeatures]; // change in alpha, last iteration
    double[][] alphaMax = new double[numClasses][numFeatures];       // upper bisection bound on alpha
    double[][] alphaMin = new double[numClasses][numFeatures];       // lower bisection bound on alpha
    double[][] ddalphas = new double[numClasses][numFeatures];       // second derivative
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        alphaMax[i][j] = Double.POSITIVE_INFINITY;
        alphaMin[i][j] = Double.NEGATIVE_INFINITY;
      }
    double maxAlphachange = 0;
    double maxDalpha = 99;
    int maxNewtonSteps = 50; // xxx Change to more?
    // alphas[][] are initialized to zero
    for (int newton = 0; maxDalpha > 1.0E-8 && newton < maxNewtonSteps; newton++) {
      //System.out.println ("Newton iteration "+newton);
      // The hyperbolic-prior branch is disabled; both branches currently apply the
      // Gaussian-prior terms to the derivatives.
      if (false /*usingHyperbolicPrior*/) {
        for (int i = 0; i < numClasses; i++)
          for (int j = 0; j < numFeatures; j++) {
            dalphas[i][j] = p[i][j] - (alphas[i][j] / gaussianPriorVariance);
            ddalphas[i][j] = -1 / gaussianPriorVariance;
          }
      } else {
        // Gaussian prior
        for (int i = 0; i < numClasses; i++)
          for (int j = 0; j < numFeatures; j++) {
            dalphas[i][j] = p[i][j] - (alphas[i][j] / gaussianPriorVariance);
            ddalphas[i][j] = -1 / gaussianPriorVariance;
          }
      }
      for (int i = 0; i < ilist.size(); i++) {
        assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
        Instance inst = ilist.get(i);
        Labeling labeling = inst.getLabeling ();
        FeatureVector fv = (FeatureVector) inst.getData ();
        // xxx This assumes binary-valued features.  What about "tied" weights?
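        // For each feature present in this instance, subtract the alpha-adjusted model
        // probability of each label from dalpha, and subtract the corresponding Bernoulli
        // variance from ddalpha; summed over instances this yields the first and second
        // derivatives of the gain used in the Newton step below.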
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          for (int li = 0; li < numClasses; li++) {
            double modelLabelWeight = classifications[i].value(li);
            double expalpha = Math.exp (alphas[li][fli]);
            double numerator = modelLabelWeight * expalpha;
            double denominator = numerator + (1.0 - modelLabelWeight);
            dalphas[li][fli] -= numerator / denominator;
            ddalphas[li][fli] += ((numerator*numerator) / (denominator*denominator)
                                  - (numerator/denominator));
          }
        }
      }
      // We now have the first and second derivatives for this Newton step.
      // Run tests on the alphas and their derivatives, and do a Newton step.
      double alphachange, newalpha, oldalpha;
      maxAlphachange = maxDalpha = 0;
      for (int i = 0; i < numClasses; i++)
        for (int j = 0; j < numFeatures; j++) {
          alphachange = - (dalphas[i][j] / ddalphas[i][j]);
          if (p[i][j] == 0 && q[i][j] == 0)
            continue;
          else if (false && (i*numFeatures+j) % (numClasses*numFeatures/2000) == 0
                   || Double.isNaN(alphas[i][j])
                   || Double.isNaN(alphachange))
            // Print just a sampling of them...
            logger.info ("alpha["+i+"]["+j+"]="+alphas[i][j]+
                         " p="+p[i][j]+
                         " q="+q[i][j]+
                         " dalpha="+dalphas[i][j]+
                         " ddalpha="+ddalphas[i][j]+
                         " alphachange="+alphachange+
                         " min="+alphaMin[i][j]+
                         " max="+alphaMax[i][j]);
          if (Double.isNaN(alphas[i][j]) || Double.isNaN(dalphas[i][j]) || Double.isNaN(ddalphas[i][j])
              || Double.isInfinite(alphas[i][j]) || Double.isInfinite(dalphas[i][j]) || Double.isInfinite(ddalphas[i][j]))
            alphachange = 0;
          // assert (!Double.isNaN(alphas[i][j]));
          // assert (!Double.isNaN(dalphas[i][j]));
          // assert (!Double.isNaN(ddalphas[i][j]));
          oldalpha = alphas[i][j];
          // xxx assert (ddalphas[i][j] <= 0);
          //assert (Math.abs(alphachange) < 100.0) : alphachange; // xxx arbitrary?
          // Trying to prevent a cycle
          if (Math.abs(alphachange + alphaChangeOld[i][j]) / Math.abs(alphachange) < 0.01)
            newalpha = alphas[i][j] + alphachange / 2;
          else
            newalpha = alphas[i][j] + alphachange;
          if (alphachange < 0 && alphaMax[i][j] > alphas[i][j]) {
            //System.out.println ("Updating alphaMax["+i+"]["+j+"] = "+alphas[i][j]);
            alphaMax[i][j] = alphas[i][j];
          }
          if (alphachange > 0 && alphaMin[i][j] < alphas[i][j]) {
            //System.out.println ("Updating alphaMin["+i+"]["+j+"] = "+alphas[i][j]);
            alphaMin[i][j] = alphas[i][j];
          }
          if (newalpha <= alphaMax[i][j] && newalpha >= alphaMin[i][j])
            // Newton wants to jump to a point inside the boundaries; let it
            alphas[i][j] = newalpha;
          else {
            // Newton wants to jump to a point outside the boundaries; bisect instead
            assert (alphaMax[i][j] != Double.POSITIVE_INFINITY);
            assert (alphaMin[i][j] != Double.NEGATIVE_INFINITY);
            alphas[i][j] = alphaMin[i][j] + (alphaMax[i][j] - alphaMin[i][j]) / 2;
            //System.out.println ("Newton tried to exceed bounds; bisecting. dalphas["+i+"]["+j+"]="+dalphas[i][j]+" alphaMin="+alphaMin[i][j]+" alphaMax="+alphaMax[i][j]);
          }
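          // Record the step actually taken (for logging) and the largest remaining first
          // derivative, which the outer loop uses as its convergence test.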
          alphachange = alphas[i][j] - oldalpha;
          if (Math.abs(alphachange) > maxAlphachange)
            maxAlphachange = Math.abs (alphachange);
          if (Math.abs (dalphas[i][j]) > maxDalpha)
            maxDalpha = Math.abs (dalphas[i][j]);
          alphaChangeOld[i][j] = alphachange;
        }
      logger.info ("After "+newton+" Newton iterations, maximum alphachange="+maxAlphachange+
                   " dalpha="+maxDalpha);
    }

    // Allow some memory to be freed
    //q = null;
    ddalphas = dalphas = alphaChangeOld = alphaMin = alphaMax = null;

    // "q[e^{\alpha g}]", p.4
    //System.out.println ("Calculating qeag...");
    // Note that we are using a Gaussian prior, so we don't multiply by (1/numInstances)
    double[][] qeag = new double[numClasses][numFeatures];
    for (int i = 0; i < ilist.size(); i++) {
      assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
      Instance inst = ilist.get(i);
      Labeling labeling = inst.getLabeling ();
      FeatureVector fv = (FeatureVector) inst.getData ();
      int fvMaxLocation = fv.numLocations()-1;
      for (int li = 0; li < numClasses; li++) {
        double modelLabelWeight = classifications[i].value(li);
        // The following line is now done beforehand, outside the loop over instances:
        // for (int fi = 0; fi < numFeatures; fi++) qeag[li][fi] += modelLabelWeight; // * 1.0;
        for (int fl = 0; fl < fv.numLocations(); fl++) {
          fli = fv.indexAtLocation(fl);
          // When the value of this feature "g" is zero, a value of 1.0 should be included
          // in the expectation; we'll actually add all these at the end (pre-"assuming"
          // that all features have value zero).  Here we subtract the "assumed"
          // modelLabelWeight, and put in the true value based on the non-zero-valued feature "g".
          qeag[li][fli] += Math.log (modelLabelWeight * Math.exp (alphas[li][fli]) + (1-modelLabelWeight));
        }
      }
    }

    //System.out.println ("Calculating klgain values...");
    double[] klgains = new double[numFeatures];
    double klgainIncr, alpha;
    for (int i = 0; i < numClasses; i++)
      for (int j = 0; j < numFeatures; j++) {
        assert (!Double.isInfinite(alphas[i][j]));
        alpha = alphas[i][j];
        if (alpha == 0)
          continue;
        klgainIncr = (alpha * p[i][j]) - qeag[i][j] - (alpha*alpha/(2*gaussianPriorVariance));
        if (klgainIncr < 0) {
          if (false)
            logger.info ("WARNING: klgainIncr["+i+"]["+j+"]="+klgainIncr+
                         " alpha="+alphas[i][j]+
                         " feature="+ilist.getDataAlphabet().lookupObject(j)+
                         " class="+ilist.getTargetAlphabet().lookupObject(i));
        } else
          klgains[j] += klgainIncr;
      }

    if (false) {
      logger.info ("klgains.length="+klgains.length);
      for (int j = 0; j < numFeatures; j++) {
        if (j % (numFeatures/100) == 0) {
          for (int i = 0; i < numClasses; i++) {
            logger.info ("c="+i+" p["+ilist.getDataAlphabet().lookupObject(j)+"] = "+p[i][j]);
            logger.info ("c="+i+" q["+ilist.getDataAlphabet().lookupObject(j)+"] = "+q[i][j]);
            logger.info ("c="+i+" alphas["+ilist.getDataAlphabet().lookupObject(j)+"] = "+alphas[i][j]);
            logger.info ("c="+i+" qeag["+ilist.getDataAlphabet().lookupObject(j)+"] = "+qeag[i][j]);
          }
          logger.info ("klgains["+ilist.getDataAlphabet().lookupObject(j)+"] = "+klgains[j]);
        }
      }
    }

    return klgains;
  }

  public ExpGain (InstanceList ilist, LabelVector[] classifications, double gaussianPriorVariance)
  {
    super (ilist.getDataAlphabet(), calcExpGains (ilist, classifications, gaussianPriorVariance));
  }

  private static LabelVector[] getLabelVectorsFromClassifications (Classification[] c)
  {
    LabelVector[] ret = new LabelVector[c.length];
    for (int i = 0; i < c.length; i++)
      ret[i] = c[i].getLabelVector();
    return ret;
  }
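  /** Convenience constructor: converts each Classification (e.g. a classifier's output
      for one instance) to its LabelVector, then computes gains exactly as the
      LabelVector[] constructor does. */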
  public ExpGain (InstanceList ilist, Classification[] classifications, double gaussianPriorVariance)
  {
    super (ilist.getDataAlphabet(),
           calcExpGains (ilist, getLabelVectorsFromClassifications(classifications), gaussianPriorVariance));
  }

  public static class Factory implements RankedFeatureVector.Factory
  {
    LabelVector[] classifications;
    double gaussianPriorVariance = 10.0;

    public Factory (LabelVector[] classifications)
    {
      this.classifications = classifications;
    }

    public Factory (LabelVector[] classifications, double gaussianPriorVariance)
    {
      this.classifications = classifications;
      this.gaussianPriorVariance = gaussianPriorVariance;
    }

    public RankedFeatureVector newRankedFeatureVector (InstanceList ilist)
    {
      assert (ilist.getTargetAlphabet() == classifications[0].getAlphabet());
      return new ExpGain (ilist, classifications, gaussianPriorVariance);
    }

    // Serialization

    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 0;

    private void writeObject (ObjectOutputStream out) throws IOException
    {
      out.writeInt (CURRENT_SERIAL_VERSION);
      out.writeInt (classifications.length);
      for (int i = 0; i < classifications.length; i++)
        out.writeObject (classifications[i]);
    }

    private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
    {
      int version = in.readInt ();
      int n = in.readInt ();
      this.classifications = new LabelVector[n];
      for (int i = 0; i < n; i++)
        this.classifications[i] = (LabelVector) in.readObject ();
    }
  }
}
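/* A minimal usage sketch, not part of the original MALLET sources: rank the features of a
   labeled InstanceList by ExpGain, using the label distributions produced by a freshly
   trained MaxEnt classifier.  The trainer choice, the prior variance of 10.0, the top-20
   cutoff, and this helper class itself are illustrative assumptions, not part of the
   ExpGain API. */
class ExpGainUsageSketch
{
  static void printTopFeaturesByExpGain (InstanceList trainingData)
  {
    // Train a classifier; its per-instance output supplies the model distribution q.
    cc.mallet.classify.Classifier classifier =
      new cc.mallet.classify.MaxEntTrainer().train (trainingData);
    // Classify the same instances to get one Classification (a label distribution) per instance.
    Classification[] classifications =
      classifier.classify (trainingData).toArray (new Classification[0]);
    // Gains are summed over classes and ranked from highest to lowest by RankedFeatureVector.
    ExpGain expGain = new ExpGain (trainingData, classifications, 10.0);
    for (int rank = 0; rank < 20 && rank < expGain.numLocations(); rank++)
      System.out.println (expGain.getObjectAtRank(rank) + "\t" + expGain.getValueAtRank(rank));
  }
}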