/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.classify;

import java.util.logging.*;
import java.util.*;
import java.io.*;

import cc.mallet.classify.Classifier;
import cc.mallet.optimize.ConjugateGradient;
import cc.mallet.optimize.InvalidOptimizableException;
import cc.mallet.optimize.LimitedMemoryBFGS;
import cc.mallet.optimize.Optimizable;
import cc.mallet.optimize.OptimizationException;
import cc.mallet.optimize.Optimizer;
import cc.mallet.optimize.OrthantWiseLimitedMemoryBFGS;
import cc.mallet.optimize.tests.*;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.ExpGain;
import cc.mallet.types.FeatureInducer;
import cc.mallet.types.FeatureSelection;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.GradientGain;
import cc.mallet.types.InfoGain;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelVector;
import cc.mallet.types.Labeling;
import cc.mallet.types.MatrixOps;
import cc.mallet.types.RankedFeatureVector;
import cc.mallet.types.Vector;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.MalletProgressMessageLogger;
import cc.mallet.util.Maths;

// Does not currently handle instances that are labeled with distributions
// instead of a single label.
/**
 * The trainer for a Maximum Entropy classifier.
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */
public class MaxEntTrainer extends ClassifierTrainer<MaxEnt>
    implements ClassifierTrainer.ByOptimization<MaxEnt>, Boostable, Serializable {

    private static Logger logger = MalletLogger.getLogger(MaxEntTrainer.class.getName());
    private static Logger progressLogger = MalletProgressMessageLogger.getLogger(MaxEntTrainer.class.getName()+"-pl");

    int numIterations = Integer.MAX_VALUE;

    //public static final String EXP_GAIN = "exp";
    //public static final String GRADIENT_GAIN = "grad";
    //public static final String INFORMATION_GAIN = "info";

    // xxx Why does TestMaximizable fail when this variance is very small?
    static final double DEFAULT_GAUSSIAN_PRIOR_VARIANCE = 1;
    static final double DEFAULT_L1_WEIGHT = 0.0;
    static final Class DEFAULT_MAXIMIZER_CLASS = LimitedMemoryBFGS.class;

    double gaussianPriorVariance = DEFAULT_GAUSSIAN_PRIOR_VARIANCE;
    double l1Weight = DEFAULT_L1_WEIGHT;
    Class maximizerClass = DEFAULT_MAXIMIZER_CLASS;

    InstanceList trainingSet = null;
    MaxEnt initialClassifier;

    MaxEntOptimizableByLabelLikelihood optimizable = null;
    Optimizer optimizer = null;

    //
    // CONSTRUCTORS
    //

    public MaxEntTrainer () {}

    /** Construct a MaxEnt trainer using a trained classifier as
     *  initial values. */
    public MaxEntTrainer (MaxEnt theClassifierToTrain) {
        this.initialClassifier = theClassifierToTrain;
    }
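    // A minimal usage sketch (illustrative only; `instances` and `someInstance`
    // are hypothetical placeholders for an InstanceList built through a Pipe,
    // with Label targets, and a single test Instance):
    //
    //   MaxEntTrainer trainer = new MaxEntTrainer();
    //   MaxEnt classifier = trainer.train(instances);
    //   Labeling labeling = classifier.classify(someInstance).getLabeling();
    //
    // classify() and getLabeling() are the standard Classifier/Classification
    // calls; only the variable names here are assumed.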
    /** Constructs a trainer with a parameter to avoid overtraining.  1.0 is
     *  the default value. */
    public MaxEntTrainer (double gaussianPriorVariance) {
        this.gaussianPriorVariance = gaussianPriorVariance;
    }

    //
    // CLASSIFIER OBJECT: stores parameters
    //

    public MaxEnt getClassifier () {
        if (optimizable != null)
            return optimizable.getClassifier();
        return initialClassifier;
    }

    /**
     * Initialize parameters using the provided classifier.
     */
    public void setClassifier (MaxEnt theClassifierToTrain) {
        // Is this necessary?  What if the caller is about to set the training set to something different? -akm
        assert (trainingSet == null || Alphabet.alphabetsMatch(theClassifierToTrain, trainingSet));
        if (this.initialClassifier != theClassifierToTrain) {
            this.initialClassifier = theClassifierToTrain;
            optimizable = null;
            optimizer = null;
        }
    }

    //
    // OPTIMIZABLE OBJECT: implements value and gradient functions
    //

    public Optimizable getOptimizable () {
        return optimizable;
    }

    public MaxEntOptimizableByLabelLikelihood getOptimizable (InstanceList trainingSet) {
        return getOptimizable(trainingSet, getClassifier());
    }

    public MaxEntOptimizableByLabelLikelihood getOptimizable (InstanceList trainingSet, MaxEnt initialClassifier) {

        if (trainingSet != this.trainingSet || this.initialClassifier != initialClassifier) {

            this.trainingSet = trainingSet;
            this.initialClassifier = initialClassifier;

            if (optimizable == null || optimizable.trainingList != trainingSet) {
                optimizable = new MaxEntOptimizableByLabelLikelihood (trainingSet, initialClassifier);

                if (l1Weight == 0.0) {
                    optimizable.setGaussianPriorVariance(gaussianPriorVariance);
                }
                else {
                    // The prior term for L1-regularized classifiers
                    //  is implemented as part of the optimizer,
                    //  so don't include a prior calculation in the value and
                    //  gradient functions.
                    optimizable.useNoPrior();
                }

                optimizer = null;
            }
        }

        return optimizable;
    }

    //
    // OPTIMIZER OBJECT: maximizes value function
    //

    public Optimizer getOptimizer () {
        if (optimizer == null && optimizable != null) {
            optimizer = new ConjugateGradient(optimizable);
        }
        return optimizer;
    }

    /** This method is called by the train method.
     *  This is the main entry point for the optimizable and optimizer
     *  components.
     */
    public Optimizer getOptimizer (InstanceList trainingSet) {

        // If the data is not set, or has changed,
        // initialize the optimizable object and
        // replace the optimizer.
        if (trainingSet != this.trainingSet || optimizable == null) {
            getOptimizable(trainingSet);
            optimizer = null;
        }

        // Build a new optimizer
        if (optimizer == null) {
            if (l1Weight == 0.0) {
                // If l1Weight is 0, orthant-wise L-BFGS devolves to
                //  standard L-BFGS, but this implementation may be faster.
                optimizer = new LimitedMemoryBFGS(optimizable);
            }
            else {
                // The L1 prior is applied inside the orthant-wise optimizer
                //  (the optimizable was built with useNoPrior() above), so
                //  plain L-BFGS would silently train an unregularized model.
                optimizer = new OrthantWiseLimitedMemoryBFGS(optimizable, l1Weight);
            }
        }
        return optimizer;
    }

    /**
     * Specifies the maximum number of iterations to run during a single call
     * to <code>train</code> or <code>trainWithFeatureInduction</code>.  Not
     * currently functional.
     * @return This trainer
     */
    // XXX Since we maximize before using numIterations, this doesn't work.
    // Is that a bug?  If so, should the default numIterations be higher?
    public MaxEntTrainer setNumIterations (int i) {
        numIterations = i;
        return this;
    }

    public int getIteration () {
        if (optimizable == null)
            return 0;
        else
            return Integer.MAX_VALUE;
            // return optimizer.getIteration ();
    }
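    // Sketch of driving the optimizable/optimizer pair by hand rather than
    // calling train(); this mirrors what train() does internally. `instances`
    // is a hypothetical pre-built InstanceList.
    //
    //   MaxEntTrainer trainer = new MaxEntTrainer();
    //   Optimizer opt = trainer.getOptimizer(instances); // also builds the optimizable
    //   try {
    //       boolean converged = opt.optimize(10); // run at most 10 iterations
    //   } catch (OptimizationException e) {
    //       // line searches can fail near convergence; often treated as converged
    //   }
    //   MaxEnt classifier = trainer.getClassifier();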
    /**
     * Sets a parameter to prevent overtraining.  A smaller variance for the prior
     * means that feature weights are expected to hover closer to 0, so extra
     * evidence is required to set a higher weight.
     * @return This trainer
     */
    public MaxEntTrainer setGaussianPriorVariance (double gaussianPriorVariance) {
        this.gaussianPriorVariance = gaussianPriorVariance;
        return this;
    }

    /**
     * Use an L1 prior.  Larger values mean parameters will be closer to 0.
     *  Note that this setting overrides any Gaussian prior.
     */
    public MaxEntTrainer setL1Weight(double l1Weight) {
        this.l1Weight = l1Weight;
        return this;
    }

    public MaxEnt train (InstanceList trainingSet) {
        return train (trainingSet, numIterations);
    }

    public MaxEnt train (InstanceList trainingSet, int numIterations) {
        logger.fine ("trainingSet.size() = "+trainingSet.size());
        getOptimizer (trainingSet);  // This will set this.optimizer, this.optimizable

        for (int i = 0; i < numIterations; i++) {
            try {
                finishedTraining = optimizer.optimize (1);
            } catch (InvalidOptimizableException e) {
                e.printStackTrace();
                logger.warning("Catching InvalidOptimizableException; saying converged.");
                finishedTraining = true;
            } catch (OptimizationException e) {
                e.printStackTrace();
                logger.info ("Catching OptimizationException; saying converged.");
                finishedTraining = true;
            }
            if (finishedTraining)
                break;
        }

        // Run again only if no iteration limit was specified
        if (numIterations == Integer.MAX_VALUE) {
            // Run it again because in our and Sam Roweis' experience, BFGS can still
            // eke out more likelihood after first convergence by re-running without
            // being restricted by its gradient history.
            optimizer = null;
            getOptimizer(trainingSet);
            try {
                finishedTraining = optimizer.optimize ();
            } catch (InvalidOptimizableException e) {
                e.printStackTrace();
                logger.warning("Catching InvalidOptimizableException; saying converged.");
                finishedTraining = true;
            } catch (OptimizationException e) {
                e.printStackTrace();
                logger.info ("Catching OptimizationException; saying converged.");
                finishedTraining = true;
            }
        }
        //TestMaximizable.testValueAndGradientCurrentParameters (mt);
        progressLogger.info("\n"); // progress messages are on one line; move on.
        //logger.info("MaxEnt ngetValueCalls:"+getValueCalls()+"\nMaxEnt ngetValueGradientCalls:"+getValueGradientCalls());
        return optimizable.getClassifier();
    }
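    // Sketch contrasting the two priors (the constants are illustrative, not
    // recommendations; useful values depend on the data set, and `instances`
    // is again a hypothetical InstanceList):
    //
    //   // Gaussian (L2) prior: weights shrink smoothly toward 0
    //   MaxEnt dense = new MaxEntTrainer().setGaussianPriorVariance(0.1).train(instances);
    //
    //   // L1 prior: many weights driven exactly to 0, overriding the Gaussian prior
    //   MaxEnt sparse = new MaxEntTrainer().setL1Weight(1.0).train(instances);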
    /**
     * <p>Trains a maximum entropy model using feature selection and feature induction
     * (adding conjunctions of features as new features).</p>
     *
     * @param trainingData A list of <code>Instance</code>s whose <code>data</code>
     * fields are binary, augmentable <code>FeatureVector</code>s,
     * and whose <code>target</code> fields are <code>Label</code>s.
     * @param validationData [not currently used] As <code>trainingData</code>,
     * or <code>null</code>.
     * @param testingData As <code>trainingData</code>, or <code>null</code>.
     * @param evaluator The evaluator to track training progress and decide whether
     * to continue, or <code>null</code>.
     * @param totalIterations The maximum total number of training iterations,
     * including those taken during feature induction.
     * @param numIterationsBetweenFeatureInductions How many iterations to train
     * between one round of feature induction and the next; this should usually
     * be fairly small, like 5 or 10, to avoid overfitting with current features.
     * @param numFeatureInductions How many rounds of feature induction to run
     * before beginning normal training.
     * @param numFeaturesPerFeatureInduction The maximum number of features to
     * choose during each round of feature induction.
     *
     * @return The trained <code>MaxEnt</code> classifier
     */
    /*
    // added - cjmaloof@linc.cis.upenn.edu
    public Classifier trainWithFeatureInduction (InstanceList trainingData,
                                                 int totalIterations,
                                                 int numIterationsBetweenFeatureInductions,
                                                 int numFeatureInductions,
                                                 int numFeaturesPerFeatureInduction) {
        return trainWithFeatureInduction (trainingData,
                                          null,
                                          totalIterations,
                                          numIterationsBetweenFeatureInductions,
                                          numFeatureInductions,
                                          numFeaturesPerFeatureInduction,
                                          EXP_GAIN);
    }
    */

    /**
     * <p>Like the other version of <code>trainWithFeatureInduction</code>, but
     * allows some default options to be changed.</p>
     *
     * @param maxent An initial partially-trained classifier (default <code>null</code>).
     * This classifier may be modified during training.
     * @param gainName The estimate of gain (log-likelihood increase) we want our chosen
     * features to maximize.
     * Should be one of <code>MaxEntTrainer.EXP_GAIN</code>,
     * <code>MaxEntTrainer.GRADIENT_GAIN</code>, or
     * <code>MaxEntTrainer.INFORMATION_GAIN</code> (default <code>EXP_GAIN</code>).
     *
     * @return The trained <code>MaxEnt</code> classifier
     */
    /*
    // Temporarily removed until I figure out how to handle induceFeaturesFor (testData)
    public Classifier trainWithFeatureInduction (InstanceList trainingData,
                                                 int totalIterations,
                                                 int numIterationsBetweenFeatureInductions,
                                                 int numFeatureInductions,
                                                 int numFeaturesPerFeatureInduction,
                                                 String gainName) {

        // XXX This ought to be a parameter, except that setting it to true can
        // crash training ("Jump too small").
        boolean saveParametersDuringFI = false;

        Alphabet inputAlphabet = trainingData.getDataAlphabet();
        Alphabet outputAlphabet = trainingData.getTargetAlphabet();
        int trainingIteration = 0;
        int numLabels = outputAlphabet.size();

        MaxEnt maxent = getClassifier();

        // Initialize feature selection
        FeatureSelection globalFS = trainingData.getFeatureSelection();
        if (globalFS == null) {
            // Mask out all features; some will be added later by FeatureInducer.induceFeaturesFor(.)
            globalFS = new FeatureSelection (trainingData.getDataAlphabet());
            trainingData.setFeatureSelection (globalFS);
        }
        //if (validationData != null) validationData.setFeatureSelection (globalFS);
        //if (testingData != null) testingData.setFeatureSelection (globalFS);
        getOptimizer(trainingData); // This will initialize this.me so getClassifier() below works
        maxent.setFeatureSelection(globalFS);

        // Run feature induction
        for (int featureInductionIteration = 0;
             featureInductionIteration < numFeatureInductions;
             featureInductionIteration++) {

            // Print out some feature information
            logger.info ("Feature induction iteration "+featureInductionIteration);

            // Train the model a little bit.  We don't care whether it converges; we
            // execute all feature induction iterations no matter what.
            if (featureInductionIteration != 0) {
                // Don't train until we have added some features
                setNumIterations(numIterationsBetweenFeatureInductions);
                train (trainingData);
            }
            trainingIteration += numIterationsBetweenFeatureInductions;

            logger.info ("Starting feature induction with "+(1+inputAlphabet.size())+
                         " features over "+numLabels+" labels.");

            // Create the list of error tokens
            InstanceList errorInstances = new InstanceList (trainingData.getDataAlphabet(),
                                                            trainingData.getTargetAlphabet());

            // This errorInstances.featureSelection will get examined by FeatureInducer,
            // so it can know how to add "new" singleton features
            errorInstances.setFeatureSelection (globalFS);
            List errorLabelVectors = new ArrayList(); // these are length-1 vectors

            for (int i = 0; i < trainingData.size(); i++) {
                Instance instance = trainingData.get(i);
                FeatureVector inputVector = (FeatureVector) instance.getData();
                Label trueLabel = (Label) instance.getTarget();

                // Having trained using just the current features, see how we classify
                // the training data now.
                Classification classification = maxent.classify(instance);

                if (!classification.bestLabelIsCorrect()) {
                    errorInstances.add(inputVector, trueLabel, null, null);
                    errorLabelVectors.add(classification.getLabelVector());
                }
            }
            logger.info ("Error instance list size = "+errorInstances.size());

            int s = errorLabelVectors.size();
            LabelVector[] lvs = new LabelVector[s];
            for (int i = 0; i < s; i++) {
                lvs[i] = (LabelVector)errorLabelVectors.get(i);
            }

            RankedFeatureVector.Factory gainFactory = null;
            if (gainName.equals (EXP_GAIN))
                gainFactory = new ExpGain.Factory (lvs, gaussianPriorVariance);
            else if (gainName.equals(GRADIENT_GAIN))
                gainFactory = new GradientGain.Factory (lvs);
            else if (gainName.equals(INFORMATION_GAIN))
                gainFactory = new InfoGain.Factory ();
            else
                throw new IllegalArgumentException("Unsupported gain name: "+gainName);

            FeatureInducer klfi = new FeatureInducer (gainFactory,
                                                      errorInstances,
                                                      numFeaturesPerFeatureInduction,
                                                      2*numFeaturesPerFeatureInduction,
                                                      2*numFeaturesPerFeatureInduction);

            // Note that this adds features globally, but not on a per-transition basis
            klfi.induceFeaturesFor (trainingData, false, false);
            if (testingData != null)
                klfi.induceFeaturesFor (testingData, false, false);
            logger.info ("MaxEnt FeatureSelection now includes "+globalFS.cardinality()+" features");
            klfi = null;

            double[] newParameters = new double[(1+inputAlphabet.size()) * outputAlphabet.size()];

            // XXX (Executing this block often causes an error during training; I don't know why.)
            if (saveParametersDuringFI) {
                // Keep current parameter values
                // XXX This relies on the implementation detail that the most recent features
                // added to an Alphabet get the highest indices.
                // Count parameters per output label
                int oldParamCount = maxent.parameters.length / outputAlphabet.size();
                int newParamCount = 1+inputAlphabet.size();
                // Copy params into the proper locations
                for (int i=0; i<outputAlphabet.size(); i++) {
                    System.arraycopy(maxent.parameters, i*oldParamCount,
                                     newParameters, i*newParamCount,
                                     oldParamCount);
                }
                for (int i=0; i<oldParamCount; i++)
                    if (maxent.parameters[i] != newParameters[i]) {
                        System.out.println(maxent.parameters[i]+" "+newParameters[i]);
                        System.exit(0);
                    }
            }

            maxent.parameters = newParameters;
            maxent.defaultFeatureIndex = inputAlphabet.size();
        }

        // Finished feature induction
        logger.info("Ended with "+globalFS.cardinality()+" features.");
        setNumIterations(totalIterations - trainingIteration);
        train (trainingData);
        return maxent;
    }
    */

    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("MaxEntTrainer");
        if (numIterations < Integer.MAX_VALUE) {
            builder.append(",numIterations=" + numIterations);
        }
        if (l1Weight != 0.0) {
            builder.append(",l1Weight=" + l1Weight);
        }
        else {
            builder.append(",gaussianPriorVariance=" + gaussianPriorVariance);
        }
        return builder.toString();
    }
}