package cc.mallet.optimize;

import java.util.Arrays;
import java.util.logging.Logger;
import java.text.DecimalFormat;

import cc.mallet.optimize.Optimizer;
import cc.mallet.types.MatrixOps;
import cc.mallet.util.MalletLogger;

/**
 * @author Greg Druck
 * @author Kedar Bellare
 */
public class StochasticMetaAscent implements Optimizer.ByBatches {

  private static Logger logger = MalletLogger.getLogger(StochasticMetaAscent.class.getName());

  private final int MAX_ITER = 200;
  private final double LAMBDA = 1.0;
  private final double TOLERANCE = 0.01;
  private final double EPS = 1e-10;

  private double mu = 0.1;
  private int totalIterations = 0;
  private double eta_init = 0.03;
  private boolean useHessian = true;
  private double[] gain;
  private double[] gradientTrace;

  Optimizable.ByBatchGradient maxable = null;

  public StochasticMetaAscent (Optimizable.ByBatchGradient maxable) {
    this.maxable = maxable;
  }

  public void setInitialStep(double step) {
    eta_init = step;
  }

  public void setMu(double m) {
    mu = m;
  }

  public void setUseHessian(boolean flag) {
    useHessian = flag;
  }

  public boolean optimize(int numBatches, int[] batchAssignments) {
    return optimize(MAX_ITER, numBatches, batchAssignments);
  }

  public boolean optimize(int numIterations, int numBatches, int[] batchAssignments) {
    int numParameters = maxable.getNumParameters();
    double[] parameters = new double[numParameters];
    double[] gradient = new double[numParameters];
    double[] hessianProduct = new double[numParameters];

    // Only initialize these if they are null,
    // in case someone wants to optimize a
    // few iterations at a time.
    if (gain == null) {
      System.err.println("StochasticMetaAscent: initialStep=" + eta_init + " metaStep=" + mu);
      gain = new double[numParameters];
      Arrays.fill(gain, eta_init);
      gradientTrace = new double[numParameters];
    }

    maxable.getParameters(parameters);

    for (int iteration = 0; iteration < numIterations; iteration++) {
      double oldApproxValue = 0;
      double approxValue = 0;

      for (int batch = 0; batch < numBatches; batch++) {
        logger.info("Iteration " + (totalIterations + iteration) + ", batch "
            + batch + " of " + numBatches);

        // Get current parameters
        maxable.getParameters(parameters);

        // Update value and gradient for the current batch
        double initialValue = maxable.getBatchValue(batch, batchAssignments);
        oldApproxValue += initialValue;
        if (Double.isNaN(initialValue)) {
          throw new IllegalArgumentException("NaN in value computation. "
              + "Probably you need to reduce initialStep or metaStep.");
        }
        maxable.getBatchValueGradient(gradient, batch, batchAssignments);

        // The code below was originally written for stochastic meta
        // descent. We are maximizing, so we want ascent. Flip the
        // signs on the gradient to make it point downhill.
        MatrixOps.timesEquals(gradient, -1);

        if (useHessian) {
          computeHessianProduct(maxable, parameters, batch, batchAssignments,
              gradient, gradientTrace, hessianProduct);
        }

        reportOnVec("x", parameters);
        reportOnVec("step", gain);
        reportOnVec("grad", gradient);
        reportOnVec("trace", gradientTrace);

        // Update learning rates for individual parameters
        for (int index = 0; index < numParameters; index++) {
          // For the first iteration, this will just be the initial step,
          // since gradientTrace will be all zeros.
          gain[index] *= Math.max(0.5, 1 - mu * gradient[index] * gradientTrace[index]);

          // Adjust parameters based on direction
          parameters[index] -= gain[index] * gradient[index];

          if (useHessian) {
            // Adjust gradient trace using the Hessian-vector product
            gradientTrace[index] = LAMBDA * gradientTrace[index]
                - gain[index] * (gradient[index] + LAMBDA * hessianProduct[index]);
          }
          else {
            // Adjust gradient trace without Hessian information
            gradientTrace[index] = LAMBDA * gradientTrace[index]
                - gain[index] * (gradient[index] + LAMBDA * gradientTrace[index]);
          }
        }

        // Set new parameters
        maxable.setParameters(parameters);

        double finalValue = maxable.getBatchValue(batch, batchAssignments);
        approxValue += finalValue;
        logger.info("StochasticMetaAscent: initial value: " + initialValue
            + " final value: " + finalValue);
      }

      logger.info("StochasticMetaAscent: Value at iteration ("
          + (totalIterations + iteration) + ") = " + approxValue);

      // Convergence criteria from GradientAscent and LimitedMemoryBFGS:
      // stop when the relative change in value over this iteration
      // falls below TOLERANCE.
      if (2.0 * Math.abs(approxValue - oldApproxValue)
          <= TOLERANCE * (Math.abs(approxValue) + Math.abs(oldApproxValue) + EPS)) {
        logger.info("StochasticMetaAscent: Value difference "
            + Math.abs(approxValue - oldApproxValue)
            + " below tolerance; saying converged.");
        totalIterations += iteration;
        return true;
      }
    }

    totalIterations += numIterations;
    return false;
  }

  private void reportOnVec(String s, double[] v) {
    DecimalFormat f = new DecimalFormat("0.####");
    System.out.println("StochasticMetaAscent: " + s + ":"
        + " min " + f.format(MatrixOps.min(v))
        + " max " + f.format(MatrixOps.max(v))
        + " mean " + f.format(MatrixOps.mean(v))
        + " 2norm " + f.format(MatrixOps.twoNorm(v))
        + " abs-norm " + f.format(MatrixOps.absNorm(v)));
  }

  // Compute a finite-difference approximation of the Hessian-vector
  // product: H v ~= (grad(x + eps * v) - grad(x)) / eps.
  private void computeHessianProduct(Optimizable.ByBatchGradient maxable,
      double[] parameters, int batchIndex, int[] batchAssignments,
      double[] currentGradient, double[] vector, double[] result) {

    int numParameters = maxable.getNumParameters();
    double eps = 1.0e-6;
    double[] epsGradient = new double[numParameters];
    double[] oldParameters = new double[numParameters];

    // Adjust parameters by (eps * vector) and recompute the gradient
    System.arraycopy(parameters, 0, oldParameters, 0, numParameters);
    MatrixOps.plusEquals(parameters, vector, eps);
    maxable.setParameters(parameters);
    maxable.getBatchValueGradient(epsGradient, batchIndex, batchAssignments);

    // Restore old parameters
    maxable.setParameters(oldParameters);

    // Calculate the Hessian product. The model's gradient points uphill,
    // while currentGradient has already been negated by the caller, so
    // epsGradient is negated here to keep the two signs consistent.
    for (int index = 0; index < result.length; index++) {
      result[index] = (-epsGradient[index] - currentGradient[index]) / eps;
    }
  }
}
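
// A minimal usage sketch, not part of the original file: it assumes a
// hypothetical Optimizable.ByBatchGradient implementation (here called
// MyBatchObjective) and shows how instances might be assigned to batches
// round-robin before calling optimize. All names and settings below are
// illustrative only.
//
//   Optimizable.ByBatchGradient objective = new MyBatchObjective();
//   StochasticMetaAscent optimizer = new StochasticMetaAscent(objective);
//   optimizer.setInitialStep(0.01);  // initial per-parameter gain (eta_init)
//   optimizer.setMu(0.1);            // meta step size for gain adaptation
//
//   int numInstances = 1000;         // hypothetical training-set size
//   int numBatches = 10;
//   int[] batchAssignments = new int[numInstances];
//   for (int i = 0; i < numInstances; i++) {
//     batchAssignments[i] = i % numBatches;
//   }
//   boolean converged = optimizer.optimize(numBatches, batchAssignments);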