/*
 * File:                LevenbergMarquardtEstimation.java
 * Authors:             Kevin R. Dixon
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright Apr 13, 2008, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government.
 * Export of this program may require a license from the United States
 * Government. See CopyrightHistory.txt for complete details.
 *
 */

package gov.sandia.cognition.learning.algorithm.regression;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationReferences;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.learning.function.cost.SumSquaredErrorCostFunction;
import gov.sandia.cognition.math.matrix.Matrix;
import gov.sandia.cognition.math.matrix.Vector;

/**
 * Implementation of the nonlinear regression algorithm known as
 * Levenberg-Marquardt Estimation (or LMA). In its pure form, this algorithm
 * computes the least-squares solution to the parameters of a functional form
 * given a (weighted) set of input-output Vector pairs. While the algorithm
 * is specified in terms of parameter gradients, it has been proven that a
 * forward-difference Jacobian approximation does not hurt the convergence
 * properties of LMA. (Thus, you can use GradientDescendableApproximator with
 * impunity.) LMA requires the storage of the explicit Jacobian, which may
 * not be possible for large problems.
 * <BR>
 * When gradients are available, LMA appears competitive with BFGS in terms of
 * function evaluations, but BFGS is ~2 times faster than LMA.
 * <BR>
 * When gradients are not available, LMA slows down by a factor of ~3 and
 * requires "M" more function evaluations, where "M" is the number of
 * parameters in the function. However, LMA with an approximated parameter
 * Jacobian is ~4 times FASTER than Powell's minimization (and Powell's
 * method requires about ~8 times more function evaluations).
 * <BR>
 * Take-home message: use ParameterDifferentiableCostMinimizer with BFGS when
 * possible, or LMA with an approximated Jacobian when gradient information is
 * not available.
 * <BR>
 * Loosely based on Numerical Recipes in C, p.685-687.
 *
 * @author Kevin R. Dixon
 * @since 2.1
 */
Flannery" }, title="Numerical Recipes in C, Second Edition", type=PublicationType.Book, year=1992, pages={685,687}, notes="Section 15.5", url="http://www.nrbook.com/a/bookcpdf.php" ), @PublicationReference( author="Wikipedia", title="Levenberg-Marquardt algorithm", type=PublicationType.WebPage, year=2008, url="http://en.wikipedia.org/wiki/Levenberg-Marquardt_algorithm" ) } ) public class LevenbergMarquardtEstimation extends LeastSquaresEstimator { /** * Default initial value of the damping factor {@value} */ public static final double DEFAULT_DAMPING = 1.0; /** * Divisor of the damping factor on unsuccessful iteration, dividing * damping factor on cost-reducing iteration {@value} */ public static final double DEFAULT_DAMPING_DIVISOR = 10.0; /** * Default maximum number of iterations without improvement before stopping {@value} */ public static final int DEFAULT_MAX_ITERATIONS_WITHOUT_IMPROVEMENT = 4; /** * Number of sequential unsuccessful iterations without a cost-reducing step */ private int iterationsWithoutImprovement; /** * Maximum number of iterations without improvement before stopping */ private int maxIterationsWithoutImprovement; /** * Current damping factor for the ridge regression */ private double dampingFactor; /** * Divisor of the damping factor on a successful iteration, must be greater * then 1.0, typically ~10.0 */ private double dampingFactorDivisor; /** * Creates a new instance of LevenbergMarquardtEstimation */ public LevenbergMarquardtEstimation() { this( DEFAULT_DAMPING, DEFAULT_DAMPING_DIVISOR, DEFAULT_MAX_ITERATIONS, DEFAULT_MAX_ITERATIONS_WITHOUT_IMPROVEMENT, DEFAULT_TOLERANCE ); } /** * Creates a new instance of LevenbergMarquardtEstimation * @param dampingFactor * Current damping factor for the ridge regression * @param dampingFactorDivisor * Divisor of the damping factor on a successful iteration, must be greater * then 1.0, typically ~10.0 * @param maxIterations * Maximum iterations before stopping * @param maxIterationsWithoutImprovement * Number of sequential unsuccessful iterations without a cost-reducing step * @param tolerance * Stopping criterion for the algorithm, typically ~1e-5 */ public LevenbergMarquardtEstimation( double dampingFactor, double dampingFactorDivisor, int maxIterations, int maxIterationsWithoutImprovement, double tolerance ) { super( maxIterations, tolerance ); this.setDampingFactor(dampingFactor); this.setDampingFactorDivisor(dampingFactorDivisor); this.setMaxIterationsWithoutImprovement(maxIterationsWithoutImprovement); } /** * Parameters used to generate the lastCost */ private Vector bestParameters; /** * Cost associated with lastParameters */ private SumSquaredErrorCostFunction.Cache bestParametersCost; @Override protected boolean initializeAlgorithm() { this.setResult( this.getObjectToOptimize().clone() ); this.setIterationsWithoutImprovement(0); this.bestParameters = this.getResult().convertToVector(); this.bestParametersCost = SumSquaredErrorCostFunction.Cache.compute( this.getResult(), this.getData() ); this.setResultCost( this.bestParametersCost.parameterCost ); return true; } @Override protected boolean step() { // These negatives are to compensate because the derivative is the // negative of the error. 
    @Override
    protected boolean step()
    {
        // These negatives compensate for the fact that the derivative is the
        // negative of the error. Since we're computing the derivative, but
        // the descent direction is the negative of the gradient, we need to
        // scale the matrices by -1.0
        Matrix JtJpI = this.bestParametersCost.JtJ.scale( -1.0 );
        Vector Jte = this.bestParametersCost.Jte;
        int M = JtJpI.getNumRows();
        for( int i = 0; i < M; i++ )
        {
            // Again, the damping factor is subtracted to compensate for the
            // negative between gradient and direction
            double v = JtJpI.getElement(i, i);
            JtJpI.setElement(i, i, v - this.getDampingFactor() );
        }

        // This is the ridge-regression (Tikhonov regularization) step to solve
        // for the parameter change
        Vector trialParameters = JtJpI.solve(Jte);
        trialParameters.plusEquals(this.bestParameters);

        // If the trial parameters reduce the cost, then accept them
        this.getResult().convertFromVector( trialParameters );
        SumSquaredErrorCostFunction.Cache trialCost =
            SumSquaredErrorCostFunction.Cache.compute( this.getResult(), this.getData() );
        if( trialCost.parameterCost < this.bestParametersCost.parameterCost )
        {
            this.iterationsWithoutImprovement = 0;
            this.dampingFactor /= this.dampingFactorDivisor;

            // Test to see if we're done: the cost improvement is scaled by the
            // magnitude of the trial parameters before comparing to tolerance
            double delta = trialParameters.norm2() *
                (this.bestParametersCost.parameterCost - trialCost.parameterCost);
            this.bestParameters = trialParameters;
            this.bestParametersCost = trialCost;
            this.setResultCost( this.bestParametersCost.parameterCost );
            if( delta < this.getTolerance() )
            {
                return false;
            }
        }
        // The trial parameters aren't better than the best parameters,
        // so increase the regularization damping factor
        else
        {
            this.dampingFactor *= this.dampingFactorDivisor;
            this.iterationsWithoutImprovement++;
        }

        if( this.iterationsWithoutImprovement > this.getMaxIterationsWithoutImprovement() )
        {
            return false;
        }

        return true;
    }

    @Override
    protected void cleanupAlgorithm()
    {
        // Make sure we've uploaded the best parameters into the function.
        // We won't necessarily have the best parameters uploaded if we
        // didn't finish on the "delta" convergence condition
        this.getResult().convertFromVector( this.bestParameters );
    }

    /**
     * Getter for iterationsWithoutImprovement
     * @return
     * Number of sequential unsuccessful iterations without a cost-reducing step
     */
    public int getIterationsWithoutImprovement()
    {
        return this.iterationsWithoutImprovement;
    }

    /**
     * Setter for iterationsWithoutImprovement
     * @param iterationsWithoutImprovement
     * Number of sequential unsuccessful iterations without a cost-reducing step
     */
    public void setIterationsWithoutImprovement(
        int iterationsWithoutImprovement )
    {
        this.iterationsWithoutImprovement = iterationsWithoutImprovement;
    }

    /**
     * Getter for maxIterationsWithoutImprovement
     * @return
     * Maximum number of iterations without improvement before stopping
     */
    public int getMaxIterationsWithoutImprovement()
    {
        return this.maxIterationsWithoutImprovement;
    }

    /**
     * Setter for maxIterationsWithoutImprovement
     * @param maxIterationsWithoutImprovement
     * Maximum number of iterations without improvement before stopping
     */
    public void setMaxIterationsWithoutImprovement(
        int maxIterationsWithoutImprovement )
    {
        this.maxIterationsWithoutImprovement = maxIterationsWithoutImprovement;
    }

    /**
     * Getter for dampingFactor
     * @return
     * Current damping factor for the ridge regression
     */
    public double getDampingFactor()
    {
        return this.dampingFactor;
    }

    /**
     * Setter for dampingFactor
     * @param dampingFactor
     * Current damping factor for the ridge regression
     */
    public void setDampingFactor(
        double dampingFactor )
    {
        this.dampingFactor = dampingFactor;
    }
    /**
     * Getter for dampingFactorDivisor
     * @return
     * Divisor of the damping factor on a successful iteration, must be greater
     * than 1.0, typically ~10.0
     */
    public double getDampingFactorDivisor()
    {
        return this.dampingFactorDivisor;
    }

    /**
     * Setter for dampingFactorDivisor
     * @param dampingFactorDivisor
     * Divisor of the damping factor on a successful iteration, must be greater
     * than 1.0, typically ~10.0
     */
    public void setDampingFactorDivisor(
        double dampingFactorDivisor )
    {
        this.dampingFactorDivisor = dampingFactorDivisor;
    }

}
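// Usage sketch (not part of the original source): a rough illustration of how
// this estimator is typically driven. The names "myVectorFunction" and
// "dataset" are placeholders; "dataset" stands for a collection of
// input-output Vector pairs, and "myVectorFunction" for a user-supplied
// parameterized function. Wrapping it in GradientDescendableApproximator, as
// the class Javadoc suggests, is only needed when analytic parameter
// gradients are unavailable. Method names from the base classes are assumed
// here and should be checked against the actual Foundry API.
//
//     LevenbergMarquardtEstimation lma = new LevenbergMarquardtEstimation();
//     lma.setObjectToOptimize( new GradientDescendableApproximator( myVectorFunction ) );
//     GradientDescendable fitted = lma.learn( dataset );
//     Vector fittedParameters = fitted.convertToVector();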