OrthantWiseLimitedMemoryBFGS.java example

Explorer
topic-modeling-master
package cc.mallet.optimize;

import java.util.LinkedList;
import java.util.logging.Logger;

import cc.mallet.types.MatrixOps;
import cc.mallet.util.MalletLogger;

/**
 * Implementation of orthant-wise limited memory quasi Newton method for
 * optimizing convex L1-regularized objectives. See:
 * "Scalable training of l1-regularized log-linear models" by Galen Andrew and
 * Jianfeng Gao in ICML 2007 for details. This code is an adaptation of the
 * freely-available C++ code on Galen's webpage.
 * 
 * @author Kedar Bellare
 */
public class OrthantWiseLimitedMemoryBFGS implements Optimizer {
	private static Logger logger = MalletLogger
			.getLogger(OrthantWiseLimitedMemoryBFGS.class.getName());

	boolean converged = false;
	Optimizable.ByGradientValue optimizable;
	// name of optimizable for value output
	String optName;
	final int maxIterations = 1000;
	final double tolerance = .0001;
	final double gradientTolerance = .001;
	final double eps = 1.0e-5;
	double l1Weight;

	// The number of corrections used in BFGS update
	// ideally 3 <= m <= 7. Larger m means more cpu time, memory.
	final int m = 4;

	// State of optimizer search
	// oldValue = value before line search, value = value after line search
	double oldValue, value, yDotY;
	// grad = gradient
	double[] grad, oldGrad, direction, steepestDescentDirection, parameters,
			oldParameters;
	// s = list of m previous "difference in parameters" values
	// y = list of m previous "difference in grad" values
	LinkedList<double[]> s, y;
	// rho = intermediate calculation
	LinkedList<Double> rhos;
	double[] alphas;
	int iterations;

	public OrthantWiseLimitedMemoryBFGS(Optimizable.ByGradientValue function) {
		this(function, 0.0);
	}

	public OrthantWiseLimitedMemoryBFGS(Optimizable.ByGradientValue function,
			double l1wt) {
		this.optimizable = function;
		this.l1Weight = l1wt;
		String parts[] = optimizable.getClass().getName().split("\\.");
		this.optName = parts[parts.length - 1];

		// initialize optimizer state
		iterations = 0;
		s = new LinkedList<double[]>();
		y = new LinkedList<double[]>();
		rhos = new LinkedList<Double>();
		alphas = new double[m];
		MatrixOps.setAll(alphas, 0.0);
		yDotY = 0;

		int numParameters = optimizable.getNumParameters();

		// get initial parameters
		parameters = new double[numParameters];
		optimizable.getParameters(parameters);

		// get initial value
		value = evalL1();

		// get initial gradient
		grad = new double[numParameters];
		evalGradient();

		// initialize direction
		direction = new double[numParameters];
		steepestDescentDirection = new double[numParameters];

		// initialize backups
		oldParameters = new double[numParameters];
		oldGrad = new double[numParameters];
	}

	public Optimizable getOptimizable() {
		return optimizable;
	}

	public boolean isConverged() {
		return converged;
	}

	public int getIteration() {
		return iterations;
	}

	public boolean optimize() {
		return optimize(Integer.MAX_VALUE);
	}

	public boolean optimize(int numIterations) {
		logger.fine("Entering OWL-BFGS.optimize(). L1 weight=" + l1Weight
				+ " Initial Value=" + value);

		for (int iter = 0; iter < numIterations; iter++) {
			// create descent direction
			makeSteepestDescDir();

			// adjust for curvature
			mapDirByInverseHessian(yDotY);

			// fix direction signs
			fixDirSigns();

			// backup parameters and gradient; then perform line-search
			storeSrcInDest(parameters, oldParameters);
			storeSrcInDest(grad, oldGrad);
			backTrackingLineSearch();

			// update gradient after line search
			evalGradient();

			// check for termination conditions
			if (checkValueTerminationCondition()) {
				logger.info("Exiting OWL-BFGS on termination #1:");
				logger.info("value difference below tolerance (oldValue: "
						+ oldValue + " newValue: " + value);
				converged = true;
				return true;
			}

			if (checkGradientTerminationCondition()) {
				logger.info("Exiting OWL-BFGS on termination #2:");
				logger.info("gradient=" + MatrixOps.twoNorm(grad) + " < "
						+ gradientTolerance);
				converged = true;
				return true;
			}

			// update hessian approximation
			yDotY = shift();

			iterations++;
			if (iterations > maxIterations) {
				logger.info("Too many iterations in OWL-BFGS. "
						+ "Continuing with current parameters.");
				converged = true;
				return true;
			}
		}

		return false;
	}

	/**
	 * Evaluate value. Make it a minimization problem.
	 */
	private double evalL1() {
		double val = -optimizable.getValue();
		double sumAbsWt = 0;
		if (l1Weight > 0) {
			for (double param : parameters) {
				if (Double.isInfinite(param))
					continue;
				sumAbsWt += Math.abs(param) * l1Weight;
			}
		}
		logger.info("getValue() (" + optName + ".getValue() = " + val
				+ " + |w|=" + sumAbsWt + ") = " + (val + sumAbsWt));

		return val + sumAbsWt;
	}

	/**
	 * Evaluate gradient, make it a descent direction.
	 */
	private void evalGradient() {
		optimizable.getValueGradient(grad);
		adjustGradForInfiniteParams(grad);
		MatrixOps.timesEquals(grad, -1.0);
	}

	/**
	 * Creates steepest ascent direction from gradient and L1-regularization.
	 */
	private void makeSteepestDescDir() {
		if (l1Weight == 0) {
			for (int i = 0; i < grad.length; i++) {
				direction[i] = -grad[i];
			}
		} else {
			for (int i = 0; i < grad.length; i++) {
				if (parameters[i] < 0) {
					direction[i] = -grad[i] + l1Weight;
				} else if (parameters[i] > 0) {
					direction[i] = -grad[i] - l1Weight;
				} else {
					if (grad[i] < -l1Weight) {
						direction[i] = -grad[i] - l1Weight;
					} else if (grad[i] > l1Weight) {
						direction[i] = -grad[i] + l1Weight;
					} else {
						direction[i] = 0;
					}
				}
			}
		}

		storeSrcInDest(direction, steepestDescentDirection);
	}

	private void adjustGradForInfiniteParams(double d[]) {
		for (int i = 0; i < parameters.length; i++) {
			if (Double.isInfinite(parameters[i]))
				d[i] = 0;
		}
	}

	/**
	 * Adjusts direction based on approximate hessian inverse.
	 * 
	 * @param yDotY
	 *            y^T * y in BFGS calculation.
	 */
	private void mapDirByInverseHessian(double yDotY) {
		if (s.size() == 0)
			return;

		int count = s.size();
		for (int i = count - 1; i >= 0; i--) {
			alphas[i] = -MatrixOps.dotProduct(s.get(i), direction)
					/ rhos.get(i);
			MatrixOps.plusEquals(direction, y.get(i), alphas[i]);
		}

		double scalar = rhos.get(count - 1) / yDotY;
		logger.fine("Direction multiplier = " + scalar);
		MatrixOps.timesEquals(direction, scalar);

		for (int i = 0; i < count; i++) {
			double beta = MatrixOps.dotProduct(y.get(i), direction)
					/ rhos.get(i);
			MatrixOps.plusEquals(direction, s.get(i), -alphas[i] - beta);
		}
	}

	private void fixDirSigns() {
		if (l1Weight > 0) {
			for (int i = 0; i < direction.length; i++) {
				if (direction[i] * steepestDescentDirection[i] <= 0) {
					direction[i] = 0;
				}
			}
		}
	}

	private double dirDeriv() {
		if (l1Weight == 0) {
			return MatrixOps.dotProduct(direction, grad);
		} else {
			double val = 0.0;
			for (int i = 0; i < direction.length; i++) {
				if (direction[i] != 0) {
					if (parameters[i] < 0) {
						val += direction[i] * (grad[i] - l1Weight);
					} else if (parameters[i] > 0) {
						val += direction[i] * (grad[i] + l1Weight);
					} else if (direction[i] < 0) {
						val += direction[i] * (grad[i] - l1Weight);
					} else if (direction[i] > 0) {
						val += direction[i] * (grad[i] + l1Weight);
					}
				}
			}

			return val;
		}
	}

	private double shift() {
		double[] nextS = null, nextY = null;

		int listSize = s.size();
		if (listSize < m) {
			nextS = new double[parameters.length];
			nextY = new double[parameters.length];
		} else {
			nextS = s.removeFirst();
			nextY = y.removeFirst();
			rhos.removeFirst();
		}

		double rho = 0.0;
		double yDotY = 0.0;
		for (int i = 0; i < parameters.length; i++) {
			if (Double.isInfinite(parameters[i])
					&& Double.isInfinite(oldParameters[i])
					&& parameters[i] * oldParameters[i] > 0)
				nextS[i] = 0;
			else
				nextS[i] = parameters[i] - oldParameters[i];

			if (Double.isInfinite(grad[i]) && Double.isInfinite(oldGrad[i])
					&& grad[i] * oldGrad[i] > 0)
				nextY[i] = 0;
			else
				nextY[i] = grad[i] - oldGrad[i];

			rho += nextS[i] * nextY[i];
			yDotY += nextY[i] * nextY[i];
		}

		logger.fine("rho=" + rho);
		if (rho < 0) {
			throw new InvalidOptimizableException("rho = " + rho + " < 0: "
					+ "Invalid hessian inverse. "
					+ "Gradient change should be opposite of parameter change.");
		}

		s.addLast(nextS);
		y.addLast(nextY);
		rhos.addLast(rho);

		// update old params and grad
		storeSrcInDest(parameters, oldParameters);
		storeSrcInDest(grad, oldGrad);

		return yDotY;
	}

	private void storeSrcInDest(double src[], double dest[]) {
		System.arraycopy(src, 0, dest, 0, src.length);
	}

	// backtrack line search
	private void backTrackingLineSearch() {
		double origDirDeriv = dirDeriv();
		if (origDirDeriv >= 0) {
			throw new InvalidOptimizableException(
					"L-BFGS chose a non-ascent direction: check your gradient!");
		}

		double alpha = 1.0;
		double backoff = 0.5;
		if (iterations == 0) {
			double normDir = Math.sqrt(MatrixOps.dotProduct(direction,
					direction));
			alpha = 1.0 / normDir;
			backoff = 0.1;
		}

		final double c1 = 1e-4;
		// store old value
		oldValue = value;

		logger.fine("*** Starting line search iter=" + iterations);
		logger.fine("iter[" + iterations + "] Value at start of line search = "
				+ value);

		while (true) {
			// update parameters and gradient
			getNextPoint(alpha);

			// find new value
			value = evalL1();

			logger.fine("iter[" + iterations + "] Using alpha = " + alpha
					+ " new value = " + value + " |grad|="
					+ MatrixOps.twoNorm(grad) + " |x|="
					+ MatrixOps.twoNorm(parameters));

			if (value <= oldValue + c1 * origDirDeriv * alpha)
				break;

			alpha *= backoff;
		}
	}

	private void getNextPoint(double alpha) {
		for (int i = 0; i < parameters.length; i++) {
			parameters[i] = oldParameters[i] + direction[i] * alpha;
			if (l1Weight > 0) {
				// do not allow to cross orthant boundaries if using
				// L1-regularization
				if (oldParameters[i] * parameters[i] < 0) {
					parameters[i] = 0.0;
				}
			}
		}

		optimizable.setParameters(parameters);
	}

	// termination conditions
	private boolean checkValueTerminationCondition() {
		return (2.0 * Math.abs(value - oldValue) <= tolerance
				* (Math.abs(value) + Math.abs(oldValue) + eps));
	}

	private boolean checkGradientTerminationCondition() {
		return MatrixOps.twoNorm(grad) < gradientTolerance;
	}
}