package cmu.arktweetnlp.impl;

import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.optimization.DiffFunction;
import gnu.trove.set.hash.THashSet;

import java.util.*;

/**
 * Class implementing the Orthant-Wise Limited-memory Quasi-Newton
 * algorithm (OWL-QN). OWL-QN is a numerical optimization procedure for
 * finding the optimum of an objective of the form smooth function plus
 * L1-norm of the parameters. It has been used for training log-linear
 * models (such as logistic regression) with L1-regularization. The
 * algorithm is described in "Scalable training of L1-regularized
 * log-linear models" by Galen Andrew and Jianfeng Gao. This
 * implementation includes built-in capacity to train logistic regression
 * or least-squares models with L1 regularization. It is also possible to
 * use OWL-QN to optimize any arbitrary smooth convex loss plus L1
 * regularization by defining the function and its gradient using the
 * supplied "DifferentiableFunction" class, and passing an instance of
 * the function to the OWLQN object. For more information, please read
 * the included file README.txt. Also included in the distribution are
 * the ICML paper and slide presentation.
 *
 * Significant portions of this code are taken from
 * <a href="http://research.microsoft.com/en-us/downloads/b1eb1016-1738-4bd5-83a9-370c9d498a03/default.aspx">Galen Andrew's implementation</a>
 *
 * @author Michel Galley
 *
 * (NOTE FROM BRENDAN 2012-08-14: What license did Stanford release this as? GPL ??)
 *
 * modified by Michael Heilman (mheilman@cmu.edu)
 * - allow for bias/intercept parameters that shouldn't be penalized
 * - make outside API calls easier (11/9/2010)
 * - removed lots of extraneous stuff from the stanford API
 *
 * modified by Nathan Schneider (nschneid@cs.cmu.edu)
 * - support for constrained optimization with the projected gradient method
 *   (see {@link #setConstrained(boolean)})
 */
public class OWLQN {

  /** Cap on the number of outer optimization iterations. */
  private int maxIters = Integer.MAX_VALUE;

  /** Whether to use the "projected gradient" method to require weights to be
   * nonnegative and sum to 1. The projection operation is implemented in
   * {@link #project(double[])}. */
  private static boolean constrained = false;

  /** Pluggable convergence test consulted once per iteration. */
  interface TerminationCriterion {
    double getValue(OptimizerState state, StringBuilder out);
  }

  /**
   * Termination criterion based on the average relative improvement of the
   * objective over a sliding window of recent iterations.
   */
  static class RelativeMeanImprovementCriterion implements TerminationCriterion {
    int numItersToAvg;       // sliding-window size
    Queue<Double> prevVals;  // objective values from the most recent iterations

    RelativeMeanImprovementCriterion() {
      this(10);
    }

    RelativeMeanImprovementCriterion(int numItersToAvg) {
      this.numItersToAvg = numItersToAvg;
      this.prevVals = new LinkedList<Double>();
    }

    /**
     * Returns the average relative improvement over the window, or
     * +infinity while fewer than numItersToAvg values have accumulated.
     * A human-readable progress note is appended to {@code out}.
     */
    public double getValue(OptimizerState state, StringBuilder out) {
      double retVal = Double.POSITIVE_INFINITY;
      if (prevVals.size() >= numItersToAvg) {
        double prevVal = prevVals.peek();
        // keep the queue bounded at numItersToAvg entries
        if (prevVals.size() == numItersToAvg) prevVals.poll();
        double averageImprovement = (prevVal - state.getValue()) / prevVals.size();
        double relAvgImpr = averageImprovement / Math.abs(state.getValue());
        String relAvgImprStr = String.format("%.4e",relAvgImpr);
        out.append(" (").append(relAvgImprStr).append(") ");
        retVal = relAvgImpr;
      } else {
        out.append(" (wait for "+numItersToAvg+" iters) ");
      }
      prevVals.offer(state.getValue());
      return retVal;
    }
  } // end static class RelativeMeanImprovementCriterion

  boolean quiet;                   // suppress progress output when true
  boolean responsibleForTermCrit;  // true iff this instance constructed its own termCrit
  // Indices of bias/intercept parameters that must NOT receive the L1 penalty.
  public static Set<Integer> biasParameters = new THashSet<Integer>();
  TerminationCriterion termCrit;

  /** Optional callback for printing the weights after each iteration. */
  public interface WeightsPrinter {
    public void printWeights();
  }

  WeightsPrinter printer;
/** Creates an optimizer with the default termination criterion
 * (mean relative improvement over the last 10 iterations). */
public OWLQN(boolean quiet) {
    this.quiet = quiet;
    this.termCrit = new RelativeMeanImprovementCriterion();
    this.responsibleForTermCrit = true;
  }

  /** Creates a non-quiet optimizer with the default termination criterion. */
  public OWLQN() {
    this(false);
  }

  /** Creates an optimizer that uses the caller-supplied termination criterion. */
  public OWLQN(TerminationCriterion termCrit, boolean quiet) {
    this.quiet = quiet;
    this.termCrit = termCrit;
    this.responsibleForTermCrit = false;
  }

  /* (non-Javadoc)
   * @see edu.cmu.cs.ark.tdf.IOptimizer#setQuiet(boolean)
   */
  public void setQuiet(boolean q) {
    quiet = q;
  }

  /*
  public void minimize(DiffFunction function, double[] initial) {
    minimize(function, initial, 1.0);
  }

  public void minimize(DiffFunction function, double[] initial, double l1weight) {
    minimize(function, initial, l1weight, 1e-5);
  }

  public void minimize(DiffFunction function, double[] initial, double l1weight, double tol) {
    minimize(function, initial, l1weight, tol, 10);
  }
  */

  /* (non-Javadoc)
   * @see edu.cmu.cs.ark.tdf.IOptimizer#minimize(edu.stanford.nlp.optimization.DiffFunction, double[], double, double, int)
   */
  /**
   * Runs OWL-QN: repeatedly choose a quasi-Newton direction restricted to the
   * current orthant, line-search along it, and update the L-BFGS memory,
   * until the termination criterion drops below {@code tol} (or the iteration
   * cap is hit, or the parameters stop changing).
   *
   * @param function smooth (differentiable) part of the objective
   * @param initial  starting parameter vector (used in place, not copied)
   * @param l1weight weight of the L1 penalty; 0 disables regularization
   * @param tol      convergence tolerance for the termination criterion
   * @param m        L-BFGS memory size (number of (s, y) pairs retained)
   * @return the optimized parameter vector
   */
  public double[] minimize(DiffFunction function, double[] initial, double l1weight, double tol, int m) {
    OptimizerState state = new OptimizerState(function, initial, m, l1weight, quiet);

    if (!quiet) {
      System.err.printf("Optimizing function of %d variables with OWL-QN parameters:\n", state.dim);
      System.err.printf(" l1 regularization weight: %f.\n", l1weight);
      System.err.printf(" L-BFGS memory parameter (m): %d\n", m);
      System.err.printf(" Convergence tolerance: %f\n\n", tol);
      System.err.printf("Iter n:\tnew_value\tdf\t(conv_crit)\tline_search\n");
      System.err.printf("Iter 0:\t%.4e\t\t(***********)\t", state.value);
    }

    StringBuilder buf = new StringBuilder();
    termCrit.getValue(state, buf); // seed the criterion with the initial objective value

    for(int i=0; i<maxIters; i++){
      buf.setLength(0);
      state.updateDir();
      state.backTrackingLineSearch();
      double termCritVal = termCrit.getValue(state, buf);
      if (!quiet) {
        int numnonzero = ArrayMath.countNonZero(state.newX);
        System.err.printf("Iter %4d:\t%.4e\t%d", state.iter, state.value, numnonzero);
        System.err.print("\t"+ buf.toString());
        if (printer!=null) printer.printWeights();
      }
      //mheilman: I added this check because OWLQN was failing without it sometimes
      //for large L1 penalties and few features...
      //This checks that the parameters changed in the last iteration.
      //If they didn't, then OWL-QN will try to divide by zero when approximating the Hessian.
      //The ro values end up 0 when the line search ends up trying a newX that equals X (or newGrad and grad).
      //That causes the differences stored in sList and yList to be zero, which eventually causes
      //values in roList to be zero. Below, ro values appear in the denominator, and they would
      //cause the program to crash in the mapDirByInverseHessian() method if any were zero.
      //This only appears to happen once the parameters have already converged.
      //I suspect that numerical loss of precision is the cause.
      if(arrayEquals(state.x, state.newX)){
        System.err.println("Warning: Stopping OWL-QN since there was no change in the parameters in the last iteration. This probably means convergence has been reached.");
        break;
      }
      if (termCritVal < tol) break;
      state.shift();
    }

    if (!quiet) {
      System.err.println();
      System.err.printf("Finished with optimization. %d/%d non-zero weights.\n", ArrayMath.countNonZero(state.newX), state.newX.length);
    }
    return state.newX;
  }

  /** Returns true iff the two arrays have equal length and exactly (bitwise ==)
   * equal entries. Used above to detect a stalled line search. */
  private boolean arrayEquals(double [] x, double [] y){
    if(x.length != y.length) return false;
    for(int i=0; i<x.length; i++){
      if(x[i] != y[i]) return false;
    }
    return true;
  }

  /* (non-Javadoc)
   * @see edu.cmu.cs.ark.tdf.IOptimizer#setMaxIters(int)
   */
  public void setMaxIters(int maxIters) {
    this.maxIters = maxIters;
  }

  /* (non-Javadoc)
   * @see edu.cmu.cs.ark.tdf.IOptimizer#getMaxIters()
   */
  public int getMaxIters() {
    return maxIters;
  }

  /** Specify a callback for printing the weights after each iteration of optimization.
*/ public void setWeightsPrinting(WeightsPrinter printer) { this.printer = printer; } private static int numUnconstrainedWeights = -1; /** Sets {@link #constrained} */ public static void setConstrained(boolean constrained) { OWLQN.constrained = constrained; OWLQN.numUnconstrainedWeights = (constrained) ? 0 : -1; } /** Specify that each parameter vector will start with numUnconstrainedWeights; only the * remaining weights are subject to the constraints. A negative value indicates that * no constraints should be applied. */ public static void setConstrained(int numUnconstrainedWeights) { OWLQN.numUnconstrainedWeights = numUnconstrainedWeights; OWLQN.constrained = (numUnconstrainedWeights<0) ? false : true; } public static boolean isConstrained() { return constrained; } /** Projects the full weight vector into the feasible region where constrains are operative only on a subset of weights. * The first numUnconstrainedWeights weights are preserved. */ protected static double[] projectWeights(double[] c) { if (numUnconstrainedWeights==0) return project(c); double[] constrained = new double[c.length-numUnconstrainedWeights]; for (int j=numUnconstrainedWeights; j<c.length; j++) { constrained[j-numUnconstrainedWeights] = c[j]; } double[] projected = project(constrained); double[] x = new double[c.length]; for (int j=0; j<c.length; j++) { x[j] = (j<numUnconstrainedWeights) ? c[j] : projected[j-numUnconstrainedWeights]; } return x; } /** * Implementation of the (Michelot 1986) technique for projecting * a real vector into the feasible region K defined by the following constraints: * - components must sum to 1, and * - they must be nonnegative. * * PROJECTION ALGORITHM * <pre> Given: a real vector c to be projected into a linearly-constrained subspace I ← ∅ // index set x ← c do Compute x' = P_V_I(x): x'_i = 0 if i ∈ I, x_i - (1/(dim(c)-dim(I))) * (Σ_j { x_j } - 1) otherwise if x' ≥ 0: return x' I ← I ∪ {i | x'_i < 0} x ← P_X_I(x'): i.e. 
x' but with any negative components replaced by 0 repeat Output: projection of c into constrained subspace K, i.e. P_K(c) </pre> */ public static double[] project(double[] c) { Set<Integer> indexSet = new THashSet<Integer>(); double[] x = c; while (true) { double sum = 0.0; for (double xj : x) { sum += xj; } final double adjustmentTerm = (sum-1.0)/(c.length - indexSet.size()); boolean nonnegative = true; for (int i=0; i<x.length; i++) { x[i] = (indexSet.contains(i)) ? 0.0 : x[i] - adjustmentTerm; if (x[i]<0.0) { nonnegative = false; indexSet.add(i); x[i] = 0.0; } } if (nonnegative) return x; } } } // end static class OWLQN class OptimizerState { double[] x, grad, newX, newGrad, dir; double[] steepestDescDir; LinkedList<double[]> sList = new LinkedList<double[]>(); LinkedList<double[]> yList = new LinkedList<double[]>(); LinkedList<Double> roList = new LinkedList<Double>(); double[] alphas; double value; int iter, m; int dim; DiffFunction func; double l1weight; boolean quiet; //mheilman: I added this for debugging private String arrayToString(double [] arr){ String res = ""; for(int i=0; i<arr.length; i++){ if(i>0) res += "\t"; res += arr[i]; } return res; } //mheilman: I added this for debugging the updateDir() method. private void printStateValues() { System.err.println("\nSLIST:"); for(int i=0; i<sList.size(); i++){ System.err.println(arrayToString(sList.get(i))); } System.err.println("YLIST:"); for(int i=0; i<yList.size(); i++){ System.err.println(arrayToString(yList.get(i))); } System.err.println("ROLIST:"); for(int i=0; i<roList.size(); i++){ System.err.println(roList.get(i)); } System.err.println(); } void mapDirByInverseHessian() { int count = sList.size(); if (count != 0) { for (int i = count - 1; i >= 0; i--) { //mheilman: The program will try to divide by zero here unless there is a check //that the parameters change at each iteration. See comments in the minimize() method. 
//A roList value is the inner product of the change in the gradient //and the change in parameters between the current and last iterations. //See the discussion of L-BFGS in Nocedal and Wright's Numerical Optimization book //(though I think that defines rho as the multiplicative inverse of what is here). alphas[i] = -ArrayMath.innerProduct(sList.get(i), dir) / roList.get(i); ArrayMath.addMultInPlace(dir, yList.get(i), alphas[i]); } double[] lastY = yList.get(count - 1); double yDotY = ArrayMath.innerProduct(lastY, lastY); double scalar = roList.get(count - 1) / yDotY; ArrayMath.multiplyInPlace(dir, scalar); for (int i = 0; i < count; i++) { double beta = ArrayMath.innerProduct(yList.get(i), dir) / roList.get(i); ArrayMath.addMultInPlace(dir, sList.get(i), -alphas[i] - beta); } } } void makeSteepestDescDir() { if (l1weight == 0) { ArrayMath.multiplyInto(dir, grad, -1); } else { for (int i=0; i<dim; i++) { //mheilman: I added this if-statement to avoid penalizing bias parameters. if(OWLQN.biasParameters.contains(i)){ dir[i] = -grad[i]; continue; } if (x[i] < 0) { dir[i] = -grad[i] + l1weight; } else if (x[i] > 0) { dir[i] = -grad[i] - l1weight; } else { if (grad[i] < -l1weight) { dir[i] = -grad[i] - l1weight; } else if (grad[i] > l1weight) { dir[i] = -grad[i] + l1weight; } else { dir[i] = 0; } } } } steepestDescDir = dir.clone(); // deep copy needed } void fixDirSigns() { if (l1weight > 0) { for (int i = 0; i<dim; i++) { if(OWLQN.biasParameters.contains(i)){ continue; } if (dir[i] * steepestDescDir[i] <= 0) { dir[i] = 0; } } } } void updateDir() { //printStateValues(); //mheilman: I added this for debugging. makeSteepestDescDir(); mapDirByInverseHessian(); fixDirSigns(); //mheilman: I commented out this debugging check. 
//if(!quiet) testDirDeriv(); } /* void testDirDeriv() { double dirNorm = Math.sqrt(ArrayMath.innerProduct(dir, dir)); double eps = 1.05e-8 / dirNorm; getNextPoint(eps); //double val2 = evalL1(); //double numDeriv = (val2 - value) / eps; //double deriv = dirDeriv(); //if (!quiet) System.err.print(" Grad check: " + numDeriv + " vs. " + deriv + " "); } */ double dirDeriv() { if (l1weight == 0) { return ArrayMath.innerProduct(dir, grad); } else { double val = 0.0; for (int i = 0; i < dim; i++) { //mheilman: I added this if-statement to avoid penalizing bias parameters. if(OWLQN.biasParameters.contains(i)){ val += dir[i] * grad[i]; continue; } if (dir[i] != 0) { if (x[i] < 0) { val += dir[i] * (grad[i] - l1weight); } else if (x[i] > 0) { val += dir[i] * (grad[i] + l1weight); } else if (dir[i] < 0) { val += dir[i] * (grad[i] - l1weight); } else if (dir[i] > 0) { val += dir[i] * (grad[i] + l1weight); } } } return val; } } private boolean getNextPoint(double alpha) { ArrayMath.addMultInto(newX, x, dir, alpha); /*if (OWLQN.isConstrained()) newX = OWLQN.projectWeights(newX);*/ //TODO if (l1weight > 0) { for (int i=0; i<dim; i++) { //mheilman: I added this if-statement to avoid penalizing bias parameters. if(OWLQN.biasParameters.contains(i)){ continue; } if (x[i] * newX[i] < 0.0) { newX[i] = 0.0; } } } return true; } double evalL1() { double val = func.valueAt(newX); // Don't remove clone(), otherwise newGrad and grad may end up referencing the same vector // (that's the case with LogisticObjectiveFunction) newGrad = func.derivativeAt(newX).clone(); if (l1weight > 0) { for (int i=0; i<dim; i++) { //mheilman: I added this if-statement to avoid penalizing bias parameters. 
if(OWLQN.biasParameters.contains(i)){ continue; } val += Math.abs(newX[i]) * l1weight; } } return val; } void backTrackingLineSearch() { double origDirDeriv = dirDeriv(); // if a non-descent direction is chosen, the line search will break anyway, so throw here // The most likely reason for this is a bug in your function's gradient computation if (origDirDeriv >= 0) { throw new RuntimeException("L-BFGS chose a non-descent direction: check your gradient!"); } double alpha = 1.0; double backoff = 0.5; if (iter == 1) { double normDir = Math.sqrt(ArrayMath.innerProduct(dir, dir)); alpha = (1 / normDir); backoff = 0.1; } double c1 = 1e-4; double oldValue = value; while (true) { getNextPoint(alpha); value = evalL1(); //mheilman: I think loss of precision can happen here for pathological cases because //origDirDeriv can be many orders of magnitude less than value and oldValue. //Then, the right side will just end up being oldValue. if (value <= oldValue + c1 * origDirDeriv * alpha){ break; } //mheilman: I added this extra check to keep the program //from backing off for a long time until numerical underflow happens. //If the line search hasn't found something by 1e-30, it's probably not going to, //and I think not having this check may cause the program to crash //for some rare, pathological cases. if(alpha < 1e-30){ System.err.println("Warning: The line search backed off to alpha < 1e-30, and stayed with the current parameter values. 
This probably means converged has been reached."); value = oldValue; break; } if (!quiet) System.err.print("."); alpha *= backoff; } if (!quiet) System.err.println(); } void shift() { double[] nextS = null, nextY = null; int listSize = sList.size(); if (listSize < m) { try { nextS = new double[dim]; nextY = new double[dim]; } catch (OutOfMemoryError e) { m = listSize; nextS = null; } } if (nextS == null) { nextS = sList.poll(); nextY = yList.poll(); roList.poll(); } ArrayMath.addMultInto(nextS, newX, x, -1); ArrayMath.addMultInto(nextY, newGrad, grad, -1); double ro = ArrayMath.innerProduct(nextS, nextY); assert(ro != 0.0); sList.offer(nextS); yList.offer(nextY); roList.offer(ro); double[] tmpX = newX; newX = x; x = tmpX; // TODO: added: nschneid /*if (OWLQN.isConstrained()) { newX = OWLQN.projectWeights(newX); x = OWLQN.projectWeights(x); }*/ double[] tmpGrad = newGrad; newGrad = grad; grad = tmpGrad; ++iter; } double getValue() { return value; } OptimizerState(DiffFunction f, double[] init, int m, double l1weight, boolean quiet) { this.x = init; this.grad = new double[init.length]; this.newX = init.clone(); this.newGrad = new double[init.length]; this.dir = new double[init.length]; this.steepestDescDir = newGrad.clone(); this.alphas = new double[m]; this.iter = 1; this.m = m; this.dim = init.length; this.func = f; this.l1weight = l1weight; this.quiet = quiet; if (m <= 0) throw new RuntimeException("m must be an integer greater than zero."); value = evalL1(); grad = newGrad.clone(); } }