package joshua.discriminative.training.lbfgs;

/**
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @version $LastChangedDate: 2008-10-20 00:12:30 -0400 $
 */
public abstract class LBFGSWrapper {
  //==== configurable variables
  private int numPara;
  private double[] weightsVector; //we can initialize this
  private double lastFunctionVal;
  private boolean isMinimizer = true;
  private int maxNumCall = 100; //run at most 100 iterations (i.e., function and gradient evaluations) for this particular run

  //==== stopping criterion
  private double relativeFuncThreshold = 1e-3; //terminate when the relative change of the function value is smaller than this value
  private int maxPassConverge = 3;

  //==== default values required by the LBFGS optimizer
  private boolean provideDiagonalMatrix = false;
  private double[] diag;
  private int numCorrections = 21; //number of past updates used to approximate the Hessian; our problem is small, so we can afford a large value
  private double epsilon = 1.0e-5; //determines the accuracy with which the solution is to be found: gnorm < eps*xnorm
  private double xtol = 1.0e-16; //machine precision
  private int[] iprint;
  private int[] iflag;

  private boolean useRProp = false;
  private RProp rProp = null;

  boolean useL2Regula = false;
  double varianceForL2 = 1;

  //regularize so that the current model does not deviate too much from the original model
  boolean useModelDivergenceRegula = false;
  double lambda = 1;
  double[] initWeights;

  //print debug information for the first N parameters
  int printFirstN = 0;

  /**
   * Input:
   *   curWeights: the current weight vector
   * Output:
   *   resFuncVal: a one-element array whose single element should be set to the function value
   * Return:
   *   the gradient vector at the current weight vector
   * Notes:
   *   LBFGS changes neither the returned gradient vector nor resFuncVal
   */
  public abstract double[] computeFuncValAndGradient(double[] curWeights, double[] resFuncVal);
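  /* A minimal concrete subclass, shown for illustration only (it is not part of the original
   * code): it minimizes the toy quadratic f(w) = sum_i (w_i - 1)^2, whose gradient components
   * are 2*(w_i - 1). The class name and the objective are assumptions made for this sketch. */
  public static class QuadraticToyObjective extends LBFGSWrapper {
    public QuadraticToyObjective(int numPara) {
      //minimize, with no L2 or model-divergence regularization and no debug printing
      super(numPara, null, true, false, 1.0, false, 1.0, 0);
    }

    @Override
    public double[] computeFuncValAndGradient(double[] curWeights, double[] resFuncVal) {
      double funcVal = 0;
      double[] gradient = new double[curWeights.length];
      for (int i = 0; i < curWeights.length; i++) {
        double diff = curWeights[i] - 1.0;
        funcVal += diff * diff;
        gradient[i] = 2.0 * diff;
      }
      resFuncVal[0] = funcVal; //report the function value through the one-element output array
      return gradient;
    }
  }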
  public LBFGSWrapper(int numPara, double[] initWeights, boolean isMinimizer,
      boolean useL2Regula, double varianceForL2,
      boolean useModelDivergenceRegula, double lambda, int printFirstN) {
    this.isMinimizer = isMinimizer;
    this.useL2Regula = useL2Regula;
    this.varianceForL2 = varianceForL2;

    //### set the weight vector
    this.numPara = numPara;
    this.weightsVector = new double[numPara];
    for (int i = 0; i < numPara; i++) {
      if (initWeights != null)
        weightsVector[i] = initWeights[i];
      else
        weightsVector[i] = 1.0 / numPara; //TODO
    }

    //for model divergence regularization
    this.useModelDivergenceRegula = useModelDivergenceRegula;
    this.lambda = lambda;
    if (useModelDivergenceRegula) {
      this.initWeights = copyInitWeights(initWeights);
    }

    this.diag = new double[numPara]; //lbfgs requires this array even when we do not set its values

    //### set the print options
    this.iprint = new int[2];
    this.iprint[0] = -1; //frequency of the output: print at each iteration
    this.iprint[1] = 0; //type of output generated

    //### set the status flag
    this.iflag = new int[1];
    this.iflag[0] = 0; //this makes sure LBFGS clears all of its state information
    //numCorrections = numPara < 7 ? numPara : 7;

    if (useRProp) {
      System.out.println("=========== using RProp =============");
      rProp = new RProp(initWeights, numPara, isMinimizer);
    }

    this.printFirstN = printFirstN;
  }

  /** Call LBFGS for multiple iterations to get the best weights. */
  public double[] runLBFGS() {
    int numCalls = 0;
    double bestFunctionVal = 0;
    lastFunctionVal = 0;
    double[] gradientVector = null;
    double[] resFuncVal = new double[1];
    int checkConverge = 0;

    while (numCalls == 0 || (isLBFGSConverged() == false && numCalls <= maxNumCall)) {
      gradientVector = computeFuncValAndGradient(getCurWeightVector(), resFuncVal);

      if (this.useModelDivergenceRegula) {
        this.doL2ForConditionalEntropy(this.initWeights, getCurWeightVector(), gradientVector,
            resFuncVal, this.lambda);
      }
      if (useL2Regula) { //adjust gradientVector and resFuncVal
        doL2(gradientVector, resFuncVal);
      }

      double oldFunctionVal = lastFunctionVal;
      lastFunctionVal = resFuncVal[0];

      //check convergence
      if (numCalls != 0
          && Math.abs((lastFunctionVal - oldFunctionVal) / oldFunctionVal) < relativeFuncThreshold) {
        System.out.println("oldFunctionVal=" + oldFunctionVal + "; new=" + lastFunctionVal
            + "; checkConverge=" + checkConverge);
        checkConverge++;
        if (checkConverge >= maxPassConverge) { //no significant change for maxPassConverge consecutive iterations
          System.out.println("LBFGS stops early because the function value does not change; break at iter " + numCalls);
          break;
        }
      } else {
        checkConverge = 0;
      }

      if (numCalls == 0) {
        bestFunctionVal = lastFunctionVal;
      } else {
        if (isMinimizer)
          bestFunctionVal = Math.min(bestFunctionVal, lastFunctionVal);
        else
          bestFunctionVal = Math.max(bestFunctionVal, lastFunctionVal);
      }

      boolean success = false;
      if (this.useRProp)
        success = runOneIterRPropTraining(lastFunctionVal, gradientVector); //changes weightsVector in place
      else
        success = runOneIterLBFGSTraining(lastFunctionVal, gradientVector); //changes weightsVector in place

      //TODO: should we maintain the best function value and weight vector, since the lbfgs line
      //search might fail? (even when it fails, the function value seems to be the best among all iterations)
      if (!success) {
        System.out.println("Line search failed after " + numCalls + " calls");
        break;
      }

      numCalls++;
      printStatistics(numCalls, lastFunctionVal, gradientVector, weightsVector);
    }
    printStatistics(numCalls, lastFunctionVal, gradientVector, weightsVector);

    if (isMinimizer == true && lastFunctionVal > bestFunctionVal) {
      System.out.println("LBFGS returns a bad optimal value; best: " + bestFunctionVal + "; last: " + lastFunctionVal);
    }
    if (isMinimizer == false && lastFunctionVal < bestFunctionVal) {
      System.out.println("LBFGS returns a bad optimal value; best: " + bestFunctionVal + "; last: " + lastFunctionVal);
    }
    return weightsVector;
  }
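  /* A hedged helper sketch, not present in the original code: a central-difference gradient
   * check is a standard way to validate a computeFuncValAndGradient implementation before
   * handing it to L-BFGS. The method name, step size, and tolerance are assumptions of this
   * sketch. */
  public static boolean checkGradient(LBFGSWrapper problem, double[] weights, double tolerance) {
    double[] funcVal = new double[1];
    double[] analytic = problem.computeFuncValAndGradient(weights, funcVal);
    double h = 1e-6; //finite-difference step size (assumed)
    for (int i = 0; i < weights.length; i++) {
      double saved = weights[i];
      double[] fPlus = new double[1];
      double[] fMinus = new double[1];
      weights[i] = saved + h;
      problem.computeFuncValAndGradient(weights, fPlus);
      weights[i] = saved - h;
      problem.computeFuncValAndGradient(weights, fMinus);
      weights[i] = saved; //restore the weight before moving on
      double numeric = (fPlus[0] - fMinus[0]) / (2 * h); //central difference approximation
      if (Math.abs(numeric - analytic[i]) > tolerance) {
        System.out.println("gradient mismatch at dimension " + i + ": analytic=" + analytic[i] + "; numeric=" + numeric);
        return false;
      }
    }
    return true;
  }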
  public double getCurFuncVal() {
    return lastFunctionVal;
  }

  public void printStatistics(int iterNum, double funcVal, double[] gradientVector, double[] weightsVector) {
    System.out.println("=======Func value: " + funcVal + " at iteration number " + iterNum);
    if (printFirstN <= 0)
      return;
    if (gradientVector != null) {
      System.out.print("Gradient vector: ");
      for (int i = 0; i < gradientVector.length && i < this.printFirstN; i++) {
        System.out.print(String.format(" %.4f", gradientVector[i]));
      }
      System.out.print("\n");
    }
    if (weightsVector != null) {
      System.out.print("Weight vector: ");
      for (int i = 0; i < weightsVector.length && i < this.printFirstN; i++) {
        System.out.print(String.format(" %.4f", weightsVector[i]));
      }
      System.out.print("\n");
    }
  }

  /** The underlying LBFGS code minimizes the function, so we negate the function value and the
   *  gradient vector when we want to maximize the function. */
  private boolean runOneIterLBFGSTraining(double functionValue, double[] gradientVector) {
    if (gradientVector.length != numPara) {
      System.out.println("the number of elements in the gradient vector does not equal the number of parameters to be tuned");
      System.exit(0);
    }
    try {
      if (isMinimizer) {
        LBFGS.lbfgs(numPara, numCorrections, weightsVector, functionValue, gradientVector,
            provideDiagonalMatrix, diag, iprint, epsilon, xtol, iflag);
      } else {
        double[] negGradientVector = new double[gradientVector.length];
        for (int i = 0; i < gradientVector.length; i++) {
          negGradientVector[i] = -gradientVector[i];
        }
        LBFGS.lbfgs(numPara, numCorrections, weightsVector, -functionValue, negGradientVector,
            provideDiagonalMatrix, diag, iprint, epsilon, xtol, iflag);
      }
    } catch (LBFGS.ExceptionWithIflag e) {
      /* The line search fails when the function value and gradient have not changed much for at
       * least 20 iterations, so this should be fine since we are in good shape anyway.
       * According to the original paper (www.ece.northwestern.edu/~nocedal/PDFfiles/lbfgsb.pdf):
       * if the line search is unable to find a point with a sufficiently lower value of the
       * objective after 20 evaluations of the objective function and gradient, we conclude that
       * the current direction is not useful. */
      if (e.iflag == -1) {
        System.err.println("Possible reasons could be: \n\t 1. Bug in the feature generation or data handling code\n\t 2. Not enough features to make observed feature value == expected value\n");
      }
      return false;
    }
    return true;
  }
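  /* For reference, a schematic sketch (not present in the original code) of the sign-based
   * Rprop- update rule that runOneIterRPropTraining below delegates to; the actual RProp class
   * in this package may differ in its details. All names and constants here are assumptions of
   * this sketch. */
  private static void rpropUpdateSketch(double[] grad, double[] prevGrad, double[] stepSizes, double[] weights) {
    for (int i = 0; i < weights.length; i++) {
      double signProduct = grad[i] * prevGrad[i];
      if (signProduct > 0) {
        stepSizes[i] = Math.min(stepSizes[i] * 1.2, 50.0); //same direction as before: accelerate
      } else if (signProduct < 0) {
        stepSizes[i] = Math.max(stepSizes[i] * 0.5, 1e-6); //sign flip (overshoot): slow down
      }
      weights[i] -= Math.signum(grad[i]) * stepSizes[i]; //step against the gradient (minimization)
      prevGrad[i] = grad[i];
    }
  }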
  /** RProp-style updates use only the sign of the gradient; the direction of optimization
   *  (minimize or maximize) is handled by the RProp object itself, which was constructed with
   *  the isMinimizer flag. */
  private boolean runOneIterRPropTraining(double functionValue, double[] gradientVector) {
    if (gradientVector.length != numPara) {
      System.out.println("the number of elements in the gradient vector does not equal the number of parameters to be tuned");
      System.exit(0);
    }
    weightsVector = rProp.computeWeight(gradientVector);
    iflag[0] = 1; //not converged
    return true;
  }

  private double[] getCurWeightVector() {
    return weightsVector;
  }

  //note: iflag[0] is initialized to zero, so this function should be called only after calling runOneIterLBFGSTraining
  private boolean isLBFGSConverged() {
    return iflag[0] == 0;
  }

  /** Gaussian (L2) prior: adds ||w||^2 / (2*varianceForL2) to the objective and
   *  w[k]/varianceForL2 to each gradient component (with signs flipped when maximizing).
   *  TODO: this relies on the correctness of weightsVector. */
  private void doL2(double[] gradientVector, double[] resFuncVal) {
    double l2Norm = 0;
    for (int k = 0; k < gradientVector.length; k++) {
      l2Norm += weightsVector[k] * weightsVector[k];
      if (this.isMinimizer)
        gradientVector[k] += weightsVector[k] / this.varianceForL2;
      else
        gradientVector[k] -= weightsVector[k] / this.varianceForL2;
    }
    if (this.isMinimizer)
      resFuncVal[0] += l2Norm / (2.0 * this.varianceForL2);
    else
      resFuncVal[0] -= l2Norm / (2.0 * this.varianceForL2);
    System.out.println("l2Norm is " + l2Norm + " for isMinimizer=" + this.isMinimizer);
  }

  //===================== regularization toward the initial model (minimum conditional entropy)
  private double[] copyInitWeights(double[] weights) {
    double[] initWeights = new double[weights.length];
    for (int i = 0; i < weights.length; i++)
      initWeights[i] = weights[i];
    return initWeights;
  }

  /** Regularized objective f + lambda * ||w - w0||^2, which penalizes divergence from the
   *  initial weights w0; each gradient component gains 2 * lambda * (w[k] - w0[k]). */
  private void doL2ForConditionalEntropy(double[] initWeights, double[] curWeights,
      double[] gradientVector, double[] resFuncVal, double lambda) {
    double l2Norm = 0;
    for (int k = 0; k < gradientVector.length; k++) {
      double difference = curWeights[k] - initWeights[k];
      l2Norm += difference * difference;
      gradientVector[k] += 2 * lambda * difference;
    }
    resFuncVal[0] += lambda * l2Norm;
    System.out.println("L2ForConditionalEntropy is " + l2Norm + " for isMinimizer=" + this.isMinimizer);
  }
}
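/* A usage sketch, shown for illustration only (not part of the original file): it drives the
 * QuadraticToyObjective example above through a full optimization run. The class name is an
 * assumption of this sketch. */
class LBFGSWrapperDemo {
  public static void main(String[] args) {
    //a 10-parameter toy problem; weights start at 1/numPara and should approach the optimum at 1.0
    LBFGSWrapper optimizer = new LBFGSWrapper.QuadraticToyObjective(10);
    //optionally validate the analytic gradient before optimizing (checkGradient is the sketch above)
    System.out.println("gradient check passed: "
        + LBFGSWrapper.checkGradient(optimizer, new double[10], 1e-4));
    double[] bestWeights = optimizer.runLBFGS();
    System.out.println("final function value: " + optimizer.getCurFuncVal());
    System.out.println("first weight: " + bestWeights[0]);
  }
}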