package joshua.discriminative.training.lbfgs;

/**
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @version $LastChangedDate: 2008-10-20 00:12:30 -0400 $
 */
public abstract class LBFGSWrapper {
  //==== configurable variables
  private int numPara;
  private double[] weightsVector; //we can initialize this
  private double lastFunctionVal;
  private boolean isMinimizer = true;
  private int maxNumCall = 100; //run at most 100 iterations (i.e., function and gradient evaluations) for this particular run

  //==== stopping criterion
  private double relativeFuncThreshold = 1e-3; //terminate when the relative change of the function value is smaller than this value
  private int maxPassConverge = 3;

  //==== default values required by the LBFGS optimizer
  private boolean provideDiagonalMatrix = false;
  private double[] diag;
  private int numCorrections = 21; //number of past updates used to approximate the Hessian; our problem is small, so we can afford a large value
  private double epsilon = 1.0e-5; //determines the accuracy with which the solution is to be found: gnorm < eps*xnorm
  private double xtol = 1.0e-16; //machine precision
  private int[] iprint;
  private int[] iflag;

  private boolean useRProp = false;
  private RProp rProp = null;

  boolean useL2Regula = false;
  double varianceForL2 = 1;

  //regularize so that the current model does not deviate too much from the original model
  boolean useModelDivergenceRegula = false;
  double lambda = 1;
  double[] initWeights;

  //print debug information for the first N parameters
  int printFirstN = 0;

  /**
   * Input:
   *   curWeights: the current weight vector
   * Output:
   *   resFuncVal: a one-element array whose single element should be set to the function value
   * Return:
   *   the gradient vector at the current weight vector
   * Notes:
   *   LBFGS changes neither the returned gradient vector nor resFuncVal
   */
  public abstract double[] computeFuncValAndGradient(double[] curWeights, double[] resFuncVal);
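  /* A minimal concrete subclass, shown for illustration only (it is not part of the original
   * code): it minimizes the toy quadratic f(w) = sum_i (w_i - 1)^2, whose gradient components
   * are 2*(w_i - 1). The class name and the objective are assumptions made for this sketch. */
  public static class QuadraticToyObjective extends LBFGSWrapper {
    public QuadraticToyObjective(int numPara) {
      //minimize, with no L2 or model-divergence regularization and no debug printing
      super(numPara, null, true, false, 1.0, false, 1.0, 0);
    }

    @Override
    public double[] computeFuncValAndGradient(double[] curWeights, double[] resFuncVal) {
      double funcVal = 0;
      double[] gradient = new double[curWeights.length];
      for (int i = 0; i < curWeights.length; i++) {
        double diff = curWeights[i] - 1.0;
        funcVal += diff * diff;
        gradient[i] = 2.0 * diff;
      }
      resFuncVal[0] = funcVal; //report the function value through the one-element output array
      return gradient;
    }
  }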
  public LBFGSWrapper(int numPara, double[] initWeights, boolean isMinimizer,
      boolean useL2Regula, double varianceForL2,
      boolean useModelDivergenceRegula, double lambda, int printFirstN) {
    this.isMinimizer = isMinimizer;
    this.useL2Regula = useL2Regula;
    this.varianceForL2 = varianceForL2;

    //### set the weight vector
    this.numPara = numPara;
    this.weightsVector = new double[numPara];
    for (int i = 0; i < numPara; i++) {
      if (initWeights != null)
        weightsVector[i] = initWeights[i];
      else
        weightsVector[i] = 1.0 / numPara; //TODO
    }

    //for model divergence regularization
    this.useModelDivergenceRegula = useModelDivergenceRegula;
    this.lambda = lambda;
    if (useModelDivergenceRegula) {
      this.initWeights = copyInitWeights(initWeights);
    }

    this.diag = new double[numPara]; //lbfgs requires this array even when we do not set its values

    //### set the print options
    this.iprint = new int[2];
    this.iprint[0] = -1; //frequency of the output: print at each iteration
    this.iprint[1] = 0; //type of output generated

    //### set the status flag
    this.iflag = new int[1];
    this.iflag[0] = 0; //this makes sure LBFGS clears all of its state information
    //numCorrections = numPara < 7 ? numPara : 7;

    if (useRProp) {
      System.out.println("=========== using RProp =============");
      rProp = new RProp(initWeights, numPara, isMinimizer);
    }

    this.printFirstN = printFirstN;
  }

  /** Call LBFGS for multiple iterations to get the best weights. */
  public double[] runLBFGS() {
    int numCalls = 0;
    double bestFunctionVal = 0;
    lastFunctionVal = 0;
    double[] gradientVector = null;
    double[] resFuncVal = new double[1];
    int checkConverge = 0;

    while (numCalls == 0 || (isLBFGSConverged() == false && numCalls <= maxNumCall)) {
      gradientVector = computeFuncValAndGradient(getCurWeightVector(), resFuncVal);

      if (this.useModelDivergenceRegula) {
        this.doL2ForConditionalEntropy(this.initWeights, getCurWeightVector(), gradientVector,
            resFuncVal, this.lambda);
      }
      if (useL2Regula) { //adjust gradientVector and resFuncVal
        doL2(gradientVector, resFuncVal);
      }

      double oldFunctionVal = lastFunctionVal;
      lastFunctionVal = resFuncVal[0];

      //check convergence
      if (numCalls != 0
          && Math.abs((lastFunctionVal - oldFunctionVal) / oldFunctionVal) < relativeFuncThreshold) {
        System.out.println("oldFunctionVal=" + oldFunctionVal + "; new=" + lastFunctionVal
            + "; checkConverge=" + checkConverge);
        checkConverge++;
        if (checkConverge >= maxPassConverge) { //no significant change for maxPassConverge consecutive iterations
          System.out.println("LBFGS stops early because the function value does not change; break at iter " + numCalls);
          break;
        }
      } else {
        checkConverge = 0;
      }

      if (numCalls == 0) {
        bestFunctionVal = lastFunctionVal;
      } else {
        if (isMinimizer)
          bestFunctionVal = Math.min(bestFunctionVal, lastFunctionVal);
        else
          bestFunctionVal = Math.max(bestFunctionVal, lastFunctionVal);
      }

      boolean success = false;
      if (this.useRProp)
        success = runOneIterRPropTraining(lastFunctionVal, gradientVector); //changes weightsVector in place
      else
        success = runOneIterLBFGSTraining(lastFunctionVal, gradientVector); //changes weightsVector in place

      //TODO: should we maintain the best function value and weight vector, since the lbfgs line
      //search might fail? (even when it fails, the function value seems to be the best among all iterations)
      if (!success) {
        System.out.println("Line search failed after " + numCalls + " calls");
        break;
      }

      numCalls++;
      printStatistics(numCalls, lastFunctionVal, gradientVector, weightsVector);
    }
    printStatistics(numCalls, lastFunctionVal, gradientVector, weightsVector);

    if (isMinimizer == true && lastFunctionVal > bestFunctionVal) {
      System.out.println("LBFGS returns a bad optimal value; best: " + bestFunctionVal + "; last: " + lastFunctionVal);
    }
    if (isMinimizer == false && lastFunctionVal < bestFunctionVal) {
      System.out.println("LBFGS returns a bad optimal value; best: " + bestFunctionVal + "; last: " + lastFunctionVal);
    }
    return weightsVector;
  }
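  /* A hedged helper sketch, not present in the original code: a central-difference gradient
   * check is a standard way to validate a computeFuncValAndGradient implementation before
   * handing it to L-BFGS. The method name, step size, and tolerance are assumptions of this
   * sketch. */
  public static boolean checkGradient(LBFGSWrapper problem, double[] weights, double tolerance) {
    double[] funcVal = new double[1];
    double[] analytic = problem.computeFuncValAndGradient(weights, funcVal);
    double h = 1e-6; //finite-difference step size (assumed)
    for (int i = 0; i < weights.length; i++) {
      double saved = weights[i];
      double[] fPlus = new double[1];
      double[] fMinus = new double[1];
      weights[i] = saved + h;
      problem.computeFuncValAndGradient(weights, fPlus);
      weights[i] = saved - h;
      problem.computeFuncValAndGradient(weights, fMinus);
      weights[i] = saved; //restore the weight before moving on
      double numeric = (fPlus[0] - fMinus[0]) / (2 * h); //central difference approximation
      if (Math.abs(numeric - analytic[i]) > tolerance) {
        System.out.println("gradient mismatch at dimension " + i + ": analytic=" + analytic[i] + "; numeric=" + numeric);
        return false;
      }
    }
    return true;
  }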
  public double getCurFuncVal() {
    return lastFunctionVal;
  }

  public void printStatistics(int iterNum, double funcVal, double[] gradientVector, double[] weightsVector) {
    System.out.println("=======Func value: " + funcVal + " at iteration number " + iterNum);
    if (printFirstN <= 0)
      return;
    if (gradientVector != null) {
      System.out.print("Gradient vector: ");
      for (int i = 0; i < gradientVector.length && i < this.printFirstN; i++) {
        System.out.print(String.format(" %.4f", gradientVector[i]));
      }
      System.out.print("\n");
    }
    if (weightsVector != null) {
      System.out.print("Weight vector: ");
      for (int i = 0; i < weightsVector.length && i < this.printFirstN; i++) {
        System.out.print(String.format(" %.4f", weightsVector[i]));
      }
      System.out.print("\n");
    }
  }

  /** The underlying LBFGS code minimizes the function, so we negate the function value and the
   *  gradient vector when we want to maximize the function. */
  private boolean runOneIterLBFGSTraining(double functionValue, double[] gradientVector) {
    if (gradientVector.length != numPara) {
      System.out.println("the number of elements in the gradient vector does not equal the number of parameters to be tuned");
      System.exit(0);
    }
    try {
      if (isMinimizer) {
        LBFGS.lbfgs(numPara, numCorrections, weightsVector, functionValue, gradientVector,
            provideDiagonalMatrix, diag, iprint, epsilon, xtol, iflag);
      } else {
        double[] negGradientVector = new double[gradientVector.length];
        for (int i = 0; i < gradientVector.length; i++) {
          negGradientVector[i] = -gradientVector[i];
        }
        LBFGS.lbfgs(numPara, numCorrections, weightsVector, -functionValue, negGradientVector,
            provideDiagonalMatrix, diag, iprint, epsilon, xtol, iflag);
      }
    } catch (LBFGS.ExceptionWithIflag e) {
      /* The line search fails when the function value and gradient have not changed much for at
       * least 20 iterations, so this should be fine since we are in good shape anyway.
       * According to the original paper (www.ece.northwestern.edu/~nocedal/PDFfiles/lbfgsb.pdf):
       * if the line search is unable to find a point with a sufficiently lower value of the
       * objective after 20 evaluations of the objective function and gradient, we conclude that
       * the current direction is not useful. */
      if (e.iflag == -1) {
        System.err.println("Possible reasons could be: \n\t 1. Bug in the feature generation or data handling code\n\t 2. Not enough features to make observed feature value == expected value\n");
      }
      return false;
    }
    return true;
  }
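  /* For reference, a schematic sketch (not present in the original code) of the sign-based
   * Rprop- update rule that runOneIterRPropTraining below delegates to; the actual RProp class
   * in this package may differ in its details. All names and constants here are assumptions of
   * this sketch. */
  private static void rpropUpdateSketch(double[] grad, double[] prevGrad, double[] stepSizes, double[] weights) {
    for (int i = 0; i < weights.length; i++) {
      double signProduct = grad[i] * prevGrad[i];
      if (signProduct > 0) {
        stepSizes[i] = Math.min(stepSizes[i] * 1.2, 50.0); //same direction as before: accelerate
      } else if (signProduct < 0) {
        stepSizes[i] = Math.max(stepSizes[i] * 0.5, 1e-6); //sign flip (overshoot): slow down
      }
      weights[i] -= Math.signum(grad[i]) * stepSizes[i]; //step against the gradient (minimization)
      prevGrad[i] = grad[i];
    }
  }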
  /** RProp-style updates use only the sign of the gradient; the direction of optimization
   *  (minimize or maximize) is handled by the RProp object itself, which was constructed with
   *  the isMinimizer flag. */
  private boolean runOneIterRPropTraining(double functionValue, double[] gradientVector) {
    if (gradientVector.length != numPara) {
      System.out.println("the number of elements in the gradient vector does not equal the number of parameters to be tuned");
      System.exit(0);
    }
    weightsVector = rProp.computeWeight(gradientVector);
    iflag[0] = 1; //not converged
    return true;
  }

  private double[] getCurWeightVector() {
    return weightsVector;
  }

  //note: iflag[0] is initialized to zero, so this function should be called only after calling runOneIterLBFGSTraining
  private boolean isLBFGSConverged() {
    return iflag[0] == 0;
  }

  /** Gaussian (L2) prior: adds ||w||^2 / (2*varianceForL2) to the objective and
   *  w[k]/varianceForL2 to each gradient component (with signs flipped when maximizing).
   *  TODO: this relies on the correctness of weightsVector. */
  private void doL2(double[] gradientVector, double[] resFuncVal) {
    double l2Norm = 0;
    for (int k = 0; k < gradientVector.length; k++) {
      l2Norm += weightsVector[k] * weightsVector[k];
      if (this.isMinimizer)
        gradientVector[k] += weightsVector[k] / this.varianceForL2;
      else
        gradientVector[k] -= weightsVector[k] / this.varianceForL2;
    }
    if (this.isMinimizer)
      resFuncVal[0] += l2Norm / (2.0 * this.varianceForL2);
    else
      resFuncVal[0] -= l2Norm / (2.0 * this.varianceForL2);
    System.out.println("l2Norm is " + l2Norm + " for isMinimizer=" + this.isMinimizer);
  }

  //===================== regularization toward the initial model (minimum conditional entropy)
  private double[] copyInitWeights(double[] weights) {
    double[] initWeights = new double[weights.length];
    for (int i = 0; i < weights.length; i++)
      initWeights[i] = weights[i];
    return initWeights;
  }

  /** Regularized objective f + lambda * ||w - w0||^2, which penalizes divergence from the
   *  initial weights w0; each gradient component gains 2 * lambda * (w[k] - w0[k]). */
  private void doL2ForConditionalEntropy(double[] initWeights, double[] curWeights,
      double[] gradientVector, double[] resFuncVal, double lambda) {
    double l2Norm = 0;
    for (int k = 0; k < gradientVector.length; k++) {
      double difference = curWeights[k] - initWeights[k];
      l2Norm += difference * difference;
      gradientVector[k] += 2 * lambda * difference;
    }
    resFuncVal[0] += lambda * l2Norm;
    System.out.println("L2ForConditionalEntropy is " + l2Norm + " for isMinimizer=" + this.isMinimizer);
  }
}
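/* A usage sketch, shown for illustration only (not part of the original file): it drives the
 * QuadraticToyObjective example above through a full optimization run. The class name is an
 * assumption of this sketch. */
class LBFGSWrapperDemo {
  public static void main(String[] args) {
    //a 10-parameter toy problem; weights start at 1/numPara and should approach the optimum at 1.0
    LBFGSWrapper optimizer = new LBFGSWrapper.QuadraticToyObjective(10);
    //optionally validate the analytic gradient before optimizing (checkGradient is the sketch above)
    System.out.println("gradient check passed: "
        + LBFGSWrapper.checkGradient(optimizer, new double[10], 1e-4));
    double[] bestWeights = optimizer.runLBFGS();
    System.out.println("final function value: " + optimizer.getCurFuncVal());
    System.out.println("first weight: " + bestWeights[0]);
  }
}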