package joshua.discriminative.training.learning_algorithm;

import java.util.HashMap;
import java.util.Map;

/* This class implements functions common to gradient-based optimizers:
 * (1) gradient computation
 * (2) batch update
 * (3) cooling schedule
 * (4) regularization
 */
public abstract class GradientBasedOptimizer {
  private int BATCH_UPDATE_SIZE = 30; // update the model after processing BATCH_UPDATE_SIZE examples
  private int TRAIN_SIZE = 610000; // number of training examples
  private int CONVERGE_PASS = 1; // assume the model converges after CONVERGE_PASS passes over the data

  // T is chosen so that the gain is halved after CONVERGE_PASS passes through the data
  // (with the defaults: 610k * 1 / 30 updates)
  private double COOLING_SCHEDULE_T = TRAIN_SIZE * CONVERGE_PASS * 1.0 / BATCH_UPDATE_SIZE;

  private double INITIAL_GAIN = 0.1;
  private double SIGMA = 1.0; // the smaller SIGMA is, the sharper the prior and the more regularized the model (more feature weights are pushed close to zero)
  private double REG_CONSTANT_RATIO = 0;
  private boolean IS_MINIMIZE_SCORE = false;

  protected int numModelChanges = 0; // how many times the model has been updated

  private boolean noRegularization = false; // default is with regularization (the perceptron does not use it)
  private boolean noCooling = false; // default is with cooling

  public void set_no_regularization() {
    noRegularization = true;
  }

  public void set_no_cooling() {
    noCooling = true;
  }

  public GradientBasedOptimizer(int train_size, int batch_update_size, int converge_pass,
      double init_gain, double sigma, boolean is_minimize_score) {
    TRAIN_SIZE = train_size;
    BATCH_UPDATE_SIZE = batch_update_size;
    CONVERGE_PASS = converge_pass;
    INITIAL_GAIN = init_gain;
    COOLING_SCHEDULE_T = TRAIN_SIZE * CONVERGE_PASS * 1.0 / BATCH_UPDATE_SIZE;
    SIGMA = sigma;
    REG_CONSTANT_RATIO = BATCH_UPDATE_SIZE * 1.0 / (TRAIN_SIZE * SIGMA * SIGMA);
    IS_MINIMIZE_SCORE = is_minimize_score;

    System.out.println("TRAIN_SIZE: " + TRAIN_SIZE);
    System.out.println("BATCH_UPDATE_SIZE: " + BATCH_UPDATE_SIZE);
    System.out.println("CONVERGE_PASS: " + CONVERGE_PASS);
    System.out.println("INITIAL_GAIN: " + INITIAL_GAIN);
    System.out.println("COOLING_SCHEDULE_T: " + COOLING_SCHEDULE_T);
    System.out.println("SIGMA: " + SIGMA);
    System.out.println("REG_CONSTANT_RATIO: " + REG_CONSTANT_RATIO);
    System.out.println("IS_MINIMIZE_SCORE: " + IS_MINIMIZE_SCORE);
  }

  public abstract void initModel(double minValue, double maxValue); // random start

  public abstract void updateModel(HashMap<String, Double> tbl_feats_empirical,
      HashMap<String, Double> tbl_feats_model);

  public abstract HashMap<String, Double> getAvgModel();

  public abstract HashMap<String, Double> getSumModel();

  public abstract void setFeatureWeight(String feat, double weight);

  public int getBatchSize() {
    return BATCH_UPDATE_SIZE;
  }

  // ########################### common between CRF and Perceptron ######################

  /* Compute the (sparse) gradient as the difference between the empirical (oracle)
   * feature counts and the model (1-best) feature counts; the sign is flipped when
   * we are minimizing a cost instead of maximizing a probability. */
  protected HashMap<String, Double> getGradient(HashMap<String, Double> empiricalFeatsTbl,
      HashMap<String, Double> modelFeatsTbl) {
    HashMap<String, Double> res = new HashMap<String, Double>();

    // ## process the empirical (oracle) features
    for (Map.Entry<String, Double> entry : empiricalFeatsTbl.entrySet()) {
      String key = entry.getKey();
      double gradient = entry.getValue();
      Double v_1best = modelFeatsTbl.get(key);
      if (v_1best != null)
        gradient -= v_1best; // v_oracle - v_1best
      if (gradient != 0) { // TODO
        if (IS_MINIMIZE_SCORE)
          res.put(key, -gradient); // note: we are minimizing the cost
        else
          res.put(key, gradient); // note: we are maximizing the probability
      }
    }

    // ## process the model (1-best) features
    for (Map.Entry<String, Double> entry : modelFeatsTbl.entrySet()) {
      String key = entry.getKey();
      Double v_oracle = empiricalFeatsTbl.get(key);
      if (v_oracle == null) { // this feature fires only in the 1-best, not in the oracle
        if (IS_MINIMIZE_SCORE)
          res.put(key, entry.getValue()); // note: we are minimizing the cost
        else
          res.put(key, -entry.getValue()); // note: we are maximizing the probability
      }
    }
    // System.out.println("gradient size is: " + res.size());
    return res;
  }

  /* Cooling schedule: the gain decays as gain(t) = INITIAL_GAIN * T / (T + t),
   * where t (iterNumber) is the number of model updates so far. */
  protected double computeGain(int iterNumber) {
    if (noCooling)
      return 1.0;
    else
      return INITIAL_GAIN * COOLING_SCHEDULE_T / (COOLING_SCHEDULE_T + iterNumber);
  }

  /* Regularization from a Gaussian prior: on each update, existing weights are
   * scaled toward zero by this factor. */
  protected double computeRegularizationScale(double updateGain) {
    if (noRegularization)
      return 1.0;
    else
      return 1.0 - REG_CONSTANT_RATIO * updateGain;
  }

  // end ########################### common between CRF and Perceptron ######################
}
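
/* A minimal sketch (not part of the original file) of how a concrete subclass might
 * wire getGradient, computeGain, and computeRegularizationScale into a batch update.
 * The class name, the single `model` table, and the update rule below are illustrative
 * assumptions; the actual CRF/Perceptron subclasses may differ. Regularization is
 * applied lazily here (only to features touched by the current gradient), a common
 * simplification rather than the exact dense update. */
class SketchOptimizer extends GradientBasedOptimizer {
  private HashMap<String, Double> model = new HashMap<String, Double>();

  public SketchOptimizer(int train_size, int batch_update_size, int converge_pass,
      double init_gain, double sigma, boolean is_minimize_score) {
    super(train_size, batch_update_size, converge_pass, init_gain, sigma, is_minimize_score);
  }

  public void initModel(double minValue, double maxValue) {
    // random restart within [minValue, maxValue]; left empty in this sketch
  }

  public void updateModel(HashMap<String, Double> tbl_feats_empirical,
      HashMap<String, Double> tbl_feats_model) {
    numModelChanges++;
    double gain = computeGain(numModelChanges); // cooled learning rate
    double regScale = computeRegularizationScale(gain); // Gaussian-prior decay factor
    HashMap<String, Double> gradient = getGradient(tbl_feats_empirical, tbl_feats_model);
    for (Map.Entry<String, Double> entry : gradient.entrySet()) {
      Double old = model.get(entry.getKey());
      double oldVal = (old == null) ? 0.0 : old.doubleValue();
      // decay the old weight toward zero, then take a gradient step
      model.put(entry.getKey(), oldVal * regScale + gain * entry.getValue());
    }
  }

  public HashMap<String, Double> getAvgModel() {
    return model; // no weight averaging in this sketch
  }

  public HashMap<String, Double> getSumModel() {
    return model;
  }

  public void setFeatureWeight(String feat, double weight) {
    model.put(feat, weight);
  }
}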