package joshua.discriminative.training.risk_annealer.nbest;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

import joshua.decoder.BLEU;
import joshua.decoder.NbestMinRiskReranker;
import joshua.discriminative.FileUtilityOld;
import joshua.discriminative.training.risk_annealer.GradientComputer;

/**
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @version $LastChangedDate: 2008-10-20 00:12:30 -0400 $
 */
public class NbestRiskGradientComputer extends GradientComputer {

  // ### in general, if the nbest itself changes, all of the following values must be recomputed

  // size: the number of hyps in the nbest
  //ArrayList<String> l_nbest_translations; // the translations themselves: each source has multiple hypothesized translations
  //ArrayList<String> l_ref;
  private List<Double> gainsWithRespectToRef = new ArrayList<Double>();

  // size: the number of hyps in the nbest * num_features
  private List<Double> featureValues = new ArrayList<Double>(); // each hyp has multiple features

  // size: the number of source sentences
  private List<Integer> startPoss = new ArrayList<Integer>(); // inclusive
  private List<Integer> endPoss = new ArrayList<Integer>(); // exclusive

  // ### if the weight vector or the scale changes, the following two lists must be recomputed

  // size: the number of hyps in the nbest
  private List<Double> hypProbs = new ArrayList<Double>();

  // size: the number of source sentences * num_features
  private List<Double> expectedFeatureValues = new ArrayList<Double>(); // each source sentence has a vector

  // for tuning of the scaling factor
  //ArrayList<Double> l_hyp_final_score = new ArrayList<Double>(); // the linear sum, no scaling
  //ArrayList<Double> l_expected_hyp_final_score = new ArrayList<Double>(); // each source sentence has an expected score

  private boolean useLogBleu = false;

  // ============== google linear corpus gain
  private boolean useGoogleLinearCorpusGain = true;
  double[] linearCorpusGainThetas; // weights in the Google linear corpus gain function

  // ### numbers
  private int totalNumSent;
  private double expectedGainSum;
  private double entropySum;

  private String nbesFile;
  private String[] refFiles;
  private boolean useShortestRefLen = true;

  // ## for BLEU
  private static int bleuOrder = 4;
  private static boolean doNgramClip = true;

  /* whenever nbesFile or refFiles changes, we need to reconstruct everything */
  public NbestRiskGradientComputer(String nbesFile, String[] refFiles, boolean useShortestRefLen,
      int totalNumSent, int numFeatures, double gainFactor, double annealingScale,
      double coolingTemperature, boolean computeScalingGradient, double[] linearCorpusGainThetas) {
    super(numFeatures, gainFactor, annealingScale, coolingTemperature, computeScalingGradient);
    this.nbesFile = nbesFile;
    this.refFiles = refFiles;
    this.useShortestRefLen = useShortestRefLen;
    this.totalNumSent = totalNumSent;
    this.linearCorpusGainThetas = linearCorpusGainThetas;
    this.useGoogleLinearCorpusGain = (this.linearCorpusGainThetas != null);
    preprocessCorpus(this.nbesFile, this.refFiles);
  }
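  /*
   * A sketch of the annealed objective maximized by reComputeFunctionValueAndGradient
   * (inferred from computeSentFuncVal and the normalization performed during inference):
   *
   *   F(theta, gamma) = sum_s [ gainFactor * E_p[G(y)] + T * H(p) ]
   *
   * where, for each source sentence s with n-best hypotheses {y_i},
   *
   *   p(y_i) = exp(gamma * theta . f(y_i)) / sum_j exp(gamma * theta . f(y_j)),
   *
   * G(y) is the BLEU-based gain against the references, T the cooling temperature,
   * theta the feature weights, and gamma the scaling factor.
   */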
  /* Use the latest weights, annealing scale, and cooling temperature;
   * the objective here is to maximize the expected gain. */
  public void reComputeFunctionValueAndGradient(double[] weights) {
    double[] weights2 = weights;
    if (shouldComputeGradientForScalingFactor) { // the first weight is the scaling parameter
      // == sanity check
      if (weights.length != numFeatures + 1) {
        System.out.println("number of weights is not right");
        System.exit(1);
      }
      scalingFactor = weights[0]; // note: an old version of this code forgot this assignment
      weights2 = new double[numFeatures];
      for (int i = 0; i < numFeatures; i++)
        weights2[i] = weights[i + 1];
    }

    // == reset values
    for (int i = 0; i < numFeatures; i++)
      gradientsForTheta[i] = 0;
    if (shouldComputeGradientForScalingFactor)
      gradientForScalingFactor = 0;
    functionValue = 0;

    // ### inference on the nbests
    redoCorpusInference(weights2, scalingFactor);

    // ### compute the gradient
    computeCorpusGradient(weights2, gradientsForTheta, temperature, scalingFactor);

    // ### compute the function value
    computeCorpusFuncVal(temperature);
  }

  public void printLastestStatistics() {
    System.out.println("Func value=" + getLatestFunctionValue() + "=" + getLatestExpectedGain()
        + "*" + gainFactor + "+" + getLatestEntropy() + "*" + temperature);
    System.out.println("AVG Expected_gain=" + getLatestExpectedGain() / totalNumSent
        + "%; avg entropy=" + getLatestEntropy() / totalNumSent);
  }

  private double getLatestEntropy() {
    if (Double.isNaN(entropySum)) {
      System.out.println("entropySum is NaN");
      System.exit(1);
    }
    return entropySum;
  }

  private double getLatestExpectedGain() {
    if (Double.isNaN(expectedGainSum)) {
      System.out.println("expectedGainSum is NaN");
      System.exit(1);
    }
    return expectedGainSum;
  }

  // ###### preprocess: fill featureValues, gainsWithRespectToRef, startPoss, and endPoss;
  // the nbest and the references themselves do not need to be stored
  private void preprocessCorpus(String nbestFile, String[] refFiles) {
    System.out.println("preprocess nbest " + nbestFile + " and ref files "
        + Arrays.toString(refFiles));

    // ### read the nbest file in parallel with the reference files
    BufferedReader nbestReader = FileUtilityOld.getReadFileStream(nbestFile, "UTF-8");
    BufferedReader[] refReaders = new BufferedReader[refFiles.length];
    for (int i = 0; i < refFiles.length; i++)
      refReaders[i] = FileUtilityOld.getReadFileStream(refFiles[i], "UTF-8");

    String line = null;
    int oldSentId = -1;
    List<String> nbest = new ArrayList<String>();
    while ((line = FileUtilityOld.readLineLzf(nbestReader)) != null) {
      String[] fds = line.split("\\s+\\|{3}\\s+");
      int newSentID = Integer.parseInt(fds[0]);
      if (oldSentId != -1 && oldSentId != newSentID) {
        // a new source sentence begins: process the completed nbest
        String[] refs = new String[refReaders.length];
        for (int i = 0; i < refReaders.length; i++)
          refs[i] = FileUtilityOld.readLineLzf(refReaders[i]);
        preprocessSentNbest(nbest, oldSentId, refs);
        nbest.clear();
      }
      oldSentId = newSentID;
      nbest.add(line);
    }

    // the nbest of the last source sentence
    String[] refs = new String[refReaders.length];
    for (int i = 0; i < refReaders.length; i++)
      refs[i] = FileUtilityOld.readLineLzf(refReaders[i]);
    preprocessSentNbest(nbest, oldSentId, refs);
    nbest.clear();

    FileUtilityOld.closeReadFile(nbestReader);
    for (int i = 0; i < refReaders.length; i++)
      FileUtilityOld.closeReadFile(refReaders[i]);

    System.out.println("after preprocessing");
    System.out.println("featureValues size " + featureValues.size());
    System.out.println("gainsWithRespectToRef size " + gainsWithRespectToRef.size());
  }

  private void preprocessSentNbest(List<String> nbest, int sentID, String[] refs) {
    // ### record the start and end positions of this nbest
    int start_pos = gainsWithRespectToRef.size(); // inclusive
    int end_pos = start_pos + nbest.size(); // exclusive
    startPoss.add(start_pos);
    endPoss.add(end_pos);

    // ### compute the gain of each hyp with respect to the refs, and record the feature values
    for (String hyp : nbest) {
      String[] fds = hyp.split("\\s+\\|{3}\\s+");

      // gain
      double gain = 0;
      if (useGoogleLinearCorpusGain) {
        int hypLength = fds[1].split("\\s+").length;
        HashMap<String, Integer> referenceNgramTable =
            BLEU.constructMaxRefCountTable(refs, bleuOrder);
        HashMap<String, Integer> hypNgramTable = BLEU.constructNgramTable(fds[1], bleuOrder);
        gain = BLEU.computeLinearCorpusGain(linearCorpusGainThetas, hypLength, hypNgramTable,
            referenceNgramTable);
      } else {
        gain = BLEU.computeSentenceBleu(refs, fds[1], doNgramClip, bleuOrder, useShortestRefLen);
      }

      if (useLogBleu) {
        if (gain == 0)
          gainsWithRespectToRef.add(0.0); // treat log(0) as 0
        else
          gainsWithRespectToRef.add(Math.log(gain));
      } else {
        gainsWithRespectToRef.add(gain);
      }

      hypProbs.add(0.0); // placeholder probability, filled in by redoSentInference

      // feature values
      String[] logFeatProb = fds[2].split("\\s+");
      for (int i = 0; i < logFeatProb.length; i++) {
        featureValues.add(Double.parseDouble(logFeatProb[i]));
      }
    }

    // placeholder feature expectations, filled in by redoSentInference
    for (int i = 0; i < numFeatures; i++) {
      expectedFeatureValues.add(0.0);
    }
  }
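  /*
   * The preprocessing above assumes Joshua-style n-best lines; a hypothetical example:
   *
   *   12 ||| the cat sat on the mat ||| -3.2 1.7 -0.5
   *
   * After splitting on "\s+\|{3}\s+", fds[0] is the source-sentence id, fds[1] the
   * hypothesis, and fds[2] the whitespace-separated feature values (numFeatures of them).
   * Hypotheses of the same source sentence must be consecutive in the file.
   */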
  // ================= inference: based on the current weight vector and scaling factor,
  // update hypProbs and expectedFeatureValues
  private void redoCorpusInference(double[] weights, double scaling_factor) {
    for (int i = 0; i < totalNumSent; i++) {
      redoSentInference(i, weights, scaling_factor);
    }
  }

  private void redoSentInference(int sent_id, double[] weights, double scaling_factor) {
    int start_pos = startPoss.get(sent_id);
    int end_pos = endPoss.get(sent_id);
    List<Double> nbestLogProbs = hypProbs.subList(start_pos, end_pos);

    // ### first reset nbestLogProbs to the new final scores, reflecting the change of the weight vector
    for (int i = 0; i < nbestLogProbs.size(); i++) {
      double final_score = 0;
      for (int j = 0; j < numFeatures; j++) {
        double hyp_feat_val = getFeatVal(start_pos, i, j);
        final_score += hyp_feat_val * weights[j];
      }
      if (Double.isNaN(final_score)) {
        System.out.println("final_score is NaN, must be wrong; " + final_score);
        for (int t = 0; t < weights.length; t++)
          System.out.println("weight: " + weights[t]);
        System.exit(1);
      }
      nbestLogProbs.set(i, final_score);
    }

    // ### renormalize into a probability distribution; this changes hypProbs in place
    // through the sublist view
    NbestMinRiskReranker.computeNormalizedProbs(nbestLogProbs, scaling_factor);

    // ### recompute the expectations of the feature values
    double[] expectedValues = new double[numFeatures];
    for (int i = 0; i < nbestLogProbs.size(); i++) {
      double prob = nbestLogProbs.get(i);
      for (int j = 0; j < numFeatures; j++) {
        double hypFeatVal = getFeatVal(start_pos, i, j);
        expectedValues[j] += hypFeatVal * prob;
      }
    }

    // set the expected feature values
    List<Double> expectedFeatScores = getSentExpectedFeatureScoreList(sent_id);
    for (int j = 0; j < numFeatures; j++) {
      expectedFeatScores.set(j, expectedValues[j]);
    }
  }
  // ================= inference: END

  // ================= compute the gradient
  private void computeCorpusGradient(double[] weights, double[] gradients, double temperature,
      double scale) {
    for (int i = 0; i < totalNumSent; i++) {
      accumulateSentGradient(i, temperature, weights, gradients, scale);
    }
  }
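  /*
   * Gradient sketch, matching the accumulation below: for each hypothesis y_i of a
   * sentence, with p = p(y_i), f_j(y_i) the j-th feature value, and E[f_j] its
   * expectation under p,
   *
   *   dF/dtheta_j += gamma * p * (f_j(y_i) - E[f_j]) * (gainFactor * G(y_i) - T * (log p + 1))
   *
   * and, for the scaling factor gamma,
   *
   *   dF/dgamma += p * (theta . f(y_i) - E[theta . f]) * (gainFactor * G(y_i) - T * (log p + 1)).
   */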
  // accumulate the gradient of one sentence into gradients
  private void accumulateSentGradient(int sentID, double temperature, double[] weights,
      double[] gradients, double scale) {
    int start_pos = startPoss.get(sentID);
    int end_pos = endPoss.get(sentID);
    List<Double> nbestProbs = hypProbs.subList(start_pos, end_pos);
    List<Double> sentGains = gainsWithRespectToRef.subList(start_pos, end_pos);
    List<Double> sentExpectedFeatValues = getSentExpectedFeatureScoreList(sentID);

    double expectedHypFinalScore = 0;
    for (int j = 0; j < numFeatures; j++) {
      expectedHypFinalScore += sentExpectedFeatValues.get(j) * weights[j];
    }

    for (int i = 0; i < nbestProbs.size(); i++) {
      double hypFinalScore = 0;
      double prob = nbestProbs.get(i);
      double gain = sentGains.get(i) * gainFactor;

      double entropyFactor;
      if (prob == 0)
        entropyFactor = -temperature * (0 + 1); // from +T*H(P); treat log(0) as 0, as it is otherwise not well defined
      else
        entropyFactor = -temperature * (Math.log(prob) + 1); // from +T*H(P)

      double anotherSentGradientForScaling = 0; // an alternative way to compute the scaling-factor gradient
      for (int j = 0; j < numFeatures; j++) {
        double hypFeatVal = getFeatVal(start_pos, i, j);
        hypFinalScore += hypFeatVal * weights[j];
        double common = scale * prob * (hypFeatVal - sentExpectedFeatValues.get(j));
        double sentGradient = common * (gain + entropyFactor);
        gradients[j] += sentGradient;
        anotherSentGradientForScaling += sentGradient * weights[j];
      }
      anotherSentGradientForScaling /= scale;

      // compute the gradient for the scaling factor
      if (shouldComputeGradientForScalingFactor) {
        double common = prob * (hypFinalScore - expectedHypFinalScore);
        double sentGradientForScaling = common * (gain + entropyFactor);
        gradientForScalingFactor += sentGradientForScaling;

        // ====== sanity check: the two ways of computing the scaling gradient must agree
        if (Math.abs(sentGradientForScaling - anotherSentGradientForScaling) > 1e-2) {
          System.out.println("gradientForScalingFactor is not equal; " + sentGradientForScaling
              + "!=" + anotherSentGradientForScaling + "; scale=" + scale);
          System.exit(1);
        }
      }
    }
  }

  private void computeCorpusFuncVal(double temperature) {
    functionValue = 0;
    expectedGainSum = 0;
    entropySum = 0;
    for (int i = 0; i < totalNumSent; i++) {
      computeSentFuncVal(i, temperature);
    }
  }

  private void computeSentFuncVal(int sentID, double temperature) {
    int start_pos = startPoss.get(sentID);
    int end_pos = endPoss.get(sentID);
    List<Double> nbestGains = gainsWithRespectToRef.subList(start_pos, end_pos);
    List<Double> nbestProbs = hypProbs.subList(start_pos, end_pos);

    double expectedGain = computeExpectedGain(nbestGains, nbestProbs);
    // always computed, though it does not contribute to the objective when the temperature is zero
    double entropy = computeEntropy(nbestProbs);

    expectedGainSum += expectedGain;
    entropySum += entropy;
    functionValue += expectedGain * gainFactor + entropy * temperature; // the function being maximized
  }
  // ================= compute the gradient: END

  // ###### utility functions

  // entropy H(P) in the natural base
  static public double computeEntropy(List<Double> nbestProbs) {
    double entropy = 0;
    double tSum = 0;
    for (double prob : nbestProbs) {
      if (prob != 0) // log(0) is not well defined
        entropy -= prob * Math.log(prob); // natural base
      tSum += prob;
    }

    // sanity checks
    if (Math.abs(tSum - 1.0) > 1e-4) {
      System.out.println("probabilities do not sum to one, must be wrong");
      System.exit(1);
    }
    if (Double.isNaN(entropy)) {
      System.out.println("entropy is NaN, must be wrong");
      System.exit(1);
    }
    if (entropy < 0 || entropy > Math.log(nbestProbs.size() + 1e-2)) {
      System.out.println("entropy is negative or above its upper bound, must be wrong; " + entropy);
      System.exit(1);
    }
    return entropy;
  }
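  /*
   * A worked example for the checks above: a uniform distribution attains the entropy
   * upper bound log(n), e.g.
   *
   *   computeEntropy(Arrays.asList(0.25, 0.25, 0.25, 0.25)) == Math.log(4)  // ~1.386
   *
   * and KL(P||P) == 0 for any distribution P, the lower bound enforced below.
   */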
  // KL(P||Q), computed in the real domain with the natural base
  static public double computeKLDivergence(List<Double> P, List<Double> Q) {
    double divergence = 0;
    if (P.size() != Q.size()) {
      System.out.println("the event spaces of the two distributions have different sizes");
      System.exit(1);
    }

    double pSum = 0;
    double qSum = 0;
    for (int i = 0; i < P.size(); i++) {
      double p = P.get(i);
      double q = Q.get(i);
      double logRatio = 0;
      if (q == 0 && p != 0) {
        System.out.println("q is zero but p is not; KL is not well defined");
        System.exit(1);
      } else if (p == 0 || q == 0) {
        logRatio = 0;
      } else { // both p and q are non-zero
        logRatio = Math.log(p / q);
      }
      divergence += p * logRatio;
      pSum += p;
      qSum += q;
    }

    // sanity checks
    if (divergence < 0) {
      System.out.println("divergence is negative, must be wrong");
      System.exit(1);
    }
    if (Math.abs(pSum - 1.0) > 1e-4) {
      System.out.println("P does not sum to one, must be wrong");
      System.exit(1);
    }
    if (Math.abs(qSum - 1.0) > 1e-4) {
      System.out.println("Q does not sum to one, must be wrong");
      System.exit(1);
    }
    return divergence;
  }

  // Gain(e) = \sum_{e'} G(e, e') P(e'), where e is the current hyp and e' ranges over
  // the hyps treated as the truth
  private double computeExpectedGain(List<Double> nbestGains, List<Double> nbestProbs) {
    double expectedGain = 0;
    for (int i = 0; i < nbestGains.size(); i++) {
      double gain = nbestGains.get(i);
      double trueProb = nbestProbs.get(i);
      expectedGain += trueProb * gain;
    }

    // sanity checks
    if (Double.isNaN(expectedGain)) {
      System.out.println("expectedGain is NaN, must be wrong");
      System.exit(1);
    }
    if (!useGoogleLinearCorpusGain) {
      if (useLogBleu) {
        if (expectedGain > 1e-2) {
          System.out.println("expectedGain is not below zero when using log-BLEU, must be wrong: "
              + expectedGain);
          System.exit(1);
        }
      } else {
        if (expectedGain < -(1e-2) || expectedGain > 1 + 1e-2) {
          System.out.println("expectedGain is not within [0,1], must be wrong: " + expectedGain);
          System.exit(1);
        }
      }
    }
    return expectedGain;
  }

  private List<Double> getSentExpectedFeatureScoreList(int sentID) {
    return expectedFeatureValues.subList(sentID * numFeatures, (sentID + 1) * numFeatures);
  }

  private double getFeatVal(int startPos, int hypID, int featID) {
    return featureValues.get((startPos + hypID) * numFeatures + featID);
  }
  // ###### utility functions: END
}
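/*
 * Usage sketch (the file names and sizes here are hypothetical, not part of this class):
 *
 *   NbestRiskGradientComputer computer = new NbestRiskGradientComputer(
 *       "nbest.txt", new String[]{ "ref0.txt" },
 *       true,  // useShortestRefLen
 *       100,   // totalNumSent
 *       2,     // numFeatures
 *       1.0,   // gainFactor
 *       1.0,   // annealingScale
 *       0.0,   // coolingTemperature: 0 means pure expected-gain (risk) training
 *       false, // computeScalingGradient
 *       null); // null thetas: use sentence-level BLEU rather than the linear corpus gain
 *   computer.reComputeFunctionValueAndGradient(new double[]{ 1.0, 1.0 });
 *   computer.printLastestStatistics();
 */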