// BLEU.java example
//
// Explorer
// relax-decode-master
// - third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.decoder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import joshua.corpus.vocab.SymbolTable;
import joshua.util.Ngram;
import joshua.util.Regex;


/**
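 * Implements sentence-level BLEU with smoothing (for a single reference
 * or for multiple references), plus the linear corpus gain approximation
 * and its per-order n-gram match statistics.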
 * 
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @version $LastChangedDate: 2010-02-02 17:15:12 -0600 (Tue, 02 Feb 2010) $
 */
public class BLEU {
	//do_ngram_clip: consider global n-gram clip

	/**
	 * Computes smoothed sentence-level BLEU against multiple references
	 * using n-gram clipping, order 4, and the average reference length.
	 *
	 * @param refSents whitespace-tokenized reference sentences
	 * @param hypSent whitespace-tokenized hypothesis sentence
	 * @return the sentence-level BLEU score
	 */
	public static double computeSentenceBleu(String[] refSents, String hypSent) {
		return computeSentenceBleu(refSents, hypSent, true, 4, false);
	}

	//====================multiple references
	/**
	 * Computes smoothed sentence-level BLEU against multiple references.
	 *
	 * @param refSents whitespace-tokenized reference sentences
	 * @param hypSent whitespace-tokenized hypothesis sentence
	 * @param doNgramClip Should usually be true
	 * @param bleuOrder Should usually be 4
	 * @param useShortestRef Probably use false; if true the shortest
	 *        reference length is the effective reference length,
	 *        otherwise the average length is used
	 * @return the sentence-level BLEU score
	 * @throws IllegalArgumentException if <code>refSents</code> is empty
	 */
	public static double computeSentenceBleu(String[] refSents, String hypSent, boolean doNgramClip, int bleuOrder, boolean useShortestRef) {
		//=== ref tbl
		HashMap<String, Integer> maxRefCountTbl = constructMaxRefCountTable(refSents, bleuOrder);

		//== ref len
		int[] refLens = new int[refSents.length];
		for (int i = 0; i < refSents.length; i++) {
			refLens[i] = Regex.spaces.split(refSents[i]).length;
		}
		double effectiveRefLen = computeEffectiveLen(refLens, useShortestRef);

		//=== hyp tbl
		String[] hypWrds = Regex.spaces.split(hypSent);
		HashMap<String, Integer> hypNgramTbl = new HashMap<String, Integer>();
		Ngram.getNgrams(hypNgramTbl, 1, bleuOrder, hypWrds);
		return computeSentenceBleu(effectiveRefLen, maxRefCountTbl, hypWrds.length, hypNgramTbl, doNgramClip, bleuOrder);
	}

	/**
	 * Reduces the per-reference lengths to a single effective length.
	 *
	 * @param refLens length of every reference; must not be empty
	 * @param useShortestRef if true return the minimum length,
	 *        otherwise the arithmetic mean
	 * @return the effective reference length
	 * @throws IllegalArgumentException if <code>refLens</code> is null
	 *         or empty (there is no meaningful length to report)
	 */
	public static double computeEffectiveLen(int[] refLens, boolean useShortestRef) {
		if (refLens == null || refLens.length == 0) {
			throw new IllegalArgumentException("at least one reference length is required");
		}
		if (useShortestRef) {
			int shortest = refLens[0];
			for (int i = 1; i < refLens.length; i++) {
				if (refLens[i] < shortest) {
					shortest = refLens[i];
				}
			}
			return shortest;
		}
		//default is average length
		double total = 0;
		for (int len : refLens) {
			total += len;
		}
		return total / refLens.length;
	}


	/**
	 * construct maxRefCount tbl for multiple references
	 *
	 * @param refSents whitespace-tokenized reference sentences
	 * @param bleuOrder highest n-gram order to collect
	 * @return for every n-gram seen in any reference, its largest
	 *         count within a single reference
	 */
	public static HashMap<String, Integer> constructMaxRefCountTable(String[] refSents, int bleuOrder) {
		return constructMaxRefCountTable(null, refSents, bleuOrder);
	}

	/**
	 * Same as {@link #constructMaxRefCountTable(String[], int)}, but when
	 * <code>symbolTbl</code> is non-null the n-grams are extracted through
	 * it (per the original author's note, the words in the n-grams are
	 * then integer symbol IDs).
	 *
	 * @param symbolTbl symbol table handed to the n-gram extractor, or null
	 * @param refSents whitespace-tokenized reference sentences
	 * @param bleuOrder highest n-gram order to collect
	 * @return the max-ref-count table
	 */
	public static HashMap<String, Integer> constructMaxRefCountTable(SymbolTable symbolTbl, String[] refSents, int bleuOrder) {
		List<HashMap<String, Integer>> listRefNgramTbl = new ArrayList<HashMap<String, Integer>>(refSents.length);
		for (String refSent : refSents) {
			String[] refWords = Regex.spaces.split(refSent);
			HashMap<String, Integer> refNgramTbl = new HashMap<String, Integer>();
			if (symbolTbl != null) {
				Ngram.getNgrams(symbolTbl, refNgramTbl, 1, bleuOrder, refWords);
			} else {
				Ngram.getNgrams(refNgramTbl, 1, bleuOrder, refWords);
			}
			listRefNgramTbl.add(refNgramTbl);
		}
		return computeMaxRefCountTbl(listRefNgramTbl);
	}


	/**
	 * compute max_ref_count for each ngram in the reference sentences
	 *
	 * @param listRefNgramTbl one n-gram count table per reference
	 * @return a table holding, for every n-gram that is a key of any
	 *         input table, the largest count found (never below zero)
	 */
	public static HashMap<String, Integer> computeMaxRefCountTbl(List<HashMap<String, Integer>> listRefNgramTbl) {
		HashMap<String, Integer> merged = new HashMap<String, Integer>();
		// Single pass: keep a running maximum per n-gram instead of first
		// merging the key sets and then re-querying every table per key.
		for (HashMap<String, Integer> tbl : listRefNgramTbl) {
			for (Entry<String, Integer> refEntry : tbl.entrySet()) {
				Integer seen = merged.get(refEntry.getKey());
				int best = (seen == null) ? 0 : seen.intValue();
				Integer count = refEntry.getValue();
				if (count != null && count.intValue() > best) {
					best = count.intValue();
				}
				merged.put(refEntry.getKey(), best);
			}
		}
		return merged;
	}

	/**
	 * Computes smoothed sentence-level BLEU from pre-computed tables.
	 *
	 * @param effectiveRefLen effective reference length
	 * @param maxRefCountTbl reference n-gram counts used for matching
	 *        and, if requested, clipping
	 * @param hypLen number of words in the hypothesis
	 * @param hypNgramTbl hypothesis n-gram counts; keys are n-grams
	 *        whose words are separated by whitespace
	 * @param doNgramClip if true each match count is clipped to
	 *        min{hypNgramCount, maxRefCount}
	 * @param bleuOrder highest n-gram order scored
	 * @return the sentence-level BLEU score
	 * @throws IllegalArgumentException if either length is not positive
	 */
	public static double computeSentenceBleu(double effectiveRefLen, HashMap<String, Integer> maxRefCountTbl, int hypLen,
			HashMap<String, Integer> hypNgramTbl, boolean doNgramClip, int bleuOrder) {

		int[] numNgramMatch = new int[bleuOrder];
		for (Entry<String, Integer> hypEntry : hypNgramTbl.entrySet()) {//each ngram in hyp
			String ngram = hypEntry.getKey();
			if (!maxRefCountTbl.containsKey(ngram)) {
				continue;
			}
			int effectiveNumMatch = hypEntry.getValue();
			if (doNgramClip) {//min{hypNgramCount, maxRefCount}
				effectiveNumMatch = Math.min(effectiveNumMatch, maxRefCountTbl.get(ngram));
			}
			numNgramMatch[ngramOrder(ngram) - 1] += effectiveNumMatch;
		}
		return computeBleu(hypLen, effectiveRefLen, numNgramMatch, bleuOrder);
	}


	//==============================multiple references end


	/**
	 * Computes smoothed sentence-level BLEU against a single reference.
	 *
	 * @param refSent whitespace-tokenized reference sentence
	 * @param hypSent whitespace-tokenized hypothesis sentence
	 * @param doNgramClip whether to clip hypothesis n-gram counts by
	 *        the reference counts
	 * @param bleuOrder highest n-gram order scored
	 * @return the sentence-level BLEU score
	 */
	public static double computeSentenceBleu(String refSent, String hypSent, boolean doNgramClip, int bleuOrder) {
		String[] refWrds = Regex.spaces.split(refSent);
		String[] hypWrds = Regex.spaces.split(hypSent);
		HashMap<String, Integer> refNgramTbl = new HashMap<String, Integer>();
		Ngram.getNgrams(refNgramTbl, 1, bleuOrder, refWrds);
		HashMap<String, Integer> hypNgramTbl = new HashMap<String, Integer>();
		Ngram.getNgrams(hypNgramTbl, 1, bleuOrder, hypWrds);
		return computeSentenceBleu(refWrds.length, refNgramTbl, hypWrds.length, hypNgramTbl, doNgramClip, bleuOrder);
	}

	/**
	 * Single-reference variant working on pre-computed tables. The
	 * computation is the same as the multi-reference one with the
	 * reference table acting as the max-ref-count table, so this simply
	 * delegates to it.
	 *
	 * @param refLen number of words in the reference
	 * @param refNgramTbl reference n-gram counts
	 * @param hypLen number of words in the hypothesis
	 * @param hypNgramTbl hypothesis n-gram counts
	 * @param doNgramClip whether to clip hypothesis n-gram counts by
	 *        the reference counts
	 * @param bleuOrder highest n-gram order scored
	 * @return the sentence-level BLEU score
	 * @throws IllegalArgumentException if either length is not positive
	 */
	public static double computeSentenceBleu(int refLen, HashMap<String, Integer> refNgramTbl, int hypLen, HashMap<String, Integer> hypNgramTbl, boolean doNgramClip, int bleuOrder) {
		// The explicit cast selects the (double, ...) overload.
		return computeSentenceBleu((double) refLen, refNgramTbl, hypLen, hypNgramTbl, doNgramClip, bleuOrder);
	}

	/**
	 * sentence-bleu: BLEU= bp * prec; where prec = exp (sum 1/bleuOrder * log(prec[order]))
	 *
	 * Only orders not exceeding the hypothesis length contribute. An order
	 * with zero matches is smoothed: its numerator is a pseudo-count that
	 * starts at 0.5 and is halved again for every further zero-match order.
	 *
	 * @param hypLen number of words in the hypothesis; must be positive
	 * @param refLen (effective) reference length; must be positive
	 * @param numNgramMatch matched n-gram counts, index t holding the
	 *        matches of order t+1
	 * @param bleuOrder highest n-gram order scored
	 * @return brevity penalty times the geometric-mean precision
	 * @throws IllegalArgumentException if <code>hypLen</code> or
	 *         <code>refLen</code> is not positive (including NaN)
	 */
	public static double computeBleu(int hypLen, double refLen, int[] numNgramMatch, int bleuOrder) {
		// !(refLen > 0) also rejects NaN, which "refLen <= 0" would let through.
		if (hypLen <= 0 || !(refLen > 0)) {
			throw new IllegalArgumentException(
					"ref or hyp is zero len: hypLen=" + hypLen + ", refLen=" + refLen);
		}
		double wt = 1.0 / bleuOrder;
		double prec = 0;
		double smoothFactor = 1.0;
		for (int t = 0; t < bleuOrder && t < hypLen; t++) {
			// a hypothesis of hypLen words contains hypLen-t n-grams of order t+1
			int numHypNgrams = hypLen - t;
			if (numNgramMatch[t] > 0) {
				prec += wt * Math.log(numNgramMatch[t] * 1.0 / numHypNgrams);
			} else {
				smoothFactor *= 0.5;//TODO
				prec += wt * Math.log(smoothFactor / numHypNgrams);
			}
		}
		double bp = (hypLen >= refLen) ? 1.0 : Math.exp(1 - refLen / hypLen);
		return bp * Math.exp(prec);
	}


	/**
	 * Builds the n-gram count table (orders 1..bleuOrder) of a sentence.
	 *
	 * @param sentence whitespace-tokenized sentence
	 * @param bleuOrder highest n-gram order to collect
	 * @return the n-gram count table
	 */
	public static HashMap<String, Integer> constructNgramTable(String sentence, int bleuOrder) {
		HashMap<String, Integer> ngramTable = new HashMap<String, Integer>();
		String[] words = Regex.spaces.split(sentence);
		Ngram.getNgrams(ngramTable, 1, bleuOrder, words);
		return ngramTable;
	}


	//================================ Google linear corpus gain ============================================
	/**
	 * Computes the linear corpus gain of a hypothesis against multiple
	 * references, using n-grams up to order 4.
	 *
	 * @param linearCorpusGainThetas weights; index 0 applies to the
	 *        hypothesis length, index k to matched n-grams of order k
	 * @param refSents whitespace-tokenized reference sentences
	 * @param hypSent whitespace-tokenized hypothesis sentence
	 * @return the linear corpus gain
	 */
	public static double computeLinearCorpusGain(double[] linearCorpusGainThetas, String[] refSents, String hypSent) {
		int bleuOrder = 4;
		int hypLength = Regex.spaces.split(hypSent).length;
		HashMap<String, Integer> referenceNgramTable = BLEU.constructMaxRefCountTable(refSents, bleuOrder);
		HashMap<String, Integer> hypNgramTable = BLEU.constructNgramTable(hypSent, bleuOrder);
		return computeLinearCorpusGain(linearCorpusGainThetas, hypLength, hypNgramTable, referenceNgramTable);
	}

	/**
	 * Computes theta[0]*hypLength plus, for every hypothesis n-gram that
	 * also occurs in the reference table, its hypothesis count times the
	 * theta of its order. Reference counts are not used for clipping.
	 *
	 * speed consideration: assume hypNgramTable has a smaller
	 * size than referenceNgramTable does
	 *
	 * @param linearCorpusGainThetas weights indexed by n-gram order
	 *        (index 0 is the length weight)
	 * @param hypLength number of words in the hypothesis
	 * @param hypNgramTable hypothesis n-gram counts
	 * @param referenceNgramTable reference n-grams (only keys are used)
	 * @return the linear corpus gain
	 */
	public static double computeLinearCorpusGain(double[] linearCorpusGainThetas, int hypLength, Map<String,Integer> hypNgramTable,  Map<String,Integer> referenceNgramTable) {
		double res = linearCorpusGainThetas[0] * hypLength;
		for (Entry<String,Integer> entry : hypNgramTable.entrySet()) {
			String ngram = entry.getKey();
			if (referenceNgramTable.containsKey(ngram)) {//delta function
				res += entry.getValue() * linearCorpusGainThetas[ngramOrder(ngram)];
			}
		}
		return res;
	}

	/**
	 * Computes the unclipped n-gram match statistics of a hypothesis
	 * against multiple references, using n-grams up to order 4.
	 *
	 * @param refSents whitespace-tokenized reference sentences
	 * @param hypSent whitespace-tokenized hypothesis sentence
	 * @return array of length 5: index 0 is the hypothesis length,
	 *         index k the matched count of order-k n-grams
	 */
	public static int[] computeNgramMatches(String[] refSents, String hypSent) {
		int bleuOrder = 4;
		int hypLength = Regex.spaces.split(hypSent).length;
		HashMap<String, Integer> referenceNgramTable = BLEU.constructMaxRefCountTable(refSents, bleuOrder);
		HashMap<String, Integer> hypNgramTable = BLEU.constructNgramTable(hypSent, bleuOrder);
		return computeNgramMatches(hypLength, hypNgramTable, referenceNgramTable, bleuOrder);
	}

	/**
	 * Sums, per n-gram order, the hypothesis counts of the n-grams that
	 * also occur in the reference table.
	 *
	 * @param hypLength number of words in the hypothesis
	 * @param hypNgramTable hypothesis n-gram counts
	 * @param referenceNgramTable reference n-grams (only keys are used)
	 * @param highestOrder highest n-gram order present in the tables
	 * @return array of length highestOrder+1: index 0 is
	 *         <code>hypLength</code>, index k the matches of order k
	 */
	public static int[] computeNgramMatches(int hypLength, Map<String,Integer> hypNgramTable,  Map<String,Integer> referenceNgramTable, int highestOrder) {
		int[] res = new int[highestOrder + 1];
		res[0] = hypLength;
		for (Entry<String,Integer> entry : hypNgramTable.entrySet()) {
			String ngram = entry.getKey();
			if (referenceNgramTable.containsKey(ngram)) {//delta function
				res[ngramOrder(ngram)] += entry.getValue();
			}
		}
		return res;
	}

	/**
	 * Derives the five linear-corpus-gain weights and prints them to
	 * standard output. Before normalization theta[0] is
	 * -1/numUnigramTokens and theta[i] (i = 1..4) is
	 * 1/(4*numUnigramTokens*unigramPrecision*decayRatio^(i-1)); all
	 * weights are then divided by |theta[0]|.
	 *
	 * @param numUnigramTokens number of unigram tokens
	 * @param unigramPrecision unigram precision
	 * @param decayRatio ratio applied per additional n-gram order
	 * @return the normalized weights, index 0 being the length weight
	 */
	public static double[] computeLinearCorpusThetas(int numUnigramTokens, double unigramPrecision, double decayRatio) {
		double[] res = new double[5];
		res[0] = -1.0 / numUnigramTokens;
		for (int i = 1; i < 5; i++) {
			res[i] = 1.0 / (4.0 * numUnigramTokens * unigramPrecision * Math.pow(decayRatio, i - 1));
		}

		double firstWeight = res[0];
		for (int i = 0; i < 5; i++) {
			res[i] /= Math.abs(firstWeight);//normalize by first one
		}

		System.out.print("Normalized Thetas are: ");
		for (int i = 0; i < 5; i++) {
			System.out.print(res[i] + " ");
		}
		System.out.print("\n");

		return res;
	}

	/** Returns the order of an n-gram key, i.e. its number of whitespace-separated words. */
	private static int ngramOrder(String ngram) {
		return Regex.spaces.split(ngram).length;
	}

}