/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.decoder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import joshua.corpus.vocab.SymbolTable;
import joshua.util.Ngram;
import joshua.util.Regex;

/**
 * Static helpers for sentence-level BLEU (with smoothing), plus the
 * n-gram statistics used by the linear corpus gain.
 *
 * Sentences are tokenized with {@code Regex.spaces}; n-gram tables map
 * an n-gram string to its count in a sentence.
 *
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @version $LastChangedDate: 2010-02-02 17:15:12 -0600 (Tue, 02 Feb 2010) $
 */
public class BLEU {

	/** Highest n-gram order used by the convenience entry points and the linear corpus gain. */
	private static final int DEFAULT_BLEU_ORDER = 4;

	/**
	 * Sentence BLEU against multiple references using the default settings:
	 * n-gram clipping on, order 4, average reference length.
	 *
	 * @param refSents reference sentences
	 * @param hypSent  hypothesis sentence
	 * @return the smoothed sentence-level BLEU score
	 */
	public static double computeSentenceBleu(String[] refSents, String hypSent) {
		return computeSentenceBleu(refSents, hypSent, true, DEFAULT_BLEU_ORDER, false);
	}

	//==================== multiple references

	/**
	 * Sentence BLEU of a hypothesis against multiple references.
	 *
	 * @param refSents reference sentences
	 * @param hypSent hypothesis sentence
	 * @param doNgramClip Should usually be true
	 * @param bleuOrder Should usually be 4
	 * @param useShortestRef Probably use false
	 * @return the smoothed sentence-level BLEU score
	 */
	public static double computeSentenceBleu(String[] refSents, String hypSent,
			boolean doNgramClip, int bleuOrder, boolean useShortestRef) {
		//=== reference side: clipped counts and effective length
		HashMap<String, Integer> maxRefCountTbl = constructMaxRefCountTable(refSents, bleuOrder);

		int[] refLens = new int[refSents.length];
		for (int i = 0; i < refSents.length; i++) {
			refLens[i] = Regex.spaces.split(refSents[i]).length;
		}
		double effectiveRefLen = computeEffectiveLen(refLens, useShortestRef);

		//=== hypothesis side
		String[] hypWrds = Regex.spaces.split(hypSent);
		HashMap<String, Integer> hypNgramTbl = new HashMap<String, Integer>();
		Ngram.getNgrams(hypNgramTbl, 1, bleuOrder, hypWrds);

		return computeSentenceBleu(effectiveRefLen, maxRefCountTbl, hypWrds.length,
				hypNgramTbl, doNgramClip, bleuOrder);
	}

	/**
	 * Collapses several reference lengths into one effective length.
	 *
	 * Note that an empty array is not rejected: it yields
	 * {@code Integer.MAX_VALUE} in shortest mode and {@code NaN} in
	 * average mode.
	 *
	 * @param refLens        lengths of the reference sentences
	 * @param useShortestRef if true return the minimum length,
	 *                       otherwise (the default) the average length
	 * @return the effective reference length
	 */
	public static double computeEffectiveLen(int[] refLens, boolean useShortestRef) {
		if (useShortestRef) {
			int shortest = Integer.MAX_VALUE;
			for (int len : refLens) {
				if (len < shortest) {
					shortest = len;
				}
			}
			return shortest;
		}

		// default is average length
		double total = 0;
		for (int len : refLens) {
			total += len;
		}
		return total / refLens.length;
	}

	/**
	 * Construct the maxRefCount table for multiple references, with
	 * n-grams built directly from the words (no symbol table).
	 *
	 * @param refSents  reference sentences
	 * @param bleuOrder highest n-gram order to collect
	 * @return n-gram to maximum count over all references
	 */
	public static HashMap<String, Integer> constructMaxRefCountTable(String[] refSents, int bleuOrder) {
		return constructMaxRefCountTable(null, refSents, bleuOrder);
	}

	/**
	 * Construct the maxRefCount table for multiple references. When a
	 * symbol table is supplied, words in the n-grams use integer symbol IDs.
	 *
	 * @param symbolTbl symbol table handed to {@code Ngram.getNgrams}, or
	 *                  null to build the n-grams without one
	 * @param refSents  reference sentences
	 * @param bleuOrder highest n-gram order to collect
	 * @return n-gram to maximum count over all references
	 */
	public static HashMap<String, Integer> constructMaxRefCountTable(SymbolTable symbolTbl,
			String[] refSents, int bleuOrder) {
		List<HashMap<String, Integer>> listRefNgramTbl = new ArrayList<HashMap<String, Integer>>();

		for (String refSent : refSents) {
			String[] refWords = Regex.spaces.split(refSent);
			HashMap<String, Integer> refNgramTbl = new HashMap<String, Integer>();
			if (symbolTbl != null) {
				Ngram.getNgrams(symbolTbl, refNgramTbl, 1, bleuOrder, refWords);
			} else {
				Ngram.getNgrams(refNgramTbl, 1, bleuOrder, refWords);
			}
			listRefNgramTbl.add(refNgramTbl);
		}

		return computeMaxRefCountTbl(listRefNgramTbl);
	}

	/**
	 * Compute max_ref_count for each n-gram in the reference sentences:
	 * every n-gram seen in any table is mapped to the largest count it
	 * has in a single table. Null or non-positive counts contribute 0.
	 *
	 * @param listRefNgramTbl one n-gram count table per reference
	 * @return a new table holding the per-n-gram maximum
	 */
	public static HashMap<String, Integer> computeMaxRefCountTbl(List<HashMap<String, Integer>> listRefNgramTbl) {
		HashMap<String, Integer> merged = new HashMap<String, Integer>();

		// Single pass with a running maximum instead of probing every
		// table once per distinct n-gram.
		for (HashMap<String, Integer> tbl : listRefNgramTbl) {
			for (Entry<String, Integer> entry : tbl.entrySet()) {
				Integer previous = merged.get(entry.getKey());
				int best = (previous == null) ? 0 : previous;

				Integer count = entry.getValue();
				if (count != null && count > best) {
					best = count;
				}
				merged.put(entry.getKey(), best);
			}
		}
		return merged;
	}

	/**
	 * Sentence BLEU from precomputed statistics.
	 *
	 * @param effectiveRefLen effective reference length
	 * @param maxRefCountTbl  reference n-gram to (maximum) count
	 * @param hypLen          number of words in the hypothesis
	 * @param hypNgramTbl     hypothesis n-gram to count
	 * @param doNgramClip     if true a hypothesis n-gram is credited at most
	 *                        as often as it occurs in the reference table
	 * @param bleuOrder       highest n-gram order scored
	 * @return the smoothed sentence-level BLEU score
	 * @throws IllegalArgumentException if either length is not positive
	 */
	public static double computeSentenceBleu(double effectiveRefLen, HashMap<String, Integer> maxRefCountTbl,
			int hypLen, HashMap<String, Integer> hypNgramTbl, boolean doNgramClip, int bleuOrder) {
		int[] numNgramMatch = new int[bleuOrder];

		for (Entry<String, Integer> hypEntry : hypNgramTbl.entrySet()) { // each n-gram in hyp
			String ngram = hypEntry.getKey();
			if (!maxRefCountTbl.containsKey(ngram)) {
				continue;
			}

			int matches = hypEntry.getValue();
			if (doNgramClip) { // min{hypNgramCount, maxRefCount}
				matches = Math.min(matches, maxRefCountTbl.get(ngram));
			}
			numNgramMatch[ngramOrder(ngram) - 1] += matches;
		}

		return computeBleu(hypLen, effectiveRefLen, numNgramMatch, bleuOrder);
	}

	//============================== multiple references end

	/**
	 * Sentence BLEU of a hypothesis against a single reference.
	 *
	 * @param refSent     reference sentence
	 * @param hypSent     hypothesis sentence
	 * @param doNgramClip whether to clip hypothesis n-gram counts by the reference counts
	 * @param bleuOrder   highest n-gram order scored
	 * @return the smoothed sentence-level BLEU score
	 */
	public static double computeSentenceBleu(String refSent, String hypSent, boolean doNgramClip, int bleuOrder) {
		String[] refWrds = Regex.spaces.split(refSent);
		String[] hypWrds = Regex.spaces.split(hypSent);

		HashMap<String, Integer> refNgramTbl = new HashMap<String, Integer>();
		Ngram.getNgrams(refNgramTbl, 1, bleuOrder, refWrds);

		HashMap<String, Integer> hypNgramTbl = new HashMap<String, Integer>();
		Ngram.getNgrams(hypNgramTbl, 1, bleuOrder, hypWrds);

		return computeSentenceBleu(refWrds.length, refNgramTbl, hypWrds.length,
				hypNgramTbl, doNgramClip, bleuOrder);
	}

	/**
	 * Sentence BLEU from precomputed statistics with an integer reference
	 * length. Performs the same computation as the overload taking a
	 * {@code double} effective reference length and delegates to it.
	 *
	 * @param refLen       number of words in the reference
	 * @param refNgramTbl  reference n-gram to count
	 * @param hypLen       number of words in the hypothesis
	 * @param hypNgramTbl  hypothesis n-gram to count
	 * @param doNgramClip  whether to clip hypothesis n-gram counts by the reference counts
	 * @param bleuOrder    highest n-gram order scored
	 * @return the smoothed sentence-level BLEU score
	 * @throws IllegalArgumentException if either length is not positive
	 */
	public static double computeSentenceBleu(int refLen, HashMap<String, Integer> refNgramTbl,
			int hypLen, HashMap<String, Integer> hypNgramTbl, boolean doNgramClip, int bleuOrder) {
		// The explicit cast selects the double overload; without it this call would recurse.
		return computeSentenceBleu((double) refLen, refNgramTbl, hypLen, hypNgramTbl, doNgramClip, bleuOrder);
	}

	/**
	 * sentence-bleu: BLEU = bp * prec; where
	 * prec = exp(sum over orders of 1/bleuOrder * log(prec[order])).
	 *
	 * An order with no match is smoothed: its numerator is replaced by a
	 * factor that halves each time a zero-match order is encountered.
	 * Orders at or beyond {@code hypLen} are skipped.
	 *
	 * @param hypLen        number of words in the hypothesis
	 * @param refLen        (effective) reference length
	 * @param numNgramMatch matched n-gram counts, index {@code t} holding order {@code t+1}
	 * @param bleuOrder     highest n-gram order scored
	 * @return brevity penalty times the smoothed geometric-mean precision
	 * @throws IllegalArgumentException if {@code hypLen} or {@code refLen} is not positive
	 */
	public static double computeBleu(int hypLen, double refLen, int[] numNgramMatch, int bleuOrder) {
		if (hypLen <= 0 || refLen <= 0) {
			// Library code must not terminate the JVM; report the bad input to the caller instead.
			throw new IllegalArgumentException("error: ref or hyp is zero len"
					+ " (hypLen=" + hypLen + ", refLen=" + refLen + ")");
		}

		double wt = 1.0 / bleuOrder;
		double prec = 0;
		double smoothFactor = 1.0;

		for (int t = 0; t < bleuOrder && t < hypLen; t++) {
			int numNgrams = hypLen - t; // number of (t+1)-grams in the hypothesis
			if (numNgramMatch[t] > 0) {
				prec += wt * Math.log(numNgramMatch[t] * 1.0 / numNgrams);
			} else {
				smoothFactor *= 0.5; //TODO
				prec += wt * Math.log(smoothFactor / numNgrams);
			}
		}

		double bp = (hypLen >= refLen) ? 1.0 : Math.exp(1 - refLen / hypLen);
		return bp * Math.exp(prec);
	}

	/**
	 * Builds the n-gram count table of a single sentence.
	 *
	 * @param sentence  sentence to tokenize with {@code Regex.spaces}
	 * @param bleuOrder highest n-gram order to collect
	 * @return table filled by {@code Ngram.getNgrams} for orders 1..bleuOrder
	 */
	public static HashMap<String, Integer> constructNgramTable(String sentence, int bleuOrder) {
		HashMap<String, Integer> ngramTable = new HashMap<String, Integer>();
		String[] words = Regex.spaces.split(sentence);
		Ngram.getNgrams(ngramTable, 1, bleuOrder, words);
		return ngramTable;
	}

	//================================ Google linear corpus gain ============================================

	/**
	 * Linear corpus gain of a hypothesis against multiple references,
	 * using n-grams up to order 4.
	 *
	 * @param linearCorpusGainThetas weights; index 0 applies to the hypothesis
	 *                               length, index n to matched n-grams of order n
	 * @param refSents               reference sentences
	 * @param hypSent                hypothesis sentence
	 * @return the linear corpus gain
	 */
	public static double computeLinearCorpusGain(double[] linearCorpusGainThetas, String[] refSents, String hypSent) {
		int bleuOrder = DEFAULT_BLEU_ORDER;
		int hypLength = Regex.spaces.split(hypSent).length;
		HashMap<String, Integer> referenceNgramTable = BLEU.constructMaxRefCountTable(refSents, bleuOrder);
		HashMap<String, Integer> hypNgramTable = BLEU.constructNgramTable(hypSent, bleuOrder);
		return computeLinearCorpusGain(linearCorpusGainThetas, hypLength, hypNgramTable, referenceNgramTable);
	}

	/**
	 * Linear corpus gain from precomputed tables: theta[0] * hypLength plus,
	 * for every hypothesis n-gram that also occurs in the reference table,
	 * its hypothesis count times theta[order]. Counts are not clipped.
	 *
	 * speed consideration: assume hypNgramTable has a smaller
	 * size than referenceNgramTable does
	 *
	 * @param linearCorpusGainThetas weights indexed by n-gram order (0 = length)
	 * @param hypLength              number of words in the hypothesis
	 * @param hypNgramTable          hypothesis n-gram to count
	 * @param referenceNgramTable    reference n-grams (only key presence is used)
	 * @return the linear corpus gain
	 */
	public static double computeLinearCorpusGain(double[] linearCorpusGainThetas, int hypLength,
			Map<String,Integer> hypNgramTable, Map<String,Integer> referenceNgramTable) {
		double res = 0;
		res += linearCorpusGainThetas[0] * hypLength;

		for (Entry<String,Integer> entry : hypNgramTable.entrySet()) {
			String ngram = entry.getKey();
			if (referenceNgramTable.containsKey(ngram)) { // delta function
				res += entry.getValue() * linearCorpusGainThetas[ngramOrder(ngram)];
			}
		}
		return res;
	}

	/**
	 * Per-order n-gram match counts of a hypothesis against multiple
	 * references, using n-grams up to order 4.
	 *
	 * @param refSents reference sentences
	 * @param hypSent  hypothesis sentence
	 * @return array whose index 0 is the hypothesis length and index n the
	 *         number of matched n-grams of order n
	 */
	public static int[] computeNgramMatches(String[] refSents, String hypSent) {
		int bleuOrder = DEFAULT_BLEU_ORDER;
		int hypLength = Regex.spaces.split(hypSent).length;
		HashMap<String, Integer> referenceNgramTable = BLEU.constructMaxRefCountTable(refSents, bleuOrder);
		HashMap<String, Integer> hypNgramTable = BLEU.constructNgramTable(hypSent, bleuOrder);
		return computeNgramMatches(hypLength, hypNgramTable, referenceNgramTable, bleuOrder);
	}

	/**
	 * Per-order n-gram match counts from precomputed tables. A hypothesis
	 * n-gram counts as matched when the reference table contains it; its
	 * full hypothesis count is added (no clipping).
	 *
	 * @param hypLength           number of words in the hypothesis
	 * @param hypNgramTable       hypothesis n-gram to count
	 * @param referenceNgramTable reference n-grams (only key presence is used)
	 * @param highestOrder        highest n-gram order present in the tables
	 * @return array of size highestOrder+1; index 0 is hypLength and index n
	 *         the number of matched n-grams of order n
	 */
	public static int[] computeNgramMatches(int hypLength, Map<String,Integer> hypNgramTable,
			Map<String,Integer> referenceNgramTable, int highestOrder) {
		int[] res = new int[highestOrder + 1];
		res[0] = hypLength;

		for (Entry<String,Integer> entry : hypNgramTable.entrySet()) {
			String ngram = entry.getKey();
			if (referenceNgramTable.containsKey(ngram)) { // delta function
				res[ngramOrder(ngram)] += entry.getValue();
			}
		}
		return res;
	}

	/**
	 * Computes the five linear corpus gain weights (length weight plus
	 * orders 1..4), normalizes them by the absolute value of the length
	 * weight, and prints the normalized values to standard output.
	 *
	 * @param numUnigramTokens number of unigram tokens
	 * @param unigramPrecision unigram precision
	 * @param decayRatio       ratio applied once per additional n-gram order
	 * @return normalized weights; index 0 is the length weight (-1 after
	 *         normalization), index n the weight for n-grams of order n
	 */
	static public double[] computeLinearCorpusThetas(int numUnigramTokens, double unigramPrecision, double decayRatio) {
		double[] res = new double[DEFAULT_BLEU_ORDER + 1];

		res[0] = -1.0 / numUnigramTokens;
		for (int i = 1; i < res.length; i++) {
			res[i] = 1.0 / (((double) DEFAULT_BLEU_ORDER) * numUnigramTokens * unigramPrecision
					* Math.pow(decayRatio, i - 1));
		}

		// normalize by first one
		double normalizer = Math.abs(res[0]);
		for (int i = 0; i < res.length; i++) {
			res[i] /= normalizer;
		}

		StringBuilder report = new StringBuilder("Normalized Thetas are: ");
		for (int i = 0; i < res.length; i++) {
			report.append(res[i]).append(" ");
		}
		report.append("\n");
		System.out.print(report.toString());

		return res;
	}

	/** Order of an n-gram key: the number of tokens {@code Regex.spaces} splits it into. */
	private static int ngramOrder(String ngram) {
		return Regex.spaces.split(ngram).length;
	}
}