package joshua.discriminative.bleu_approximater;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;

import joshua.decoder.BLEU;
import joshua.util.FileUtility;
import joshua.util.Regex;
import joshua.util.io.LineReader;
import joshua.util.io.Reader;

/**
 * Rewrites an n-best file so that each hypothesis line carries its n-gram
 * match counts against the references.
 *
 * <p>Each input line is split on the three-bar separator into four fields:
 * sentence id, translation, scores, gain. For every hypothesis the linear
 * corpus gain is recomputed from the references and compared with the stored
 * gain (fourth field); a mismatch larger than 1e-3 aborts the program. The
 * output line keeps the id, translation and gain, and replaces the scores
 * field with the n-gram match counts.
 */
public class LinearCorpusGainRecover {

	//===TODO
	/*
	private static double unigramPrecision = 0.85;
	private static double precisionDecayRatio = 0.7;
	private static int numUnigramTokens = 10;
	private static double[] linearCorpusGainThetas = BLEU.computeLinearCorpusThetas(
			numUnigramTokens, unigramPrecision, precisionDecayRatio);
	*/

	/** Weights handed to {@code BLEU.computeLinearCorpusGain}. */
	private double[] linearCorpusGainThetas;

	Logger logger = Logger.getLogger( LinearCorpusGainRecover.class.getSimpleName() );

	/**
	 * Uses the given weights directly; the array is stored, not copied.
	 *
	 * @param linearCorpusGainThetas weights for the linear corpus gain
	 */
	public LinearCorpusGainRecover(double[] linearCorpusGainThetas) {
		this.linearCorpusGainThetas = linearCorpusGainThetas;
		logWeights();
	}

	/**
	 * Copies the given weights into a primitive array.
	 *
	 * @param linearCorpusGainThetas weights for the linear corpus gain
	 */
	public LinearCorpusGainRecover(List<Double> linearCorpusGainThetas) {
		this.linearCorpusGainThetas = new double[linearCorpusGainThetas.size()];
		for (int i = 0; i < linearCorpusGainThetas.size(); i++) {
			this.linearCorpusGainThetas[i] = linearCorpusGainThetas.get(i);
		}
		logWeights();
	}

	/**
	 * Uses weights computed by {@code BLEU.computeLinearCorpusThetas(10, 0.7, 0.85)}.
	 */
	public LinearCorpusGainRecover() {
		// NOTE(review): the TODO block above passes (numUnigramTokens,
		// unigramPrecision=0.85, precisionDecayRatio=0.7); here the last two
		// values appear in the opposite order. Confirm against the signature
		// of BLEU.computeLinearCorpusThetas before changing either one.
		this.linearCorpusGainThetas = BLEU.computeLinearCorpusThetas(10, 0.7, 0.85);
		logWeights();
	}

	/**
	 * Logs the weight values. Arrays.toString is required: concatenating the
	 * array itself would only print its identity string (e.g. "[D@1b6d3586").
	 */
	private void logWeights() {
		logger.info("google weights are: " + Arrays.toString(this.linearCorpusGainThetas));
	}

	/**
	 * Processes every n-best list in the input file and writes the rewritten
	 * hypotheses, one per line, to the output file.
	 *
	 * <p>For each n-best list one line is read from every reference file, so
	 * the reference files must contain at least as many lines as there are
	 * n-best lists.
	 *
	 * @param inputNbestFile  path of the n-best file to read
	 * @param outputNbestFile path of the n-best file to write
	 * @param refFiles        paths of the reference files
	 * @throws IOException if a file cannot be read or written, or if a
	 *         reference file runs out of lines before the n-best input does
	 */
	public void processWholeSet(String inputNbestFile, String outputNbestFile, String[] refFiles) throws IOException {

		Reader<String>[] referenceReaders = new LineReader[refFiles.length];
		BufferedWriter outWriter = null;

		try {
			for (int k = 0; k < refFiles.length; k++) {
				referenceReaders[k] = new LineReader(refFiles[k]);
			}

			// NOTE(review): nbestReader is never closed (the original did not
			// close it either); whether NbestReader offers a close method is
			// not visible from this file.
			NbestReader nbestReader = new NbestReader(inputNbestFile);
			outWriter = FileUtility.getWriteFileStream(outputNbestFile);

			while (nbestReader.hasNext()) {
				List<String> nbest = nbestReader.next();

				String[] referenceSentences = new String[referenceReaders.length];
				for (int i = 0; i < referenceReaders.length; i++) {
					String reference = referenceReaders[i].readLine();
					// Assumes readLine signals end of input with null; fail with a
					// clear message instead of passing null on to BLEU.
					if (reference == null) {
						throw new IOException("reference file " + refFiles[i]
								+ " has fewer lines than there are n-best lists in " + inputNbestFile);
					}
					referenceSentences[i] = reference;
				}

				List<String> newNbest = processOneSentence(nbest, referenceSentences);
				for (String hyp : newNbest) {
					outWriter.write(hyp);
					outWriter.write("\n");
				}
			}
		} finally {
			// Close everything even when processing failed part-way through.
			try {
				closeReadersFrom(referenceReaders, 0);
			} finally {
				if (outWriter != null) {
					outWriter.close();
				}
			}
		}
	}

	/**
	 * Closes readers[index..] in order. The nested finally guarantees that a
	 * failure while closing one reader does not prevent closing the rest.
	 */
	private static void closeReadersFrom(Reader<String>[] readers, int index) throws IOException {
		if (index >= readers.length) {
			return;
		}
		try {
			// Entries are null if opening an earlier file failed.
			if (readers[index] != null) {
				readers[index].close();
			}
		} finally {
			closeReadersFrom(readers, index + 1);
		}
	}

	/**
	 * Rewrites the hypotheses of one source sentence.
	 *
	 * @param nbest      n-best lines, each "id ||| translation ||| scores ||| gain"
	 * @param references reference translations of the sentence
	 * @return one rewritten line per input line:
	 *         "id ||| translation ||| n-gram match counts ||| gain"
	 * @throws IllegalArgumentException if a line has fewer than four fields
	 */
	private List<String> processOneSentence(List<String> nbest, String[] references) {
		logger.info("process a sentence");

		List<String> newNbest = new ArrayList<String>(nbest.size());
		for (String line : nbest) {
			String[] fds = Regex.threeBarsWithSpace.split(line);
			if (fds.length < 4) {
				throw new IllegalArgumentException("malformed n-best line, expected 4 fields: " + line);
			}

			int[] ngramMatches = BLEU.computeNgramMatches(references, fds[1]);

			// Sanity check: the gain recomputed with our weights must agree
			// with the gain stored in the n-best line.
			double oldGain = Double.parseDouble(fds[3]);
			double gain = BLEU.computeLinearCorpusGain(linearCorpusGainThetas, references, fds[1]);
			if (Math.abs(gain - oldGain) > 1e-3) {
				logger.severe("unequal bleu");
				// Non-zero status: this is a failure, not a normal exit.
				System.exit(1);
			}

			StringBuilder newLine = new StringBuilder();
			newLine.append(fds[0]);//sent id
			newLine.append(" ||| ");
			newLine.append(fds[1]);//translation itself
			newLine.append(" ||| ");

			//== scores
			// The counts are ints but are deliberately widened to double so
			// they keep being written as "3.0" rather than "3".
			for (double score : ngramMatches) {
				newLine.append(score);
				newLine.append(" ");
			}

			newLine.append("||| ");
			newLine.append(fds[3]);//gain

			newNbest.add(newLine.toString());
		}
		return newNbest;
	}

	/**
	 * Command-line entry point.
	 *
	 * @param args input n-best file, output n-best file, then one or more
	 *             reference files
	 * @throws IOException if reading or writing any of the files fails
	 */
	public static void main(String[] args) throws IOException {

		if (args.length < 3) {
			System.out.println("args are:");
			for (String arg : args) {
				System.out.println(arg);
			}
			System.out.println("must specific at least one reference");
			System.exit(1);
		}

		String inputNbestFile = args[0].trim();
		String outputNbestFile = args[1].trim();

		// At least one reference is guaranteed by the argument check above.
		String[] refFiles = new String[args.length - 2];
		for (int i = 2; i < args.length; i++) {
			refFiles[i - 2] = args[i].trim();
			System.out.println("Use ref file " + refFiles[i - 2]);
		}

		LinearCorpusGainRecover recover = new LinearCorpusGainRecover();
		recover.processWholeSet(inputNbestFile, outputNbestFile, refFiles);
	}

}