package joshua.discriminative.training; import java.io.BufferedWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import joshua.discriminative.bleu_approximater.NbestReader; import joshua.util.FileUtility; import joshua.util.Regex; public class NbestMerger { public static int mergeNbest(String nbestFile1, String nbestFile2, String nbestOutFile){ int totalNumHyp = 0; try { NbestReader nbestReader1 = new NbestReader(nbestFile1); NbestReader nbestReader2 = new NbestReader(nbestFile2); BufferedWriter outWriter = FileUtility.getWriteFileStream(nbestOutFile); while(nbestReader1.hasNext()){ List<String> nbest1 = nbestReader1.next(); List<String> nbest2 = nbestReader2.next(); List<String> newNbest = processOneSentence(nbest1, nbest2); for(String hyp : newNbest){ outWriter.write(hyp+"\n"); } totalNumHyp += newNbest.size(); } outWriter.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("totalNumHyp="+totalNumHyp); return totalNumHyp; } private static List<String> processOneSentence(List<String> nbest1, List<String> nbest2){ List<String> newNbest = new ArrayList<String>(); Set<String> uniqueNbests = new HashSet<String>(); processOneNbest(nbest1, uniqueNbests, newNbest); processOneNbest(nbest2, uniqueNbests, newNbest); return newNbest; } private static void processOneNbest(List<String> nbest, Set<String> uniqueNbests, List<String> newNbest){ for(String line : nbest){ String[] fds = Regex.threeBarsWithSpace.split(line); String hypItself = fds[1]; if(uniqueNbests.contains(hypItself)){ //skip }else{ uniqueNbests.add(hypItself); newNbest.add(line); } } } }