// JoshuaEval.java example (relax-decode-master, third-party)
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */

package joshua.util;

import joshua.zmert.EvaluationMetric;

import java.util.TreeSet;
import java.text.DecimalFormat;

// BUG: try using joshua.util.io.LineReader instead
import java.io.IOException;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;

public class JoshuaEval {
	// formatter producing four decimal places (not referenced by the code visible in this class)
	final static DecimalFormat f4 = new DecimalFormat("###0.0000");
	
	// if true, evaluation is performed for each candidate translation as
	// well as on the entire candidate set (set via "-v 1")
	static boolean verbose;
	
	// number of sentences being evaluated; computed in processArgsAndInitialize
	// as (line count of the reference file) / refsPerSen
	static int numSentences;
	
	// number of reference translations per sentence (set via "-rps", default 1)
	static int refsPerSen;
	
	// text normalization method applied by normalize() (set via "-txtNrm", default 1):
	// 0: no normalization, 1: "NIST-style" tokenization, and also rejoin 'm, 're, *'s, 've, 'll, 'd, and n't,
	// 2: apply 1 and also rejoin dashes between letters, 3: apply 1 and also drop non-ASCII characters
	// 4: apply 1+2+3
	static private int textNormMethod;
	
	// refSentences[i][r] is the (normalized) rth reference translation of the ith sentence
	static String[][] refSentences;
	
	// name of evaluation metric (set via "-m", default "BLEU")
	static String metricName;
	
	// options for the evaluation metric (e.g. for BLEU, maxGramLength and effLengthMethod);
	// defaults to {"4", "closest"}
	static String[] metricOptions;
	
	// the scorer, obtained from EvaluationMetric.getMetric(metricName, metricOptions)
	static EvaluationMetric evalMetric;
	
	// if true, the reference set(s) is (are) evaluated as well (set via "-evr 1")
	static boolean evaluateRefs;
	
	// file names for input files.  When refsPerSen > 1, refFileName can be
	// the name of a single file, or a file name prefix (in which case it is
	// replaced by the name of the unified file built by createUnifiedRefFile).
	static String refFileName;
	static String candFileName;
	
	// format of the candidate file: "plain" if one candidate per sentence, and "nbest" if a decoder output
	static String candFileFormat;
	
	// if format is nbest, evaluate the r'th candidate (1-indexed) of each sentence
	static int candRank;

	
	/**
	 * Evaluates a plain-format candidate file, i.e. a file holding exactly
	 * one candidate translation per sentence.
	 *
	 * @param inFileName name of the candidate file to evaluate; previously
	 *                   this argument was ignored and the global
	 *                   {@code candFileName} was always read instead
	 */
	private static void evaluateCands_plain(String inFileName) {
		// one candidate per sentence, and that single candidate is the one tested
		evaluate(inFileName, "plain", 1, 1);
	}
	
	
	/**
	 * Evaluates an n-best (decoder output) candidate file.
	 *
	 * @param inFileName name of the n-best file to evaluate; previously this
	 *                   argument was ignored and the global
	 *                   {@code candFileName} was always read instead
	 * @param testIndex  1-indexed rank of the candidate to evaluate for each
	 *                   sentence (main passes {@code candRank})
	 */
	private static void evaluateCands_nbest(String inFileName, int testIndex) {
		// candPerSen is passed as -1 because it is ignored for the nbest format
		evaluate(inFileName, "nbest", -1, testIndex);
	}
	
	
	/**
	 * Scores the r'th reference translation of every sentence as if it were
	 * a candidate, reading it from the reference file.
	 *
	 * @param r 1-indexed position of the reference within each sentence's
	 *          group of {@code refsPerSen} lines
	 */
	private static void evaluateRefSet(int r) {
		// the reference file is a plain file with refsPerSen entries per sentence
		final String refFormat = "plain";
		final int entriesPerSentence = refsPerSen;
		evaluate(refFileName, refFormat, entriesPerSentence, r);
	}
	
	
	/**
	 * Reads one candidate translation per sentence from a file, scores the
	 * resulting set with {@code evalMetric}, and prints the detailed score.
	 * When {@code verbose} is set, the detailed score of every individual
	 * sentence is printed as well.
	 *
	 * Terminates the JVM (via System.exit) on invalid arguments, on I/O
	 * errors, and when the file does not hold enough candidates.
	 *
	 * @param inFileName   file to read the candidates from
	 * @param inFileFormat "plain" for a fixed number of candidates per
	 *                     sentence; any other value is treated as "nbest"
	 * @param candPerSen   how many candidates are provided per sentence
	 *                     (ignored for the nbest format, since it is variable)
	 * @param testIndex    which of the candidates (for each sentence) should
	 *                     be tested, 1-indexed; e.g. testIndex=1 means the
	 *                     first candidate, testIndex=candPerSen the last one
	 */
	private static void evaluate(String inFileName, String inFileFormat, int candPerSen, int testIndex) {
		
		if (inFileFormat.equals("plain") && candPerSen < 1) {
			println("candPerSen must be positive for a file in plain format.");
			System.exit(30);
		}
		
		if (inFileFormat.equals("plain") && (testIndex < 1 || testIndex > candPerSen)) {
			println("For the plain format, testIndex must be in [1,candPerSen]");
			System.exit(31);
		}
		
		String[] topCand_str = new String[numSentences];
		
		// BUG: all of this needs to be replaced with the SegmentFileParser and related interfaces.
		try {
			
			InputStream inStream = new FileInputStream(new File(inFileName));
			
			try {
				BufferedReader inFile = new BufferedReader(new InputStreamReader(inStream, "utf8"));
				
				if (inFileFormat.equals("plain")) {
					
					for (int i = 0; i < numSentences; ++i) {
						
						// skip candidates 1 through testIndex-1
						for (int n = 1; n < testIndex; ++n) {
							inFile.readLine();
						}
						
						// read testIndex'th candidate
						String candidate_str = inFile.readLine();
						if (candidate_str == null) {
							// the file is shorter than numSentences*candPerSen lines
							println("Not enough candidates in " + inFileName + " to extract the " + testIndex + "'th candidate for each sentence.");
							println("(Failed to extract one for the " + i + "'th sentence (0-indexed).)");
							System.exit(32);
						}
						
						topCand_str[i] = normalize(candidate_str, textNormMethod);
						
						// skip candidates testIndex+1 through candPerSen
						// (this probably only applies when evaluating a combined reference file)
						for (int n = testIndex+1; n <= candPerSen; ++n) {
							inFile.readLine();
						}
						
					} // for (i)
					
				} else { // nbest format
					
					// line format:
					//   i ||| words of candidate translation . ||| feat-1_val feat-2_val ... feat-numParams_val .*
					// where i is the 0-indexed sentence number
					
					int i = 0;
					String line = inFile.readLine();
					
					while (line != null && i < numSentences) {
						
						// line holds the first candidate of sentence i; advance to the testIndex'th one
						for (int n = 1; n < testIndex && line != null; ++n) {
							line = inFile.readLine();
						}
						
						// either the file ended, or sentence i has fewer than testIndex candidates
						// (in which case line already belongs to a different sentence)
						if (line == null || nbestSentenceIndex(line) != i) {
							println("Not enough candidates in " + inFileName + " to extract the " + testIndex + "'th candidate for each sentence.");
							println("(Failed to extract one for the " + i + "'th sentence (0-indexed).)");
							System.exit(32);
						}
						
						String rest = line.substring(line.indexOf("||| ")+4); // get rid of initial text
						String candidate_str = rest.substring(0,rest.indexOf(" |||"));
						topCand_str[i] = normalize(candidate_str, textNormMethod);
						
						if (i < numSentences-1) {
							// skip the remaining candidates of sentence i; stop at end of
							// file so that the count check below reports the shortfall
							do {
								line = inFile.readLine();
							} while (line != null && nbestSentenceIndex(line) == i);
						}
						
						i += 1;
						
					} // while (line != null)
					
					if (i != numSentences) {
						println("Not enough candidates were found (i = " + i + "; was expecting " + numSentences + ")");
						System.exit(33);
					}
					
				} // nbest format
				
			} finally {
				// closing the underlying stream releases the file even if reading failed
				inStream.close();
			}
			
		} catch (FileNotFoundException e) {
			System.err.println("FileNotFoundException in JoshuaEval.evaluate(String,String,int,int): " + e.getMessage());
			System.exit(99901);
		} catch (IOException e) {
			System.err.println("IOException in JoshuaEval.evaluate(String,String,int,int): " + e.getMessage());
			System.exit(99902);
		}
		
		
		// IA[i] = i: pair the i'th candidate with the i'th sentence
		int[] IA = new int[numSentences];
		for (int i = 0; i < numSentences; ++i) { IA[i] = i; }
		int[][] SS = evalMetric.suffStats(topCand_str,IA);
		
		int suffStatsCount = evalMetric.get_suffStatsCount();
		
		// sum the per-sentence sufficient statistics over the whole set
		int[] totStats = new int[suffStatsCount];
		for (int s = 0; s < suffStatsCount; ++s) {
			for (int i = 0; i < numSentences; ++i) {
				totStats[s] += SS[i][s];
			}
		}
		
		evalMetric.printDetailedScore_fromStats(totStats,false);
		
		if (verbose) {
			println("");
			println("Printing detailed scores for individual sentences...");
			for (int i = 0; i < numSentences; ++i) {
				print("Sentence #" + i + ": ");
				int[] stats = new int[suffStatsCount];
				for (int s = 0; s < suffStatsCount; ++s) { stats[s] = SS[i][s]; }
				evalMetric.printDetailedScore_fromStats(stats,true);
				// already prints a \n
			}
		}
		
	} // void evaluate(...)
	
	
	/**
	 * Extracts the sentence number from an n-best line, i.e. the integer
	 * that precedes the first " |||" separator.
	 */
	private static int nbestSentenceIndex(String line) {
		return Integer.parseInt(line.substring(0,line.indexOf(" |||")).trim());
	}
	
		
	/**
	 * Prints the command-line usage message to standard output.
	 *
	 * @param argsLen number of arguments the user supplied (echoed in the
	 *                first line of the message)
	 */
	private static void printUsage(int argsLen) {
		println("Oops, you provided " + argsLen + " args!");
		println("");
		println("Usage:");
		// the synopsis lists every option described below, including -txtNrm
		println(" JoshuaEval [-cand candFile] [-format candFileFormat] [-rank r]\n            [-ref refFile] [-rps refsPerSen] [-txtNrm textNormMethod]\n            [-m metricName metric options] [-evr evalRefs] [-v verbose]");
		println("");
		println(" (*) -cand candFile: candidate translations\n       [[default: candidates.txt]]");
		println(" (*) -format candFileFormat: is the candidate file a plain file (one candidate\n       per sentence) or does it contain multiple candidates per sentence as\n       a decoder's output)?  For the first, use \"plain\".  For the second,\n       use \"nbest\".\n       [[default: plain]]");
		println(" (*) -rank r: if format=nbest, evaluate the set of r'th candidates.\n       [[default: 1]]");
		println(" (*) -ref refFile: reference translations (or file name prefix)\n       [[default: references.txt]]");
		println(" (*) -rps refsPerSen: number of reference translations per sentence\n       [[default: 1]]");
		println(" (*) -txtNrm textNormMethod: how should text be normalized?\n          (0) don't normalize text,\n       or (1) \"NIST-style\", and also rejoin 're, *'s, n't, etc,\n       or (2) apply 1 and also rejoin dashes between letters,\n       or (3) apply 1 and also drop non-ASCII characters,\n       or (4) apply 1+2+3\n       [[default: 1]]");
		println(" (*) -m metricName metric options: name of evaluation metric and its options\n       [[default: BLEU 4 closest]]");
		println(" (*) -evr evalRefs: evaluate references (1) or not (0) (sanity check)\n       [[default: 0]]");
		println(" (*) -v verbose: evaluate individual sentences (1) or not (0)\n       [[default: 0]]");
		println("");
		println("Ex.: java JoshuaEval -cand nbest.out -ref ref.all -rps 4 -m BLEU 4 shortest");
	}
	
	
	/**
	 * Parses the command-line arguments into the static configuration
	 * fields (unspecified options keep their defaults), loads and
	 * normalizes the reference translations, and initializes the
	 * evaluation metric.
	 *
	 * Terminates the JVM (via System.exit) on an unknown option, a missing
	 * or invalid option value, a missing reference file, or an I/O error.
	 *
	 * @param args the command-line arguments, as "-option value" pairs
	 *             ("-m" additionally consumes the metric's own options)
	 */
	private static void processArgsAndInitialize(String[] args) {
		EvaluationMetric.set_knownMetrics();
		
		// set default values
		candFileName = "candidates.txt";
		candFileFormat = "plain";
		candRank = 1;
		refFileName = "references.txt";
		refsPerSen = 1;
		textNormMethod = 1;
		metricName = "BLEU";
		metricOptions = new String[2];
		metricOptions[0] = "4";
		metricOptions[1] = "closest";
		evaluateRefs = false;
		verbose = false;
		
		int i = 0;
		
		while (i < args.length) {
			String option = args[i];
			if (option.equals("-cand")) {
				candFileName = optionValue(args, i);
			} else if (option.equals("-format")) {
				candFileFormat = optionValue(args, i);
				if (!candFileFormat.equals("plain") && !candFileFormat.equals("nbest")) {
					println("candFileFormat must be either plain or nbest.");
					System.exit(10);
				}
			} else if (option.equals("-rank")) {
				candRank = intOptionValue(args, i);
				// validate the rank itself (this used to test refsPerSen by mistake)
				if (candRank < 1) {
					println("Argument for -rank must be positive.");
					System.exit(10);
				}
			} else if (option.equals("-ref")) {
				refFileName = optionValue(args, i);
			} else if (option.equals("-rps")) {
				refsPerSen = intOptionValue(args, i);
				if (refsPerSen < 1) {
					println("refsPerSen must be positive.");
					System.exit(10);
				}
			} else if (option.equals("-txtNrm")) {
				textNormMethod = intOptionValue(args, i);
				if (textNormMethod < 0 || textNormMethod > 4) {
					println("textNormMethod should be between 0 and 4");
					System.exit(10);
				}
			} else if (option.equals("-m")) {
				metricName = optionValue(args, i);
				if (EvaluationMetric.knownMetricName(metricName)) {
					int optionCount = EvaluationMetric.metricOptionCount(metricName);
					// the metric's options occupy args[i+2] .. args[i+1+optionCount]
					if (i + 1 + optionCount >= args.length) {
						println("Metric " + metricName + " requires " + optionCount + " option(s).");
						System.exit(10);
					}
					metricOptions = new String[optionCount];
					for (int opt = 0; opt < optionCount; ++opt) {
						metricOptions[opt] = args[i+opt+2];
					}
					i += optionCount;
				} else {
					println("Unknown metric name " + metricName + ".");
					System.exit(10);
				}
			} else if (option.equals("-evr")) {
				int evr = intOptionValue(args, i);
				if (evr == 1) {
					evaluateRefs = true;
				} else if (evr == 0) {
					evaluateRefs = false;
				} else {
					println("evalRefs must be either 0 or 1.");
					System.exit(10);
				}
			} else if (option.equals("-v")) {
				int v = intOptionValue(args, i);
				if (v == 1) {
					verbose = true;
				} else if (v == 0) {
					verbose = false;
				} else {
					println("verbose must be either 0 or 1.");
					System.exit(10);
				}
			} else {
				println("Unknown option " + option); System.exit(10);
			}
			
			// skip the option and its value
			i += 2;
			
		} // while (i)
		
		if (refsPerSen > 1) {
			// the provided refFileName might be a prefix
			File dummy = new File(refFileName);
			if (!dummy.exists()) {
				refFileName = createUnifiedRefFile(refFileName,refsPerSen);
			}
		} else {
			checkFile(refFileName);
		}
		
		
		// initialize
		numSentences = countLines(refFileName) / refsPerSen;
		
		// read in reference sentences
		refSentences = new String[numSentences][refsPerSen];
		
		try {
			
			InputStream inStream_refs = new FileInputStream(new File(refFileName));
			
			try {
				BufferedReader inFile_refs = new BufferedReader(new InputStreamReader(inStream_refs, "utf8"));
				
				for (int s = 0; s < numSentences; ++s) {
					for (int r = 0; r < refsPerSen; ++r) {
						// read the rth reference translation for the sth sentence
						refSentences[s][r] = normalize(inFile_refs.readLine(), textNormMethod);
					}
				}
			} finally {
				// closing the underlying stream releases the file even if reading failed
				inStream_refs.close();
			}
			
		} catch (FileNotFoundException e) {
			System.err.println("FileNotFoundException in JoshuaEval.processArgsAndInitialize(String[]): " + e.getMessage());
			System.exit(99901);
		} catch (IOException e) {
			System.err.println("IOException in JoshuaEval.processArgsAndInitialize(String[]): " + e.getMessage());
			System.exit(99902);
		}
		
		// set static data members for the EvaluationMetric class
		EvaluationMetric.set_numSentences(numSentences);
		EvaluationMetric.set_refsPerSen(refsPerSen);
		EvaluationMetric.set_refSentences(refSentences);
		
		// do necessary initialization for the evaluation metric
		evalMetric = EvaluationMetric.getMetric(metricName,metricOptions);
		
		println("Processing " + numSentences + " sentences...");
		
	} // processArgsAndInitialize(String[] args)
	
	
	/**
	 * Returns the value following the option at args[i], or exits with
	 * status 10 if the option is the last argument and so has no value.
	 */
	private static String optionValue(String[] args, int i) {
		if (i + 1 >= args.length) {
			println("Missing value for option " + args[i] + ".");
			System.exit(10);
		}
		return args[i+1];
	}
	
	
	/**
	 * Returns the integer value following the option at args[i], or exits
	 * with status 10 if the value is missing or is not an integer.
	 */
	private static int intOptionValue(String[] args, int i) {
		String value = optionValue(args, i);
		int parsed = 0;
		try {
			parsed = Integer.parseInt(value);
		} catch (NumberFormatException e) {
			println("Argument for " + args[i] + " must be an integer (was: " + value + ").");
			System.exit(10);
		}
		return parsed;
	}
	
	
	/**
	 * Verifies that the named file exists; if it does not, reports the
	 * problem on standard output and terminates the JVM with status 40.
	 *
	 * @param fileName path of the file that must exist
	 */
	private static void checkFile(String fileName) {
		if (fileExists(fileName)) {
			return; // nothing to report
		}
		
		println("The file " + fileName + " was not found!");
		System.exit(40);
	}
	
	
	/**
	 * Tells whether a file or directory exists at the given path.
	 *
	 * @param fileName path to test
	 * @return true if something exists at that path, false otherwise
	 */
	private static boolean fileExists(String fileName) {
		return new File(fileName).exists();
	}
	
	
	/**
	 * Interleaves several per-reference files into one unified reference
	 * file in which the numFiles references of each sentence are adjacent.
	 *
	 * The input files are named prefix+k or prefix+"."+k for consecutive
	 * integers k, starting at 0 if a file with index 0 exists and at 1
	 * otherwise.  The output is written, as UTF-8 (the encoding it is later
	 * read back with), to prefix+".all".
	 *
	 * Terminates the JVM (via System.exit) if the reference files cannot be
	 * found, if their line counts differ, or on an I/O error.
	 *
	 * @param prefix   file name prefix shared by the reference files
	 * @param numFiles number of reference files to combine
	 * @return the name of the unified file, or prefix itself (after a
	 *         warning) when numFiles is less than 2
	 */
	private static String createUnifiedRefFile(String prefix, int numFiles) {
		if (numFiles < 2) {
			println("Warning: createUnifiedRefFile called with numFiles = " + numFiles + "; doing nothing.");
			return prefix;
		}
		
		// accept both "prefix1" and "prefix.1" naming
		if (!new File(prefix + "1").exists()) {
			if (!new File(prefix + ".1").exists()) {
				println("Can't find reference files.");
				System.exit(50);
			}
			prefix = prefix + ".";
		}
		
		String outFileName = prefix.endsWith(".") ? prefix + "all" : prefix + ".all";
		
		// numbering may start at 0 or at 1
		int firstIndex = new File(prefix + "0").exists() ? 0 : 1;
		
		// validate all inputs before the output file is created or truncated
		int lineCount = countLines(prefix + firstIndex);
		for (int r = 1; r < numFiles; ++r) {
			String name = prefix + (firstIndex + r);
			if (countLines(name) != lineCount) {
				println("Line count mismatch in " + name + ".");
				System.exit(60);
			}
		}
		
		BufferedReader[] inFile = new BufferedReader[numFiles];
		PrintWriter outFile = null;
		
		try {
			for (int r = 0; r < numFiles; ++r) {
				InputStream inStream = new FileInputStream(new File(prefix + (firstIndex + r)));
				inFile[r] = new BufferedReader(new InputStreamReader(inStream, "utf8"));
			}
			
			// write with the same encoding the inputs are read with
			outFile = new PrintWriter(outFileName, "utf8");
			
			for (int i = 0; i < lineCount; ++i) {
				for (int r = 0; r < numFiles; ++r) {
					outFile.println(inFile[r].readLine());
				}
			}
			
			// PrintWriter never throws on write failure; it has to be asked
			if (outFile.checkError()) {
				System.err.println("IOException in JoshuaEval.createUnifiedRefFile(String,int): error writing " + outFileName);
				System.exit(99902);
			}
			
		} catch (FileNotFoundException e) {
			System.err.println("FileNotFoundException in JoshuaEval.createUnifiedRefFile(String,int): " + e.getMessage());
			System.exit(99901);
		} catch (IOException e) {
			System.err.println("IOException in JoshuaEval.createUnifiedRefFile(String,int): " + e.getMessage());
			System.exit(99902);
		} finally {
			if (outFile != null) {
				outFile.close();
			}
			for (int r = 0; r < numFiles; ++r) {
				if (inFile[r] != null) {
					try {
						inFile[r].close();
					} catch (IOException ignored) {
						// best effort: the data has already been read
					}
				}
			}
		}
		
		return outFileName;
		
	} // createUnifiedRefFile(String prefix, int numFiles)
	
	private static String normalize(String str, int normMethod)
	{
		if (normMethod == 0) return str;
		
		// replace HTML/SGML
		str = str.replaceAll(""","\"");
		str = str.replaceAll("&","&");
		str = str.replaceAll("<","<");
		str = str.replaceAll(">",">");
		str = str.replaceAll("'","'");
		
		
		
		// split on these characters:
		// ! " # $ % & ( ) * + / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
		// i.e. ASCII 33-126, except alphanumeric, and except "," "-" "." "'"
		
		//                 ! "#  $%&  (  )  *  +/:;<=>  ?@  [   \  ]  ^_`  {  |  }~
		String split_on = "!\"#\\$%&\\(\\)\\*\\+/:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~";
		
//		println("split_on: " + split_on);
		
		for (int k = 0; k < split_on.length(); ++k) {
			// for each split character, reprocess the string
			String regex = "" + split_on.charAt(k);
			if (regex.equals("\\")) {
				++k;
				regex += split_on.charAt(k);
			}
			str = str.replaceAll(regex," " + regex + " ");
		}
		
		
		
		// split on "." and "," and "-", conditioned on proper context
		
		str = " " + str + " ";
		str = str.replaceAll("\\s+"," ");
		
		TreeSet<Integer> splitIndices = new TreeSet<Integer>();
		
		for (int i = 0; i < str.length(); ++i) {
			char ch = str.charAt(i);
			if (ch == '.' || ch == ',') {
				// split if either of the previous or next characters is a non-digit
				char prev_ch = str.charAt(i-1);
				char next_ch = str.charAt(i+1);
				if (prev_ch < '0' || prev_ch > '9' || next_ch < '0' || next_ch > '9') {
					splitIndices.add(i);
				}
			} else if (ch == '-') {
				// split if preceded by a digit
				char prev_ch = str.charAt(i-1);
				if (prev_ch >= '0' && prev_ch <= '9') {
					splitIndices.add(i);
				}
			}
		}
		
		String str0 = str;
		str = "";
		
		for (int i = 0; i < str0.length(); ++i) {
			if (splitIndices.contains(i)) {
				str += " " + str0.charAt(i) + " ";
			} else {
				str += str0.charAt(i);
			}
		}
		
		
		
		// rejoin i'm, we're, *'s, won't, don't, etc
		
		str = " " + str + " ";
		str = str.replaceAll("\\s+"," ");
		
		str = str.replaceAll(" i 'm "," i'm ");
		str = str.replaceAll(" we 're "," we're ");
		str = str.replaceAll(" 's ","'s ");
		str = str.replaceAll(" 've ","'ve ");
		str = str.replaceAll(" 'll ","'ll ");
		str = str.replaceAll(" 'd ","'d ");
		str = str.replaceAll(" n't ","n't ");
		
		
		
		// remove spaces around dashes
		if (normMethod == 2 || normMethod == 4) {
			
			TreeSet<Integer> skipIndices = new TreeSet<Integer>();
			str = " " + str + " ";
			
			for (int i = 0; i < str.length(); ++i) {
				char ch = str.charAt(i);
				if (ch == '-') {
					// rejoin if surrounded by spaces, and then letters
					if (str.charAt(i-1) == ' ' && str.charAt(i+1) == ' ') {
						if (Character.isLetter(str.charAt(i-2)) && Character.isLetter(str.charAt(i+2))) {
							skipIndices.add(i-1);
							skipIndices.add(i+1);
						}
					}
				}
			}
			
			str0 = str;
			str = "";
			
			for (int i = 0; i < str0.length(); ++i) {
				if (!skipIndices.contains(i)) {
					str += str0.charAt(i);
				}
			}
		}
		
		
		
		// drop non-ASCII characters
		if (normMethod == 3 || normMethod == 4) {
			
			str0 = str;
			str = "";
			
			for (int i = 0; i < str0.length(); ++i) {
				char ch = str0.charAt(i);
				if (ch <= 127) { // i.e. if ASCII
					str += ch;
				}
			}
		}
		
		
		
		str = str.replaceAll("\\s+"," ");
		
		str = str.trim();
		
		return str;
	}
	
	// TODO: we should handle errors properly for the three use sites of this function, and should remove the function.
	//       OK, but we don't want it to use LineReader, so it can function within the standalone release of Z-MERT. -- O.Z.
	/**
	 * Counts the lines of a text file.
	 *
	 * On an I/O error (including a missing file) a message is printed to
	 * standard error and the JVM is terminated via System.exit.
	 *
	 * @param fileName path of the file whose lines are counted
	 * @return the number of lines read from the file
	 */
	private static int countLines(String fileName) {
		int count = 0;
		
		try {
			BufferedReader inFile = new BufferedReader(new FileReader(fileName));
			try {
				while (inFile.readLine() != null) {
					++count;
				}
			} finally {
				// release the file even if reading failed part-way
				inFile.close();
			}
		} catch (IOException e) {
			System.err.println("IOException in JoshuaEval.countLines(String): " + e.getMessage());
			System.exit(99902);
		}
		
		return count;
	}
	
	
	/** Writes the given object, followed by a line break, to standard output. */
	private static void println(Object obj) {
		System.out.println(obj);
	}
	/** Writes the given object to standard output without a trailing line break. */
	private static void print(Object obj) {
		System.out.print(obj);
	}

	/**
	 * Command-line entry point: parses the arguments, evaluates the
	 * candidate file, and optionally evaluates each reference set as a
	 * sanity check.  With no arguments, prints the usage message and exits.
	 *
	 * @param args command-line arguments (see printUsage)
	 */
	public static void main(String[] args) {
		if (args.length == 0) {
			printUsage(args.length);
			System.exit(0);
		}
		
		// non-specified args will be set to default values in processArgsAndInitialize
		processArgsAndInitialize(args);
		
		final boolean plainFormat = candFileFormat.equals("plain");
		if (plainFormat) {
			println("Evaluating candidate translations in plain file " + candFileName + "...");
			evaluateCands_plain(candFileName);
		} else if (candFileFormat.equals("nbest")) {
			println("Evaluating set of " + candRank + "'th candidate translations from " + candFileName + "...");
			evaluateCands_nbest(candFileName,candRank);
		}
		println("");
		
		if (!evaluateRefs) {
			return;
		}
		
		// evaluate the references themselves; useful if developing a new evaluation metric
		println("");
		println("PERFORMING SANITY CHECK:");
		println("------------------------");
		println("");
		println("This metric's scores range from "
			+ evalMetric.worstPossibleScore() + " (worst) to "
			+ evalMetric.bestPossibleScore() + " (best).");
		
		int refSet = 1;
		while (refSet <= refsPerSen) {
			println("");
			println("(*) Evaluating reference set " + refSet + ":");
			println("");
			evaluateRefSet(refSet);
			println("");
			++refSet;
		}
		
	} // main(String[] args)

}