// CompareGrammars.java example
//
// Explorer
// relax-decode-master
// - third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.decoder.ff.tm.hiero.HieroFormatReader;

/**
 * This class allows two grammars (loaded from disk) to be compared.
 * <p>
 * Each grammar file is read line by line and every line is split into
 * fields with a delimiter regular expression. The comparison checks
 * that both files contain the same sets of values in fields 0, 1 and 2
 * and, if so, that the numeric scores found in field 3 agree line by
 * line within a tolerance.
 *
 * @author Lane Schwartz
 */
public class CompareGrammars {

	/** Logger for this class. */
	private static final Logger logger =
		Logger.getLogger(CompareGrammars.class.getName());

	/**
	 * Number of leading fields (left-hand side, source right-hand
	 * side, target right-hand side) that must agree for two lines
	 * to be treated as the same rule.
	 */
	private static final int RULE_IDENTITY_FIELDS = 3;

	/**
	 * Gets a set containing all unique instances of the specified
	 * field.
	 *
	 * @param grammarFile File containing a grammar.
	 * @param fieldDelimiter Regular expression to split each
	 *                       line
	 * @param fieldNumber Zero-based index of the field to extract
	 *                    from each line
	 * @return set containing all unique instances of the
	 *         specified field
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws ArrayIndexOutOfBoundsException if a line splits into
	 *         <code>fieldNumber</code> or fewer fields
	 */
	public static Set<String> getFields(File grammarFile, String fieldDelimiter, int fieldNumber) throws FileNotFoundException {

		Set<String> set = new HashSet<String>();

		Scanner grammarScanner = new Scanner(grammarFile);
		try {
			while (grammarScanner.hasNextLine()) {
				String[] fields = grammarScanner.nextLine().split(fieldDelimiter);
				set.add(fields[fieldNumber]);
			}
		} finally {
			// Release the underlying file handle even if a line is malformed.
			grammarScanner.close();
		}

		return set;
	}

	/**
	 * Compares, line by line, one numeric score from each of two
	 * grammar files.
	 * <p>
	 * Line <em>n</em> of the first file is paired with line <em>n</em>
	 * of the second file. Both lines must describe the same rule (see
	 * {@link #sameRule(String[], String[])}); the selected scores are
	 * then compared and every pair whose absolute difference exceeds
	 * <code>delta</code> is counted as a mismatch. A summary is
	 * written to the logger. If the files have different numbers of
	 * lines, only the common prefix is compared and a warning is
	 * logged.
	 *
	 * @param grammarFile1 first grammar file
	 * @param grammarFile2 second grammar file
	 * @param fieldDelimiter Regular expression to split each line
	 *                       into fields
	 * @param fieldNumber Zero-based index of the field that holds
	 *                    the scores
	 * @param scoresDelimiter Regular expression to split the scores
	 *                        field into individual scores
	 * @param scoresFieldNumber Zero-based index of the score to
	 *                          compare
	 * @param delta largest absolute difference that still counts
	 *              as a match
	 * @throws FileNotFoundException if either file cannot be opened
	 * @throws RuntimeException if a pair of lines does not describe
	 *         the same rule
	 * @throws NumberFormatException if a selected score is not a
	 *         parsable float
	 */
	public static void compareValues(File grammarFile1, File grammarFile2, String fieldDelimiter, int fieldNumber, String scoresDelimiter, int scoresFieldNumber, float delta) throws FileNotFoundException {

		Scanner grammarScanner1 = new Scanner(grammarFile1);
		try {
			Scanner grammarScanner2 = new Scanner(grammarFile2);
			try {
				int counter = 0;
				int mismatches = 0;
				float totalOverDiffs = 0.0f;

				while (grammarScanner1.hasNextLine() && grammarScanner2.hasNextLine()) {

					counter++;

					String line1 = grammarScanner1.nextLine();
					String[] fields1 = line1.split(fieldDelimiter);
					float score1 = parseScore(fields1, fieldNumber, scoresDelimiter, scoresFieldNumber);

					String line2 = grammarScanner2.nextLine();
					String[] fields2 = line2.split(fieldDelimiter);
					float score2 = parseScore(fields2, fieldNumber, scoresDelimiter, scoresFieldNumber);

					if (!sameRule(fields1, fields2)) {
						throw new RuntimeException("Lines don't match: " + line1 + " and " + line2);
					}

					// Math.abs is symmetric, so a single subtraction suffices.
					float diff = Math.abs(score1 - score2);

					if (diff > delta) {
						logger.fine("Line " + counter + ":  Score mismatch: " + score1 + " vs " + score2);
						mismatches++;
						totalOverDiffs += diff;
					} else if (logger.isLoggable(Level.FINEST)) {
						logger.finest("Line " + counter + ": Scores MATCH: " + score1 + " vs " + score2);
					}
				}

				// The loop stops as soon as either file is exhausted;
				// make any leftover lines visible instead of ignoring them.
				if (grammarScanner1.hasNextLine() || grammarScanner2.hasNextLine()) {
					logger.warning("Grammar files have different numbers of lines; only the first " + counter + " lines were compared");
				}

				if (mismatches == 0) {
					logger.info("No score mismatches");
				} else {
					logger.warning("Number of mismatches: " + mismatches + " out of " + counter);
					logger.warning("Total mismatch logProb mass: " + totalOverDiffs + " (" + totalOverDiffs/mismatches + ") (" + totalOverDiffs/counter+")");
				}
			} finally {
				grammarScanner2.close();
			}
		} finally {
			grammarScanner1.close();
		}
	}

	/**
	 * Extracts one score from the already-split fields of a line.
	 *
	 * @param fields fields of one grammar line
	 * @param fieldNumber index of the field that holds the scores
	 * @param scoresDelimiter Regular expression to split the scores
	 *                        field
	 * @param scoresFieldNumber index of the score to return
	 * @return the selected score
	 */
	private static float parseScore(String[] fields, int fieldNumber, String scoresDelimiter, int scoresFieldNumber) {
		String[] scores = fields[fieldNumber].split(scoresDelimiter);
		return Float.parseFloat(scores[scoresFieldNumber]);
	}

	/**
	 * Tells whether two split lines describe the same rule, i.e.
	 * whether their leading identity fields (at most
	 * {@link #RULE_IDENTITY_FIELDS}) are exactly equal. Lines with
	 * fewer fields are compared on the fields they have; lines with
	 * differing numbers of identity fields never match.
	 *
	 * @param fields1 fields of a line from the first grammar
	 * @param fields2 fields of a line from the second grammar
	 * @return <code>true</code> if the identity fields are equal
	 */
	private static boolean sameRule(String[] fields1, String[] fields2) {
		int count1 = Math.min(fields1.length, RULE_IDENTITY_FIELDS);
		int count2 = Math.min(fields2.length, RULE_IDENTITY_FIELDS);
		if (count1 != count2) {
			return false;
		}
		for (int i = 0; i < count1; i++) {
			if (!fields1[i].equals(fields2[i])) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Checks whether two grammar files contain the same set of
	 * values in the given field and logs the outcome.
	 *
	 * @param grammarFile1 first grammar file
	 * @param grammarFile2 second grammar file
	 * @param fieldDelimiter Regular expression to split each line
	 * @param fieldNumber index of the field to compare
	 * @param description plural name of the field, used in the log
	 *                    messages
	 * @return <code>true</code> if both files yield equal sets
	 * @throws FileNotFoundException if either file cannot be opened
	 */
	private static boolean sameFieldSets(File grammarFile1, File grammarFile2, String fieldDelimiter, int fieldNumber, String description) throws FileNotFoundException {
		Set<String> values1 = getFields(grammarFile1, fieldDelimiter, fieldNumber);
		Set<String> values2 = getFields(grammarFile2, fieldDelimiter, fieldNumber);

		if (values1.equals(values2)) {
			logger.info("Grammar files have the same set of " + description);
			return true;
		}

		logger.warning("Grammar files have differing sets of " + description);
		return false;
	}

	/**
	 * Main method.
	 * <p>
	 * Compares the sets of left-hand sides, source right-hand sides
	 * and target right-hand sides of the two grammars. Only when all
	 * three agree are the first three scores of field 3 compared,
	 * using a tolerance of 0.001.
	 *
	 * @param args names of the two grammars to be compared
	 * @throws FileNotFoundException if either grammar file cannot
	 *         be opened
	 */
	public static void main(String[] args) throws FileNotFoundException {

		if (args.length != 2) {
			logger.severe("Usage: " + CompareGrammars.class.toString() + " grammarFile1 grammarFile2");
			System.exit(-1);
		}

		// Tell standard in and out to use UTF-8
		FormatUtil.useUTF8();
		logger.finest("Using UTF-8");

		logger.info("Comparing grammar files " + args[0] + " and " + args[1]);

		File grammarFile1 = new File(args[0]);
		File grammarFile2 = new File(args[1]);

		String fieldDelimiter = HieroFormatReader.getFieldDelimiter();

		// Run all three checks unconditionally so that every
		// difference is reported, not just the first one found.
		boolean sameLHS = sameFieldSets(grammarFile1, grammarFile2, fieldDelimiter, 0, "left-hand sides");
		boolean sameSourceRHS = sameFieldSets(grammarFile1, grammarFile2, fieldDelimiter, 1, "source right-hand sides");
		boolean sameTargetRHS = sameFieldSets(grammarFile1, grammarFile2, fieldDelimiter, 2, "target right-hand sides");

		// Compare translation probs
		if (sameLHS && sameSourceRHS && sameTargetRHS) {
			float delta = 0.001f;
			for (int scoreIndex = 0; scoreIndex < 3; scoreIndex++) {
				compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", scoreIndex, delta);
			}
		}

	}

}