/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.util;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashSet;
import java.util.Scanner;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.decoder.ff.tm.hiero.HieroFormatReader;
/**
* This class allows two grammars (loaded from disk) to be compared.
*
* @author Lane Schwartz
*/
public class CompareGrammars {

    /** Logger for this class. */
    private static final Logger logger =
        Logger.getLogger(CompareGrammars.class.getName());

    /**
     * Number of leading fields that identify a rule: left-hand side,
     * source right-hand side and target right-hand side (fields 0, 1
     * and 2, as compared by {@link #main(String[])}).
     */
    private static final int RULE_FIELD_COUNT = 3;

    /**
     * Gets a set containing all unique instances of the specified
     * field.
     * <p>
     * Every line of the file is split using the delimiter, and the
     * requested field of each line is collected. The file is closed
     * before this method returns, whether it completes normally or
     * with an exception.
     *
     * @param grammarFile    File containing a grammar.
     * @param fieldDelimiter Regular expression to split each
     *                       line
     * @param fieldNumber    Zero-based index of the field to extract
     *                       from each rule
     * @return set containing all unique instances of the
     *         specified field
     * @throws FileNotFoundException if the grammar file cannot be
     *         opened for reading
     * @throws ArrayIndexOutOfBoundsException if a line has no field
     *         at index <code>fieldNumber</code>
     */
    public static Set<String> getFields(File grammarFile, String fieldDelimiter, int fieldNumber) throws FileNotFoundException {
        Set<String> set = new HashSet<String>();
        Scanner grammarScanner = new Scanner(grammarFile);
        try {
            while (grammarScanner.hasNextLine()) {
                String[] fields = grammarScanner.nextLine().split(fieldDelimiter);
                set.add(fields[fieldNumber]);
            }
        } finally {
            // Release the underlying file handle even if a line is malformed.
            grammarScanner.close();
        }
        return set;
    }

    /**
     * Compares one score of each rule in two grammar files, line by
     * line.
     * <p>
     * Line <em>i</em> of the first file is compared against line
     * <em>i</em> of the second file. Both lines must describe the same
     * rule, meaning their leading rule fields (up to the first three)
     * are equal. The selected score is extracted from each line, and
     * the pair is counted as a mismatch when the absolute difference
     * exceeds <code>delta</code>. A summary is written to the logger
     * once all lines have been read. If the files have differing
     * numbers of lines, only the lines present in both are compared
     * and a warning is logged.
     *
     * @param grammarFile1      First grammar file
     * @param grammarFile2      Second grammar file
     * @param fieldDelimiter    Regular expression to split each
     *                          line into fields
     * @param fieldNumber       Zero-based index of the field that
     *                          holds the scores
     * @param scoresDelimiter   Regular expression to split the
     *                          scores field into individual scores
     * @param scoresFieldNumber Zero-based index of the score to
     *                          compare
     * @param delta             Largest absolute difference that is
     *                          still considered a match
     * @throws FileNotFoundException if either grammar file cannot be
     *         opened for reading
     * @throws RuntimeException if corresponding lines of the two
     *         files do not describe the same rule
     * @throws NumberFormatException if a selected score is not a
     *         parsable number
     */
    public static void compareValues(File grammarFile1, File grammarFile2, String fieldDelimiter, int fieldNumber, String scoresDelimiter, int scoresFieldNumber, float delta) throws FileNotFoundException {
        Scanner grammarScanner1 = new Scanner(grammarFile1);
        try {
            // Opened inside the outer try so that the first scanner is
            // still closed if the second file cannot be opened.
            Scanner grammarScanner2 = new Scanner(grammarFile2);
            try {
                compareScannedValues(grammarScanner1, grammarScanner2, fieldDelimiter, fieldNumber, scoresDelimiter, scoresFieldNumber, delta);
            } finally {
                grammarScanner2.close();
            }
        } finally {
            grammarScanner1.close();
        }
    }

    /** Performs the line-by-line score comparison on two already opened scanners and logs the summary. */
    private static void compareScannedValues(Scanner grammarScanner1, Scanner grammarScanner2, String fieldDelimiter, int fieldNumber, String scoresDelimiter, int scoresFieldNumber, float delta) {
        int counter = 0;
        // Counted explicitly (rather than via a set of lines) so that
        // duplicate lines are not collapsed into a single mismatch.
        int mismatches = 0;
        float totalOverDiffs = 0.0f;

        while (grammarScanner1.hasNextLine() && grammarScanner2.hasNextLine()) {
            counter++;

            String line1 = grammarScanner1.nextLine();
            String[] fields1 = line1.split(fieldDelimiter);
            float score1 = parseScore(fields1, fieldNumber, scoresDelimiter, scoresFieldNumber);

            String line2 = grammarScanner2.nextLine();
            String[] fields2 = line2.split(fieldDelimiter);
            float score2 = parseScore(fields2, fieldNumber, scoresDelimiter, scoresFieldNumber);

            if (!sameRule(fields1, fields2)) {
                throw new RuntimeException("Lines don't match: " + line1 + " and " + line2);
            }

            float diff = Math.abs(score1 - score2);
            if (diff > delta) {
                logger.fine("Line " + counter + ": Score mismatch: " + score1 + " vs " + score2);
                mismatches++;
                totalOverDiffs += diff;
            } else if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Line " + counter + ": Scores MATCH: " + score1 + " vs " + score2);
            }
        }

        // The loop stops at the end of the shorter file; make that visible.
        if (grammarScanner1.hasNextLine() || grammarScanner2.hasNextLine()) {
            logger.warning("Grammar files have differing numbers of lines; only the first " + counter + " lines were compared");
        }

        if (mismatches == 0) {
            logger.info("No score mismatches");
        } else {
            logger.warning("Number of mismatches: " + mismatches + " out of " + counter);
            logger.warning("Total mismatch logProb mass: " + totalOverDiffs + " (" + totalOverDiffs/mismatches + ") (" + totalOverDiffs/counter+")");
        }
    }

    /** Extracts and parses the selected score from the scores field of an already split line. */
    private static float parseScore(String[] fields, int fieldNumber, String scoresDelimiter, int scoresFieldNumber) {
        String[] scores = fields[fieldNumber].split(scoresDelimiter);
        return Float.parseFloat(scores[scoresFieldNumber]);
    }

    /** Tells whether two split lines agree exactly on their leading rule fields (at most the first three). */
    private static boolean sameRule(String[] fields1, String[] fields2) {
        int count1 = Math.min(fields1.length, RULE_FIELD_COUNT);
        int count2 = Math.min(fields2.length, RULE_FIELD_COUNT);
        if (count1 != count2) {
            return false;
        }
        for (int i = 0; i < count1; i++) {
            if (!fields1[i].equals(fields2[i])) {
                return false;
            }
        }
        return true;
    }

    /** Logs whether the two grammar files contain the same set of values in the given field, and returns the result. */
    private static boolean sameFieldSets(File grammarFile1, File grammarFile2, String fieldDelimiter, int fieldNumber, String description) throws FileNotFoundException {
        Set<String> fields1 = getFields(grammarFile1, fieldDelimiter, fieldNumber);
        Set<String> fields2 = getFields(grammarFile2, fieldDelimiter, fieldNumber);
        boolean same = fields1.equals(fields2);
        if (same) {
            logger.info("Grammar files have the same set of " + description);
        } else {
            logger.warning("Grammar files have differing sets of " + description);
        }
        return same;
    }

    /**
     * Main method.
     * <p>
     * Compares the sets of left-hand sides, source right-hand sides
     * and target right-hand sides of the two grammars. Only if all
     * three sets agree, the first three scores of every rule are then
     * compared line by line.
     *
     * @param args names of the two grammars to be compared
     * @throws FileNotFoundException if either grammar file cannot be
     *         opened for reading
     */
    public static void main(String[] args) throws FileNotFoundException {

        if (args.length != 2) {
            logger.severe("Usage: " + CompareGrammars.class.getName() + " grammarFile1 grammarFile2");
            System.exit(-1);
        }

        // Tell standard in and out to use UTF-8
        FormatUtil.useUTF8();
        logger.finest("Using UTF-8");

        logger.info("Comparing grammar files " + args[0] + " and " + args[1]);

        File grammarFile1 = new File(args[0]);
        File grammarFile2 = new File(args[1]);

        String fieldDelimiter = HieroFormatReader.getFieldDelimiter();

        // Each comparison is run unconditionally so that every
        // difference is reported, not just the first one found.
        boolean sameLHS = sameFieldSets(grammarFile1, grammarFile2, fieldDelimiter, 0, "left-hand sides");
        boolean sameSourceRHS = sameFieldSets(grammarFile1, grammarFile2, fieldDelimiter, 1, "source right-hand sides");
        boolean sameTargetRHS = sameFieldSets(grammarFile1, grammarFile2, fieldDelimiter, 2, "target right-hand sides");

        // Compare translation probs
        if (sameLHS && sameSourceRHS && sameTargetRHS) {
            float delta = 0.001f;
            for (int scoreIndex = 0; scoreIndex < 3; scoreIndex++) {
                compareValues(grammarFile1, grammarFile2, fieldDelimiter, 3, "\\s+", scoreIndex, delta);
            }
        }
    }
}