/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.zmert;
import joshua.decoder.*;
import java.util.*;
import java.io.*;
import java.util.zip.*;
import java.text.DecimalFormat;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
public class MertCore
{
private TreeSet<Integer>[] indicesOfInterest_all;
private final static DecimalFormat f4 = new DecimalFormat("###0.0000");
private final Runtime myRuntime = Runtime.getRuntime();
private final static double NegInf = (-1.0 / 0.0);
private final static double PosInf = (+1.0 / 0.0);
private final static double epsilon = 1.0 / 1000000;
private int progress;
private int verbosity; // anything of priority <= verbosity will be printed
// (lower value for priority means more important)
private Random randGen;
private int generatedRands;
private int numSentences;
// number of sentences in the dev set
// (aka the "MERT training" set)
private int numDocuments;
// number of documents in the dev set
// this should be 1, unless doing doc-level optimization
private int[] docOfSentence;
// docOfSentence[i] stores which document contains the i'th sentence.
// docOfSentence is 0-indexed, as are the documents (i.e. first doc is indexed 0)
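// e.g. (hypothetical) a dev set made up of two documents with 3 and 2 sentences,
// respectively, would have docOfSentence = {0, 0, 0, 1, 1}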
private int[] docSubsetInfo;
// stores information regarding which subset of the documents are evaluated
// [0]: method (0-6)
// [1]: first (1-indexed)
// [2]: last (1-indexed)
// [3]: size
// [4]: center
// [5]: arg1
// [6]: arg2
// [1-6] are 0 for method 0, [6] is 0 for methods 1-4 as well
// only [1] and [2] are needed for optimization. The rest are only needed for an output message.
private int refsPerSen;
// number of reference translations per sentence
private int textNormMethod;
// 0: no normalization, 1: "NIST-style" tokenization, and also rejoin 'm, 're, *'s, 've, 'll, 'd, and n't,
// 2: apply 1 and also rejoin dashes between letters, 3: apply 1 and also drop non-ASCII characters
// 4: apply 1+2+3
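// e.g. (hypothetical) under method 1, a candidate containing the tokens "do n't"
// would presumably end up with "don't" after the rejoining step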
private int numParams;
// number of features for the log-linear model
private double[] normalizationOptions;
// How should a lambda[] vector be normalized (before decoding)?
// nO[0] = 0: no normalization
// nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
// nO[0] = 2: scale so that the maximum absolute value is nO[1]
// nO[0] = 3: scale so that the minimum absolute value is nO[1]
// nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
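// e.g. (hypothetical values) with nO[0] = 1, nO[1] = 1.0, and nO[2] = the index of the
// "lm" weight: if lambda[lm] is 2.0 before normalization, all weights are multiplied
// by 0.5 so that |lambda[lm]| becomes 1.0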
/* *********************************************************** */
/* NOTE: indexing starts at 1 in the following few arrays: */
/* *********************************************************** */
private String[] paramNames;
// feature names, needed to read/create config file
private double[] lambda;
// the current weight vector. NOTE: indexing starts at 1.
private boolean[] isOptimizable;
// isOptimizable[c] = true iff lambda[c] should be optimized
private double[] minThValue;
private double[] maxThValue;
// when investigating thresholds along the lambda[c] dimension, only values
// in the [minThValue[c],maxThValue[c]] range will be considered.
// (*) minThValue and maxThValue can be real values as well as -Infinity and +Infinity
// (coded as -Inf and +Inf, respectively, in an input file)
private double[] minRandValue;
private double[] maxRandValue;
// when choosing a random value for the lambda[c] parameter, it will be
// chosen from the [minRandValue[c],maxRandValue[c]] range.
// (*) minRandValue and maxRandValue must be real values, but not -Inf or +Inf
private int damianos_method;
private double damianos_param;
private double damianos_mult;
private double[] defaultLambda;
// "default" parameter values; simply the values read in the parameter file
/* *********************************************************** */
/* *********************************************************** */
private JoshuaDecoder myDecoder;
// COMMENT OUT if decoder is not Joshua
private String decoderCommand;
// the command that runs the decoder; read from decoderCommandFileName
private int decVerbosity;
// verbosity level for decoder output. If 0, decoder output is ignored.
// If 1, decoder output is printed.
private int validDecoderExitValue;
// return value from running the decoder command that indicates success
private int numOptThreads;
// number of threads to run things in parallel
private int saveInterFiles;
// 0: nothing, 1: only configs, 2: only n-bests, 3: both configs and n-bests
private int compressFiles;
// should Z-MERT gzip the large files? If 0, no compression takes place.
// If 1, compression is performed on: decoder output files, temp sents files,
// and temp feats files.
private int sizeOfNBest;
// size of N-best list generated by decoder at each iteration
// (aka simply N, but N is a bad variable name)
private long seed;
// seed used to create random number generators
private boolean randInit;
// if true, parameters are initialized randomly. If false, parameters
// are initialized using values from parameter file.
private int initsPerIt;
// number of intermediate initial points per iteration
private int maxMERTIterations, minMERTIterations, prevMERTIterations;
// max: maximum number of MERT iterations
// min: minimum number of MERT iterations before an early MERT exit
// prev: number of previous MERT iterations from which to consider candidates (in addition to
// the candidates from the current iteration)
private double stopSigValue;
// early MERT exit if no weight changes by more than stopSigValue
// (but see minMERTIterations above and stopMinIts below)
private int stopMinIts;
// some early stopping criterion must be satisfied in stopMinIts *consecutive* iterations
// before an early exit (but see minMERTIterations above)
private boolean oneModificationPerIteration;
// if true, each MERT iteration performs at most one parameter modification.
// If false, a new MERT iteration starts (i.e. a new N-best list is
// generated) only after the previous iteration reaches a local maximum.
private String metricName;
// name of evaluation metric optimized by MERT
private String metricName_display;
// name of evaluation metric optimized by MERT, possibly with "doc-level " prefixed
private String[] metricOptions;
// options for the evaluation metric (e.g. for BLEU, maxGramLength and effLengthMethod)
private EvaluationMetric evalMetric;
// the evaluation metric used by MERT
private int suffStatsCount;
// number of sufficient statistics for the evaluation metric
private String tmpDirPrefix;
// prefix for the ZMERT.temp.* files
private int passIterationToDecoder;
// should the iteration number be passed as an argument to decoderCommandFileName?
// If 1, iteration number is passed. If 0, launch with no arguments.
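// e.g. (hypothetical) with decoderCommandFileName = "run_decoder.sh" and
// passIterationToDecoder == 1, iteration 3 launches "run_decoder.sh 3"
// (see the external-decoder branch of run_decoder(int))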
private String dirPrefix; // where are all these files located?
private String paramsFileName, docInfoFileName, finalLambdaFileName;
private String sourceFileName, refFileName, decoderOutFileName;
private String decoderConfigFileName, decoderCommandFileName;
private String fakeFileNameTemplate, fakeFileNamePrefix, fakeFileNameSuffix;
// e.g. output.it[1-x].someOldRun would be specified as:
// output.it?.someOldRun
// and we'd have prefix = "output.it" and suffix = ".someOldRun"
// private int useDisk;
public MertCore()
{
}
public MertCore(String[] args)
{
EvaluationMetric.set_knownMetrics();
processArgsArray(args);
initialize(0);
}
public MertCore(String configFileName)
{
EvaluationMetric.set_knownMetrics();
processArgsArray(cfgFileToArgsArray(configFileName));
initialize(0);
}
private void initialize(int randsToSkip)
{
println("NegInf: " + NegInf + ", PosInf: " + PosInf + ", epsilon: " + epsilon,4);
randGen = new Random(seed);
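// skip the first randsToSkip draws, presumably so that a resumed run continues
// from the same point in the random sequence (cf. generatedRands below)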
for (int r = 1; r <= randsToSkip; ++r) {
randGen.nextDouble();
}
generatedRands = randsToSkip;
if (randsToSkip == 0) {
println("----------------------------------------------------",1);
println("Initializing...",1);
println("----------------------------------------------------",1);
println("",1);
println("Random number generator initialized using seed: " + seed,1);
println("",1);
}
numSentences = countLines(refFileName) / refsPerSen;
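// (the reference file is expected to contain refsPerSen consecutive lines per
// source sentence; see the reading loop further below)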
processDocInfo();
// sets numDocuments and docOfSentence[]
if (numDocuments > 1) metricName_display = "doc-level " + metricName;
set_docSubsetInfo(docSubsetInfo);
numParams = countNonEmptyLines(paramsFileName) - 1;
// the parameter file contains one line per parameter
// and one line for the normalization method
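// e.g. (hypothetical values) a parameter line, as parsed in processParamFile(), looks like:
//   lm ||| 1.0 Opt 0.1 +Inf 0.5 1.5
// i.e. name ||| default Opt/Fix minThValue maxThValue minRandValue maxRandValue,
// and the normalization line looks like, e.g., "normalization = absval 1 lm"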
paramNames = new String[1+numParams];
lambda = new double[1+numParams]; // indexing starts at 1 in these arrays
isOptimizable = new boolean[1+numParams];
minThValue = new double[1+numParams];
maxThValue = new double[1+numParams];
minRandValue = new double[1+numParams];
maxRandValue = new double[1+numParams];
// precision = new double[1+numParams];
defaultLambda = new double[1+numParams];
normalizationOptions = new double[3];
try {
// read parameter names
BufferedReader inFile_names = new BufferedReader(new FileReader(paramsFileName));
for (int c = 1; c <= numParams; ++c) {
String line = "";
while (line != null && line.length() == 0) { // skip empty lines
line = inFile_names.readLine();
}
paramNames[c] = (line.substring(0,line.indexOf("|||"))).trim();
}
inFile_names.close();
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.initialize(int): " + e.getMessage());
System.exit(99902);
}
processParamFile();
// sets the arrays declared just above
// SentenceInfo.createV(); // uncomment ONLY IF using vocabulary implementation of SentenceInfo
String[][] refSentences = new String[numSentences][refsPerSen];
try {
// read in reference sentences
InputStream inStream_refs = new FileInputStream(new File(refFileName));
BufferedReader inFile_refs = new BufferedReader(new InputStreamReader(inStream_refs, "utf8"));
for (int i = 0; i < numSentences; ++i) {
for (int r = 0; r < refsPerSen; ++r) {
// read the rth reference translation for the ith sentence
refSentences[i][r] = inFile_refs.readLine();
}
}
inFile_refs.close();
// normalize reference sentences
for (int i = 0; i < numSentences; ++i) {
for (int r = 0; r < refsPerSen; ++r) {
// normalize the rth reference translation for the ith sentence
refSentences[i][r] = normalize(refSentences[i][r], textNormMethod);
}
}
// read in decoder command, if any
decoderCommand = null;
if (decoderCommandFileName != null) {
if (fileExists(decoderCommandFileName)) {
BufferedReader inFile_comm = new BufferedReader(new FileReader(decoderCommandFileName));
decoderCommand = inFile_comm.readLine();
inFile_comm.close();
}
}
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.initialize(int): " + e.getMessage());
System.exit(99902);
}
// set static data members for the EvaluationMetric class
EvaluationMetric.set_numSentences(numSentences);
EvaluationMetric.set_numDocuments(numDocuments);
EvaluationMetric.set_refsPerSen(refsPerSen);
EvaluationMetric.set_refSentences(refSentences);
EvaluationMetric.set_tmpDirPrefix(tmpDirPrefix);
evalMetric = EvaluationMetric.getMetric(metricName,metricOptions);
suffStatsCount = evalMetric.get_suffStatsCount();
// set static data members for the IntermediateOptimizer class
IntermediateOptimizer.set_MERTparams(numSentences, numDocuments, docOfSentence, docSubsetInfo,
numParams, normalizationOptions,
isOptimizable, minThValue, maxThValue,
oneModificationPerIteration, evalMetric,
tmpDirPrefix, verbosity);
if (randsToSkip == 0) { // i.e. first iteration
println("Number of sentences: " + numSentences,1);
println("Number of documents: " + numDocuments,1);
println("Optimizing " + metricName_display,1);
print("docSubsetInfo: {",1);
for (int f = 0; f < 6; ++f) print(docSubsetInfo[f] + ", ",1);
println(docSubsetInfo[6] + "}",1);
println("Number of features: " + numParams,1);
print("Feature names: {",1);
for (int c = 1; c <= numParams; ++c) {
print("\"" + paramNames[c] + "\"",1);
if (c < numParams) print(",",1);
}
println("}",1);
println("",1);
println("c Default value\tOptimizable?\tCrit. val. range\tRand. val. range",1);
for (int c = 1; c <= numParams; ++c) {
print(c + " " + f4.format(lambda[c]) + "\t\t",1);
if (!isOptimizable[c]) {
println(" No",1);
} else {
print(" Yes\t\t",1);
// print("[" + minThValue[c] + "," + maxThValue[c] + "] @ " + precision[c] + " precision",1);
print(" [" + minThValue[c] + "," + maxThValue[c] + "]",1);
print("\t\t",1);
print(" [" + minRandValue[c] + "," + maxRandValue[c] + "]",1);
println("",1);
}
}
println("",1);
print("Weight vector normalization method: ",1);
if (normalizationOptions[0] == 0) {
println("none.",1);
} else if (normalizationOptions[0] == 1) {
println("weights will be scaled so that the \"" + paramNames[(int)normalizationOptions[1]]
+ "\" weight has an absolute value of " + normalizationOptions[2] + ".",1);
} else if (normalizationOptions[0] == 2) {
println("weights will be scaled so that the maximum absolute value is "
+ normalizationOptions[1] + ".",1);
} else if (normalizationOptions[0] == 3) {
println("weights will be scaled so that the minimum absolute value is "
+ normalizationOptions[1] + ".",1);
} else if (normalizationOptions[0] == 4) {
println("weights will be scaled so that the L-" + normalizationOptions[1]
+ " norm is " + normalizationOptions[2] + ".",1);
}
println("",1);
println("----------------------------------------------------",1);
println("",1);
// rename original config file so it doesn't get overwritten
// (original name will be restored in finish())
renameFile(decoderConfigFileName,decoderConfigFileName+".ZMERT.orig");
} // if (randsToSkip == 0)
if (decoderCommand == null && fakeFileNameTemplate == null) {
println("Loading Joshua decoder...",1);
myDecoder = new JoshuaDecoder(decoderConfigFileName+".ZMERT.orig");
println("...finished loading @ " + (new Date()),1);
println("");
} else {
myDecoder = null;
}
@SuppressWarnings("unchecked")
TreeSet<Integer>[] temp_TSA = new TreeSet[numSentences];
indicesOfInterest_all = temp_TSA;
for (int i = 0; i < numSentences; ++i) {
indicesOfInterest_all[i] = new TreeSet<Integer>();
}
} // void initialize(...)
public void run_MERT()
{
run_MERT(minMERTIterations,maxMERTIterations,prevMERTIterations);
}
public void run_MERT(int minIts, int maxIts, int prevIts)
{
println("----------------------------------------------------",1);
println("Z-MERT run started @ " + (new Date()),1);
// printMemoryUsage();
println("----------------------------------------------------",1);
println("",1);
if (randInit) {
println("Initializing lambda[] randomly.",1);
// initialize optimizable parameters randomly (sampling uniformly from
// that parameter's random value range)
lambda = randomLambda();
}
println("Initial lambda[]: " + lambdaToString(lambda),1);
println("",1);
double FINAL_score = evalMetric.worstPossibleScore();
// int[] lastUsedIndex = new int[numSentences];
int[] maxIndex = new int[numSentences];
// used to grow featVal_array dynamically
// HashMap<Integer,int[]>[] suffStats_array = new HashMap[numSentences];
// suffStats_array[i] maps candidates of interest for sentence i to an array
// storing the sufficient statistics for that candidate
for (int i = 0; i < numSentences; ++i) {
// lastUsedIndex[i] = -1;
maxIndex[i] = sizeOfNBest - 1;
// suffStats_array[i] = new HashMap<Integer,int[]>();
}
/*
double[][][] featVal_array = new double[1+numParams][][];
// indexed by [param][sentence][candidate]
featVal_array[0] = null; // param indexing starts at 1
for (int c = 1; c <= numParams; ++c) {
featVal_array[c] = new double[numSentences][];
for (int i = 0; i < numSentences; ++i) {
featVal_array[c][i] = new double[maxIndex[i]];
// will grow dynamically as needed
}
}
*/
int earlyStop = 0;
// number of consecutive iterations in which an early stopping criterion was satisfied
for (int iteration = 1; ; ++iteration) {
double[] A = run_single_iteration(iteration, minIts, maxIts, prevIts, earlyStop, maxIndex);
if (A != null) {
FINAL_score = A[0];
earlyStop = (int)A[1];
if (A[2] == 1) break;
} else {
break;
}
} // for (iteration)
println("",1);
println("----------------------------------------------------",1);
println("Z-MERT run ended @ " + (new Date()),1);
// printMemoryUsage();
println("----------------------------------------------------",1);
println("",1);
println("FINAL lambda: " + lambdaToString(lambda)
+ " (" + metricName_display + ": " + FINAL_score + ")",1);
// check if a lambda is outside its threshold range
for (int c = 1; c <= numParams; ++c) {
if (lambda[c] < minThValue[c] || lambda[c] > maxThValue[c]) {
println("Warning: after normalization, lambda[" + c + "]=" + f4.format(lambda[c])
+ " is outside its critical value range.",1);
}
}
println("",1);
// delete intermediate .temp.*.it* decoder output files
for (int iteration = 1; iteration <= maxIts; ++iteration) {
if (compressFiles == 1) {
deleteFile(tmpDirPrefix+"temp.sents.it"+iteration+".gz");
deleteFile(tmpDirPrefix+"temp.feats.it"+iteration+".gz");
if (fileExists(tmpDirPrefix+"temp.stats.it"+iteration+".copy.gz")) {
deleteFile(tmpDirPrefix+"temp.stats.it"+iteration+".copy.gz");
} else {
deleteFile(tmpDirPrefix+"temp.stats.it"+iteration+".gz");
}
} else {
deleteFile(tmpDirPrefix+"temp.sents.it"+iteration);
deleteFile(tmpDirPrefix+"temp.feats.it"+iteration);
if (fileExists(tmpDirPrefix+"temp.stats.it"+iteration+".copy")) {
deleteFile(tmpDirPrefix+"temp.stats.it"+iteration+".copy");
} else {
deleteFile(tmpDirPrefix+"temp.stats.it"+iteration);
}
}
}
} // void run_MERT(int maxIts)
@SuppressWarnings("unchecked")
public double[] run_single_iteration(
int iteration, int minIts, int maxIts, int prevIts, int earlyStop, int[]maxIndex)
{
double FINAL_score = 0;
double[] retA = new double[3];
// retA[0]: FINAL_score
// retA[1]: earlyStop
// retA[2]: should this be the last iteration?
boolean done = false;
retA[2] = 1; // will only be made 0 if we don't break from the following loop
double[][][] featVal_array = new double[1+numParams][][];
// indexed by [param][sentence][candidate]
featVal_array[0] = null; // param indexing starts at 1
for (int c = 1; c <= numParams; ++c) {
featVal_array[c] = new double[numSentences][];
for (int i = 0; i < numSentences; ++i) {
featVal_array[c][i] = new double[maxIndex[i]+1];
// will grow dynamically as needed
}
}
while (!done) { // NOTE: this "loop" will only be carried out once
println("--- Starting Z-MERT iteration #" + iteration + " @ " + (new Date()) + " ---",1);
// printMemoryUsage();
// run the decoder on all the sentences, producing for each sentence a set of
// sizeOfNBest candidates, with numParams feature values for each candidate
/******************************/
// CREATE DECODER CONFIG FILE //
/******************************/
createConfigFile(lambda,decoderConfigFileName,decoderConfigFileName+".ZMERT.orig");
// i.e. use the original config file as a template
/***************/
// RUN DECODER //
/***************/
if (iteration == 1) {
println("Decoding using initial weight vector " + lambdaToString(lambda),1);
} else {
println("Redecoding using weight vector " + lambdaToString(lambda),1);
}
String[] decRunResult = run_decoder(iteration); // iteration passed in case fake decoder will be used
// [0] name of file to be processed
// [1] indicates how the output file was obtained:
// 1: external decoder
// 2: fake decoder
// 3: internal decoder
if (!decRunResult[1].equals("2")) {
println("...finished decoding @ " + (new Date()),1);
}
checkFile(decRunResult[0]);
println("Producing temp files for iteration "+iteration,3);
produceTempFiles(decRunResult[0], iteration);
if (saveInterFiles == 1 || saveInterFiles == 3) { // make copy of intermediate config file
if (!copyFile(decoderConfigFileName,decoderConfigFileName+".ZMERT.it"+iteration)) {
println("Warning: attempt to make copy of decoder config file (to create" + decoderConfigFileName+".ZMERT.it"+iteration + ") was unsuccessful!",1);
}
}
if (saveInterFiles == 2 || saveInterFiles == 3) { // make copy of intermediate decoder output file...
if (!decRunResult[1].equals("2")) { // ...but only if no fake decoder
if (!decRunResult[0].endsWith(".gz")) {
if (!copyFile(decRunResult[0],decRunResult[0]+".ZMERT.it"+iteration)) {
println("Warning: attempt to make copy of decoder output file (to create" + decRunResult[0]+".ZMERT.it"+iteration + ") was unsuccessful!",1);
}
} else {
String prefix = decRunResult[0].substring(0,decRunResult[0].length()-3);
if (!copyFile(prefix+".gz",prefix+".ZMERT.it"+iteration+".gz")) {
println("Warning: attempt to make copy of decoder output file (to create" + prefix+".ZMERT.it"+iteration+".gz" + ") was unsuccessful!",1);
}
}
if (compressFiles == 1 && !decRunResult[0].endsWith(".gz")) {
gzipFile(decRunResult[0]+".ZMERT.it"+iteration);
}
} // if (!fake)
}
int[] candCount = new int[numSentences];
int[] lastUsedIndex = new int[numSentences];
@SuppressWarnings("unchecked")
ConcurrentHashMap<Integer,int[]>[] suffStats_array = new ConcurrentHashMap[numSentences];
for (int i = 0; i < numSentences; ++i) {
candCount[i] = 0;
lastUsedIndex[i] = -1;
// suffStats_array[i].clear();
suffStats_array[i] = new ConcurrentHashMap<Integer,int[]>();
}
double[][] initialLambda = new double[1+initsPerIt][1+numParams];
// the intermediate "initial" lambdas
double[][] finalLambda = new double[1+initsPerIt][1+numParams];
// the intermediate "final" lambdas
// set initialLambda[][]
System.arraycopy(lambda,1,initialLambda[1],1,numParams);
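// initialLambda[1] is the current lambda; the remaining initsPerIt-1 initial points
// are generated randomly (or as perturbations of initialLambda[1]) just below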
for (int j = 2; j <= initsPerIt; ++j) {
if (damianos_method == 0) {
initialLambda[j] = randomLambda();
} else {
initialLambda[j] = randomPerturbation(initialLambda[1], iteration, damianos_method, damianos_param, damianos_mult);
}
}
double[] initialScore = new double[1+initsPerIt];
double[] finalScore = new double[1+initsPerIt];
int[][][] best1Cand_suffStats = new int[1+initsPerIt][numSentences][suffStatsCount];
double[][] best1Score = new double[1+initsPerIt][numSentences];
// Those two arrays are used to calculate initialScore[]
// (the "score" in best1Score refers to that assigned by the
// decoder; the "score" in initialScore refers to that
// assigned by the evaluation metric)
int firstIt = Math.max(1,iteration-prevIts);
// i.e. only process candidates from the current iteration and candidates
// from up to prevIts previous iterations.
println("Reading candidate translations from iterations " + firstIt + "-" + iteration,1);
println("(and computing " + metricName + " sufficient statistics for previously unseen candidates)",1);
print(" Progress: ");
int[] newCandidatesAdded = new int[1+iteration];
for (int it = 1; it <= iteration; ++it) { newCandidatesAdded[it] = 0; }
try {
// each inFile corresponds to the output of an iteration
// (index 0 is not used; no corresponding index for the current iteration)
BufferedReader[] inFile_sents = new BufferedReader[iteration];
BufferedReader[] inFile_feats = new BufferedReader[iteration];
BufferedReader[] inFile_stats = new BufferedReader[iteration];
for (int it = firstIt; it < iteration; ++it) {
InputStream inStream_sents, inStream_feats, inStream_stats;
if (compressFiles == 0) {
inStream_sents = new FileInputStream(tmpDirPrefix+"temp.sents.it"+it);
inStream_feats = new FileInputStream(tmpDirPrefix+"temp.feats.it"+it);
inStream_stats = new FileInputStream(tmpDirPrefix+"temp.stats.it"+it);
} else {
inStream_sents = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.sents.it"+it+".gz"));
inStream_feats = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.feats.it"+it+".gz"));
inStream_stats = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.stats.it"+it+".gz"));
}
inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
inFile_feats[it] = new BufferedReader(new InputStreamReader(inStream_feats, "utf8"));
inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
}
InputStream inStream_sentsCurrIt, inStream_featsCurrIt, inStream_statsCurrIt;
if (compressFiles == 0) {
inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix+"temp.sents.it"+iteration);
inStream_featsCurrIt = new FileInputStream(tmpDirPrefix+"temp.feats.it"+iteration);
} else {
inStream_sentsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.sents.it"+iteration+".gz"));
inStream_featsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.feats.it"+iteration+".gz"));
}
BufferedReader inFile_sentsCurrIt = new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8"));
BufferedReader inFile_featsCurrIt = new BufferedReader(new InputStreamReader(inStream_featsCurrIt, "utf8"));
BufferedReader inFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below is set to true
PrintWriter outFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below is set to false
boolean statsCurrIt_exists = false;
if (fileExists(tmpDirPrefix+"temp.stats.it"+iteration)) {
inStream_statsCurrIt = new FileInputStream(tmpDirPrefix+"temp.stats.it"+iteration);
inFile_statsCurrIt = new BufferedReader(new InputStreamReader(inStream_statsCurrIt, "utf8"));
statsCurrIt_exists = true;
copyFile(tmpDirPrefix+"temp.stats.it"+iteration,tmpDirPrefix+"temp.stats.it"+iteration+".copy");
} else if (fileExists(tmpDirPrefix+"temp.stats.it"+iteration+".gz")) {
inStream_statsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.stats.it"+iteration+".gz"));
inFile_statsCurrIt = new BufferedReader(new InputStreamReader(inStream_statsCurrIt, "utf8"));
statsCurrIt_exists = true;
copyFile(tmpDirPrefix+"temp.stats.it"+iteration+".gz",tmpDirPrefix+"temp.stats.it"+iteration+".copy.gz");
} else {
outFile_statsCurrIt = new PrintWriter(tmpDirPrefix+"temp.stats.it"+iteration);
}
PrintWriter outFile_statsMerged = new PrintWriter(tmpDirPrefix+"temp.stats.merged");
// write sufficient statistics from all the sentences
// from the output files into a single file
PrintWriter outFile_statsMergedKnown = new PrintWriter(tmpDirPrefix+"temp.stats.mergedKnown");
// write sufficient statistics for candidates already seen in previous iterations
// ("known" candidates) into a single file (merged into temp.stats.merged below)
FileOutputStream outStream_unknownCands = new FileOutputStream(tmpDirPrefix+"temp.currIt.unknownCands", false);
OutputStreamWriter outStreamWriter_unknownCands = new OutputStreamWriter(outStream_unknownCands, "utf8");
BufferedWriter outFile_unknownCands = new BufferedWriter(outStreamWriter_unknownCands);
PrintWriter outFile_unknownIndices = new PrintWriter(tmpDirPrefix+"temp.currIt.unknownIndices");
String sents_str, feats_str, stats_str;
// BUG: this assumes a candidate string cannot be produced for two
// different source sentences, which is not necessarily true
// (It's not actually a bug, but only because existingCandStats gets
// cleared before moving to the next source sentence.)
// FIX: should be made an array, indexed by i
HashMap<String,String> existingCandStats = new HashMap<String,String>();
// Stores precalculated sufficient statistics for candidates, in case
// the same candidate is seen again. (SS stored as a String.)
// Q: Why do we care? If we see the same candidate again, aren't we going
// to ignore it? So, why do we care about the SS of this repeat candidate?
// A: A "repeat" candidate may not be a repeat candidate in later
// iterations if the user specifies a value for prevMERTIterations
// that causes MERT to skip candidates from early iterations.
double[] currFeatVal = new double[1+numParams];
String[] featVal_str;
int totalCandidateCount = 0;
int[] sizeUnknown_currIt = new int[numSentences];
for (int i = 0; i < numSentences; ++i) {
for (int j = 1; j <= initsPerIt; ++j) {
best1Score[j][i] = NegInf;
}
for (int it = firstIt; it < iteration; ++it) {
// Why up to but *excluding* iteration?
// Because the last iteration is handled a little differently, since
// the SS must be calculated (and the corresponding file created),
// which is not true for previous iterations.
for (int n = 0; n <= sizeOfNBest; ++n) {
// Why up to and *including* sizeOfNBest?
// So that it would read the "||||||" separator even if there is
// a complete list of sizeOfNBest candidates.
// for the nth candidate for the ith sentence, read the sentence, feature values,
// and sufficient statistics from the various temp files
sents_str = inFile_sents[it].readLine();
feats_str = inFile_feats[it].readLine();
stats_str = inFile_stats[it].readLine();
if (sents_str.equals("||||||")) {
n = sizeOfNBest+1;
} else if (!existingCandStats.containsKey(sents_str)) {
outFile_statsMergedKnown.println(stats_str);
featVal_str = feats_str.split("\\s+");
for (int c = 1; c <= numParams; ++c) {
currFeatVal[c] = Double.parseDouble(featVal_str[c-1]);
// print("fV[" + c + "]=" + currFeatVal[c] + " ",4);
}
// println("",4);
for (int j = 1; j <= initsPerIt; ++j) {
double score = 0; // i.e. score assigned by decoder
for (int c = 1; c <= numParams; ++c) {
score += initialLambda[j][c] * currFeatVal[c];
}
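// (for j > 1 this is the model score under the j'th intermediate initial point,
// not the score assigned by the decoder's actual weight vector)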
if (score > best1Score[j][i]) {
best1Score[j][i] = score;
String[] tempStats = stats_str.split("\\s+");
for (int s = 0; s < suffStatsCount; ++s)
best1Cand_suffStats[j][i][s] = Integer.parseInt(tempStats[s]);
}
} // for (j)
existingCandStats.put(sents_str,stats_str);
setFeats(featVal_array,i,lastUsedIndex,maxIndex,currFeatVal);
candCount[i] += 1;
newCandidatesAdded[it] += 1;
} // if unseen candidate
} // for (n)
} // for (it)
outFile_statsMergedKnown.println("||||||");
// now go through the candidates of the current iteration and determine which of
// them have not been seen before (their sufficient statistics are computed further below)
/* remember:
BufferedReader inFile_sentsCurrIt
BufferedReader inFile_featsCurrIt
PrintWriter outFile_statsCurrIt
*/
String[] sentsCurrIt_currSrcSent = new String[sizeOfNBest+1];
Vector<String> unknownCands_V = new Vector<String>();
// which candidates (of the i'th source sentence) have not been seen before
// this iteration?
for (int n = 0; n <= sizeOfNBest; ++n) {
// Why up to and *including* sizeOfNBest?
// So that it would read the "||||||" separator even if there is
// a complete list of sizeOfNBest candidates.
// for the nth candidate for the ith sentence, read the sentence,
// and store it in the sentsCurrIt_currSrcSent array
sents_str = inFile_sentsCurrIt.readLine();
sentsCurrIt_currSrcSent[n] = sents_str; // Note: possibly "||||||"
if (sents_str.equals("||||||")) {
n = sizeOfNBest+1;
} else if (!existingCandStats.containsKey(sents_str)) {
unknownCands_V.add(sents_str);
writeLine(sents_str,outFile_unknownCands);
outFile_unknownIndices.println(i);
newCandidatesAdded[iteration] += 1;
existingCandStats.put(sents_str,"U"); // i.e. unknown
// we add sents_str to avoid duplicate entries in unknownCands_V
}
} // for (n)
// now unknownCands_V has the candidates for which we need to calculate
// sufficient statistics (for the i'th source sentence)
int sizeUnknown = unknownCands_V.size();
sizeUnknown_currIt[i] = sizeUnknown;
/*********************************************/
/*
String[] unknownCands = new String[sizeUnknown];
unknownCands_V.toArray(unknownCands);
int[] indices = new int[sizeUnknown];
for (int d = 0; d < sizeUnknown; ++d) {
existingCandStats.remove(unknownCands[d]);
// remove the (unknownCands[d],"U") entry from existingCandStats
// (we had added it while constructing unknownCands_V to avoid duplicate entries)
indices[d] = i;
}
*/
/*********************************************/
existingCandStats.clear();
} // for (i)
/*
int[][] newSuffStats = null;
if (!statsCurrIt_exists && sizeUnknown > 0) {
newSuffStats = evalMetric.suffStats(unknownCands, indices);
}
*/
outFile_statsMergedKnown.close();
outFile_unknownCands.close();
outFile_unknownIndices.close();
for (int it = firstIt; it < iteration; ++it) {
inFile_sents[it].close();
inFile_stats[it].close();
InputStream inStream_sents, inStream_stats;
if (compressFiles == 0) {
inStream_sents = new FileInputStream(tmpDirPrefix+"temp.sents.it"+it);
inStream_stats = new FileInputStream(tmpDirPrefix+"temp.stats.it"+it);
} else {
inStream_sents = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.sents.it"+it+".gz"));
inStream_stats = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.stats.it"+it+".gz"));
}
inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
}
inFile_sentsCurrIt.close();
if (compressFiles == 0) {
inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix+"temp.sents.it"+iteration);
} else {
inStream_sentsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix+"temp.sents.it"+iteration+".gz"));
}
inFile_sentsCurrIt = new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8"));
// calculate SS for unseen candidates and write them to file
FileInputStream inStream_statsCurrIt_unknown = null;
BufferedReader inFile_statsCurrIt_unknown = null;
if (!statsCurrIt_exists && newCandidatesAdded[iteration] > 0) {
// create the file...
evalMetric.createSuffStatsFile(tmpDirPrefix+"temp.currIt.unknownCands", tmpDirPrefix+"temp.currIt.unknownIndices", tmpDirPrefix+"temp.stats.unknown", sizeOfNBest);
// ...and open it
inStream_statsCurrIt_unknown = new FileInputStream(tmpDirPrefix+"temp.stats.unknown");
inFile_statsCurrIt_unknown = new BufferedReader(new InputStreamReader(inStream_statsCurrIt_unknown, "utf8"));
}
// OPEN mergedKnown file
FileInputStream instream_statsMergedKnown = new FileInputStream(tmpDirPrefix+"temp.stats.mergedKnown");
BufferedReader inFile_statsMergedKnown = new BufferedReader(new InputStreamReader(instream_statsMergedKnown, "utf8"));
for (int i = 0; i < numSentences; ++i) {
// reprocess candidates from previous iterations
for (int it = firstIt; it < iteration; ++it) {
for (int n = 0; n <= sizeOfNBest; ++n) {
sents_str = inFile_sents[it].readLine();
stats_str = inFile_stats[it].readLine();
if (sents_str.equals("||||||")) {
n = sizeOfNBest+1;
} else if (!existingCandStats.containsKey(sents_str)) {
existingCandStats.put(sents_str,stats_str);
} // if unseen candidate
} // for (n)
} // for (it)
// copy relevant portion from mergedKnown to the merged file
String line_mergedKnown = inFile_statsMergedKnown.readLine();
while (!line_mergedKnown.equals("||||||")) {
outFile_statsMerged.println(line_mergedKnown);
line_mergedKnown = inFile_statsMergedKnown.readLine();
}
int d = -1;
int[] stats = new int[suffStatsCount];
for (int n = 0; n <= sizeOfNBest; ++n) {
// Why up to and *including* sizeOfNBest?
// So that it would read the "||||||" separator even if there is
// a complete list of sizeOfNBest candidates.
// for the nth candidate for the ith sentence, read the sentence, feature values,
// and sufficient statistics from the various temp files
sents_str = inFile_sentsCurrIt.readLine();
feats_str = inFile_featsCurrIt.readLine();
if (sents_str.equals("||||||")) {
n = sizeOfNBest+1;
} else if (!existingCandStats.containsKey(sents_str)) {
++d;
if (!statsCurrIt_exists) {
stats_str = inFile_statsCurrIt_unknown.readLine();
String[] temp_stats = stats_str.split("\\s+");
for (int s = 0; s < suffStatsCount; ++s) {
stats[s] = Integer.parseInt(temp_stats[s]);
}
/*
stats_str = "";
for (int s = 0; s < suffStatsCount-1; ++s) {
stats[s] = newSuffStats[d][s];
stats_str += (stats[s] + " ");
}
stats[suffStatsCount-1] = newSuffStats[d][suffStatsCount-1];
stats_str += stats[suffStatsCount-1];
*/
outFile_statsCurrIt.println(stats_str);
} else {
stats_str = inFile_statsCurrIt.readLine();
String[] temp_stats = stats_str.split("\\s+");
for (int s = 0; s < suffStatsCount; ++s) {
stats[s] = Integer.parseInt(temp_stats[s]);
}
}
outFile_statsMerged.println(stats_str);
featVal_str = feats_str.split("\\s+");
for (int c = 1; c <= numParams; ++c) {
currFeatVal[c] = Double.parseDouble(featVal_str[c-1]);
// print("fV[" + c + "]=" + currFeatVal[c] + " ",4);
}
// println("",4);
for (int j = 1; j <= initsPerIt; ++j) {
double score = 0; // i.e. score assigned by decoder
for (int c = 1; c <= numParams; ++c) {
score += initialLambda[j][c] * currFeatVal[c];
}
if (score > best1Score[j][i]) {
best1Score[j][i] = score;
for (int s = 0; s < suffStatsCount; ++s)
best1Cand_suffStats[j][i][s] = stats[s];
}
} // for (j)
existingCandStats.put(sents_str,stats_str);
setFeats(featVal_array,i,lastUsedIndex,maxIndex,currFeatVal);
candCount[i] += 1;
// newCandidatesAdded[iteration] += 1;
// moved to the code above that detects new candidates
} else {
if (statsCurrIt_exists)
inFile_statsCurrIt.readLine();
else {
// write SS to outFile_statsCurrIt
stats_str = existingCandStats.get(sents_str);
outFile_statsCurrIt.println(stats_str);
}
}
} // for (n)
// now d = sizeUnknown_currIt[i] - 1
if (statsCurrIt_exists)
inFile_statsCurrIt.readLine();
else
outFile_statsCurrIt.println("||||||");
existingCandStats.clear();
totalCandidateCount += candCount[i];
if ((i+1) % 500 == 0) { print((i+1) + "\n" + " ",1); }
else if ((i+1) % 100 == 0) { print("+",1); }
else if ((i+1) % 25 == 0) { print(".",1); }
} // for (i)
outFile_statsMerged.close();
println("",1); // finish progress line
for (int it = firstIt; it < iteration; ++it) {
inFile_sents[it].close();
inFile_feats[it].close();
inFile_stats[it].close();
}
inFile_sentsCurrIt.close();
inFile_featsCurrIt.close();
if (statsCurrIt_exists)
inFile_statsCurrIt.close();
else
outFile_statsCurrIt.close();
if (compressFiles == 1 && !statsCurrIt_exists) {
gzipFile(tmpDirPrefix+"temp.stats.it"+iteration);
}
deleteFile(tmpDirPrefix+"temp.currIt.unknownCands");
deleteFile(tmpDirPrefix+"temp.currIt.unknownIndices");
deleteFile(tmpDirPrefix+"temp.stats.unknown");
deleteFile(tmpDirPrefix+"temp.stats.mergedKnown");
// cleanupMemory();
println("Processed " + totalCandidateCount + " distinct candidates "
+ "(about " + totalCandidateCount/numSentences + " per sentence):",1);
for (int it = firstIt; it <= iteration; ++it) {
println("newCandidatesAdded[it=" + it + "] = " + newCandidatesAdded[it]
+ " (about " + newCandidatesAdded[it]/numSentences + " per sentence)",1);
}
println("",1);
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.run_single_iteration(6): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.run_single_iteration(6): " + e.getMessage());
System.exit(99902);
}
if (newCandidatesAdded[iteration] == 0) {
if (!oneModificationPerIteration) {
println("No new candidates added in this iteration; exiting Z-MERT.",1);
println("",1);
println("--- Z-MERT iteration #" + iteration + " ending @ " + (new Date()) + " ---",1);
println("",1);
return null; // THIS MEANS THAT THE OLD VALUES SHOULD BE KEPT BY THE CALLER
} else {
println("Note: No new candidates added in this iteration.",1);
}
}
// run the initsPerIt optimizations, in parallel, across numOptThreads threads
ExecutorService pool = Executors.newFixedThreadPool(numOptThreads);
Semaphore blocker = new Semaphore(0);
Vector<String>[] threadOutput = new Vector[initsPerIt+1];
for (int j = 1; j <= initsPerIt; ++j) {
threadOutput[j] = new Vector<String>();
pool.execute(new IntermediateOptimizer(j, blocker, threadOutput[j],
initialLambda[j], finalLambda[j], best1Cand_suffStats[j],
finalScore, candCount, featVal_array, suffStats_array));
}
pool.shutdown();
try {
blocker.acquire(initsPerIt);
} catch(java.lang.InterruptedException e) {
System.err.println("InterruptedException in MertCore.run_single_iteration(): " + e.getMessage());
System.exit(99906);
}
// extract output from threadOutput[]
for (int j = 1; j <= initsPerIt; ++j) {
for (String str : threadOutput[j]) {
println(str); // no verbosity check needed; thread already checked
}
}
int best_j = 1;
double bestFinalScore = finalScore[1];
for (int j = 2; j <= initsPerIt; ++j) {
if (evalMetric.isBetter(finalScore[j],bestFinalScore)) {
best_j = j;
bestFinalScore = finalScore[j];
}
}
if (initsPerIt > 1) {
println("Best final lambda is lambda[j=" + best_j + "] "
+ "(" + metricName_display + ": " + f4.format(bestFinalScore) + ").",1);
println("",1);
}
FINAL_score = bestFinalScore;
boolean anyParamChanged = false;
boolean anyParamChangedSignificantly = false;
for (int c = 1; c <= numParams; ++c) {
if (finalLambda[best_j][c] != lambda[c]) {
anyParamChanged = true;
}
if (Math.abs(finalLambda[best_j][c] - lambda[c]) > stopSigValue) {
anyParamChangedSignificantly = true;
}
}
System.arraycopy(finalLambda[best_j],1,lambda,1,numParams);
println("--- Z-MERT iteration #" + iteration + " ending @ " + (new Date()) + " ---",1);
println("",1);
if (!anyParamChanged) {
println("No parameter value changed in this iteration; exiting Z-MERT.",1);
println("",1);
break; // exit for (iteration) loop preemptively
}
// check if a lambda is outside its threshold range
for (int c = 1; c <= numParams; ++c) {
if (lambda[c] < minThValue[c] || lambda[c] > maxThValue[c]) {
println("Warning: after normalization, lambda[" + c + "]="
+ f4.format(lambda[c]) + " is outside its critical value range.",1);
}
}
// was an early stopping criterion satisfied?
boolean critSatisfied = false;
if (!anyParamChangedSignificantly && stopSigValue >= 0) {
println("Note: No parameter value changed significantly "
+ "(i.e. by more than " + stopSigValue + ") in this iteration.",1);
critSatisfied = true;
}
if (critSatisfied) { ++earlyStop; println("",1); }
else { earlyStop = 0; }
// if min number of iterations executed, investigate if early exit should happen
if (iteration >= minIts && earlyStop >= stopMinIts) {
println("Some early stopping criteria has been observed "
+ "in " + stopMinIts + " consecutive iterations; exiting Z-MERT.",1);
println("",1);
break; // exit for (iteration) loop preemptively
}
// if max number of iterations executed, exit
if (iteration >= maxIts) {
println("Maximum number of MERT iterations reached; exiting Z-MERT.",1);
println("",1);
break; // exit for (iteration) loop
}
println("Next iteration will decode with lambda: " + lambdaToString(lambda),1);
println("",1);
// printMemoryUsage();
for (int i = 0; i < numSentences; ++i) {
suffStats_array[i].clear();
}
// cleanupMemory();
// println("",2);
retA[2] = 0; // i.e. this should NOT be the last iteration
done = true;
} // while (!done) // NOTE: this "loop" will only be carried out once
// delete .temp.stats.merged file, since it is not needed in the next
// iteration (it will be recreated from scratch)
deleteFile(tmpDirPrefix+"temp.stats.merged");
retA[0] = FINAL_score;
retA[1] = earlyStop;
return retA;
} // run_single_iteration
private String lambdaToString(double[] lambdaA)
{
String retStr = "{";
for (int c = 1; c <= numParams-1; ++c) {
retStr += "" + lambdaA[c] + ", ";
}
retStr += "" + lambdaA[numParams] + "}";
return retStr;
}
private String[] run_decoder(int iteration)
{
String[] retSA = new String[2];
// [0] name of file to be processed
// [1] indicates how the output file was obtained:
// 1: external decoder
// 2: fake decoder
// 3: internal decoder
if (fakeFileNameTemplate != null && fileExists(fakeFileNamePrefix+iteration+fakeFileNameSuffix)) {
String fakeFileName = fakeFileNamePrefix+iteration+fakeFileNameSuffix;
println("Not running decoder; using " + fakeFileName + " instead.",1);
/*
if (fakeFileName.endsWith(".gz")) {
copyFile(fakeFileName,decoderOutFileName+".gz");
gunzipFile(decoderOutFileName+".gz");
} else {
copyFile(fakeFileName,decoderOutFileName);
}
*/
retSA[0] = fakeFileName;
retSA[1] = "2";
} else if (decoderCommand == null) {
if (myDecoder == null) {
println("Loading Joshua decoder...",1);
myDecoder = new JoshuaDecoder(decoderConfigFileName+".ZMERT.orig");
println("...finished loading @ " + (new Date()),1);
println("");
}
println("Running Joshua decoder on source file " + sourceFileName + "...",1);
// myDecoder.initialize(decoderConfigFileName);
double[] zeroBased_lambda = new double[numParams];
System.arraycopy(lambda,1,zeroBased_lambda,0,numParams);
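// (lambda[] is 1-indexed; the decoder expects a 0-indexed weight vector)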
myDecoder.changeBaselineFeatureWeights(zeroBased_lambda);
myDecoder.decodeTestSet(sourceFileName, decoderOutFileName);
retSA[0] = decoderOutFileName;
retSA[1] = "3";
} else {
println("Running external decoder...",1);
try {
Runtime rt = Runtime.getRuntime();
String cmd = decoderCommandFileName;
if (passIterationToDecoder == 1) {
cmd = cmd + " " + iteration;
}
Process p = rt.exec(cmd);
StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), decVerbosity);
StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), decVerbosity);
errorGobbler.start();
outputGobbler.start();
int decStatus = p.waitFor();
if (decStatus != validDecoderExitValue) {
println("Call to decoder returned " + decStatus
+ "; was expecting " + validDecoderExitValue + ".");
System.exit(30);
}
} catch (IOException e) {
System.err.println("IOException in MertCore.run_decoder(int): " + e.getMessage());
System.exit(99902);
} catch (InterruptedException e) {
System.err.println("InterruptedException in MertCore.run_decoder(int): " + e.getMessage());
System.exit(99903);
}
retSA[0] = decoderOutFileName;
retSA[1] = "1";
}
return retSA;
}
private void produceTempFiles(String nbestFileName, int iteration)
{
try {
String sentsFileName = tmpDirPrefix+"temp.sents.it"+iteration;
String featsFileName = tmpDirPrefix+"temp.feats.it"+iteration;
FileOutputStream outStream_sents = new FileOutputStream(sentsFileName, false);
OutputStreamWriter outStreamWriter_sents = new OutputStreamWriter(outStream_sents, "utf8");
BufferedWriter outFile_sents = new BufferedWriter(outStreamWriter_sents);
PrintWriter outFile_feats = new PrintWriter(featsFileName);
InputStream inStream_nbest = null;
if (nbestFileName.endsWith(".gz")) {
inStream_nbest = new GZIPInputStream(new FileInputStream(nbestFileName));
} else {
inStream_nbest = new FileInputStream(nbestFileName);
}
BufferedReader inFile_nbest = new BufferedReader(new InputStreamReader(inStream_nbest, "utf8"));
String line; //, prevLine;
String candidate_str = "";
String feats_str = "";
int i = 0; int n = 0;
line = inFile_nbest.readLine();
while (line != null) {
/*
line format:
i ||| words of candidate translation . ||| feat-1_val feat-2_val ... feat-numParams_val .*
*/
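// e.g. (hypothetical) a line might look like:
//   0 ||| the cat sat on the mat . ||| 0.5 -2.3 1.7 ||| -12.4
// anything after a further "|||" (such as the total score above) is discarded below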
// in a well formed file, we'd find the nth candidate for the ith sentence
int read_i = Integer.parseInt((line.substring(0,line.indexOf("|||"))).trim());
if (read_i != i) {
writeLine("||||||",outFile_sents);
outFile_feats.println("||||||");
n = 0; ++i;
}
line = (line.substring(line.indexOf("|||")+3)).trim(); // get rid of initial text
candidate_str = (line.substring(0,line.indexOf("|||"))).trim();
feats_str = (line.substring(line.indexOf("|||")+3)).trim();
// get rid of candidate string
int junk_i = feats_str.indexOf("|||");
if (junk_i >= 0) {
feats_str = (feats_str.substring(0,junk_i)).trim();
}
writeLine(normalize(candidate_str,textNormMethod), outFile_sents);
outFile_feats.println(feats_str);
++n;
if (n == sizeOfNBest) {
writeLine("||||||",outFile_sents);
outFile_feats.println("||||||");
n = 0; ++i;
}
line = inFile_nbest.readLine();
}
if (i != numSentences) { // last sentence had too few candidates
writeLine("||||||",outFile_sents);
outFile_feats.println("||||||");
}
inFile_nbest.close();
outFile_sents.close();
outFile_feats.close();
if (compressFiles == 1) {
gzipFile(sentsFileName);
gzipFile(featsFileName);
}
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.produceTempFiles(int): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.produceTempFiles(int): " + e.getMessage());
System.exit(99902);
}
}
private void createConfigFile(double[] params, String cfgFileName, String templateFileName)
{
try {
// i.e. create cfgFileName, which is similar to templateFileName, but with
// params[] as parameter values
BufferedReader inFile = new BufferedReader(new FileReader(templateFileName));
PrintWriter outFile = new PrintWriter(cfgFileName);
String line = inFile.readLine();
while (line != null) {
int c_match = -1;
for (int c = 1; c <= numParams; ++c) {
if (line.startsWith(paramNames[c] + " ")) { c_match = c; break; }
}
if (c_match == -1) {
outFile.println(line);
} else {
outFile.println(paramNames[c_match] + " " + params[c_match]);
}
line = inFile.readLine();
}
inFile.close();
outFile.close();
} catch (IOException e) {
System.err.println("IOException in MertCore.createConfigFile(double[],String,String): " + e.getMessage());
System.exit(99902);
}
}
private void processParamFile()
{
// process parameter file
Scanner inFile_init = null;
try {
inFile_init = new Scanner(new FileReader(paramsFileName));
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.processParamFile(): " + e.getMessage());
System.exit(99901);
}
String dummy = "";
// initialize lambda[] and other related arrays
for (int c = 1; c <= numParams; ++c) {
// skip parameter name
while (!dummy.equals("|||")) { dummy = inFile_init.next(); }
// read default value
lambda[c] = inFile_init.nextDouble();
defaultLambda[c] = lambda[c];
// read isOptimizable
dummy = inFile_init.next();
if (dummy.equals("Opt")) { isOptimizable[c] = true; }
else if (dummy.equals("Fix")) { isOptimizable[c] = false; }
else {
println("Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)");
System.exit(21);
}
if (!isOptimizable[c]) { // skip next four values
dummy = inFile_init.next();
dummy = inFile_init.next();
dummy = inFile_init.next();
dummy = inFile_init.next();
} else {
// set minThValue[c] and maxThValue[c] (range for thresholds to investigate)
dummy = inFile_init.next();
if (dummy.equals("-Inf")) { minThValue[c] = NegInf; }
else if (dummy.equals("+Inf")) {
println("minThValue[" + c + "] cannot be +Inf!");
System.exit(21);
} else { minThValue[c] = Double.parseDouble(dummy); }
dummy = inFile_init.next();
if (dummy.equals("-Inf")) {
println("maxThValue[" + c + "] cannot be -Inf!");
System.exit(21);
} else if (dummy.equals("+Inf")) { maxThValue[c] = PosInf; }
else { maxThValue[c] = Double.parseDouble(dummy); }
// set minRandValue[c] and maxRandValue[c] (range for random values)
dummy = inFile_init.next();
if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
println("minRandValue[" + c + "] cannot be -Inf or +Inf!");
System.exit(21);
} else { minRandValue[c] = Double.parseDouble(dummy); }
dummy = inFile_init.next();
if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
println("maxRandValue[" + c + "] cannot be -Inf or +Inf!");
System.exit(21);
} else { maxRandValue[c] = Double.parseDouble(dummy); }
// check for illogical values
if (minThValue[c] > maxThValue[c]) {
println("minThValue[" + c + "]=" + minThValue[c]
+ " > " + maxThValue[c] + "=maxThValue[" + c + "]!");
System.exit(21);
}
if (minRandValue[c] > maxRandValue[c]) {
println("minRandValue[" + c + "]=" + minRandValue[c]
+ " > " + maxRandValue[c] + "=maxRandValue[" + c + "]!");
System.exit(21);
}
// check for odd values
if (!(minThValue[c] <= lambda[c] && lambda[c] <= maxThValue[c])) {
println("Warning: lambda[" + c + "] has initial value (" + lambda[c] + ")",1);
println(" that is outside its critical value range "
+ "[" + minThValue[c] + "," + maxThValue[c] + "]",1);
}
if (minThValue[c] == maxThValue[c]) {
println("Warning: lambda[" + c + "] has "
+ "minThValue = maxThValue = " + minThValue[c] + ".",1);
}
if (minRandValue[c] == maxRandValue[c]) {
println("Warning: lambda[" + c + "] has "
+ "minRandValue = maxRandValue = " + minRandValue[c] + ".",1);
}
if (minRandValue[c] < minThValue[c] || minRandValue[c] > maxThValue[c]
|| maxRandValue[c] < minThValue[c] || maxRandValue[c] > maxThValue[c]) {
println("Warning: The random value range for lambda[" + c + "] is not contained",1);
println(" within its critical value range.",1);
}
} // if (!isOptimizable[c])
/*
precision[c] = inFile_init.nextDouble();
if (precision[c] < 0) {
println("precision[" + c + "]=" + precision[c] + " < 0! Must be non-negative.");
System.exit(21);
}
*/
}
// set normalizationOptions[]
String origLine = "";
while (origLine != null && origLine.length() == 0) { origLine = inFile_init.nextLine(); }
// How should a lambda[] vector be normalized (before decoding)?
// nO[0] = 0: no normalization
// nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
// nO[0] = 2: scale so that the maximum absolute value is nO[1]
// nO[0] = 3: scale so that the minimum absolute value is nO[1]
// nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
// normalization = none
// normalization = absval 1 lm
// normalization = maxabsval 1
// normalization = minabsval 1
// normalization = LNorm 2 1
dummy = (origLine.substring(origLine.indexOf("=")+1)).trim();
String[] dummyA = dummy.split("\\s+");
if (dummyA[0].equals("none")) {
normalizationOptions[0] = 0;
} else if (dummyA[0].equals("absval")) {
normalizationOptions[0] = 1;
normalizationOptions[1] = Double.parseDouble(dummyA[1]);
String pName = dummyA[2];
for (int i = 3; i < dummyA.length; ++i) { // in case parameter name has multiple words
pName = pName + " " + dummyA[i];
}
normalizationOptions[2] = c_fromParamName(pName);
if (normalizationOptions[1] <= 0) {
println("Value for the absval normalization method must be positive.");
System.exit(21);
}
if (normalizationOptions[2] == 0) {
println("Unrecognized feature name " + normalizationOptions[2]
+ " for absval normalization method.",1);
System.exit(21);
}
} else if (dummyA[0].equals("maxabsval")) {
normalizationOptions[0] = 2;
normalizationOptions[1] = Double.parseDouble(dummyA[1]);
if (normalizationOptions[1] <= 0) {
println("Value for the maxabsval normalization method must be positive.");
System.exit(21);
}
} else if (dummyA[0].equals("minabsval")) {
normalizationOptions[0] = 3;
normalizationOptions[1] = Double.parseDouble(dummyA[1]);
if (normalizationOptions[1] <= 0) {
println("Value for the minabsval normalization method must be positive.");
System.exit(21);
}
} else if (dummyA[0].equals("LNorm")) {
normalizationOptions[0] = 4;
normalizationOptions[1] = Double.parseDouble(dummyA[1]);
normalizationOptions[2] = Double.parseDouble(dummyA[2]);
if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) {
println("Both values for the LNorm normalization method must be positive.");
System.exit(21);
}
} else {
println("Unrecognized normalization method " + dummyA[0] + "; "
+ "must be one of none, absval, maxabsval, and LNorm.");
System.exit(21);
} // if (dummyA[0])
inFile_init.close();
}
private void processDocInfo()
{
// sets numDocuments and docOfSentence[]
docOfSentence = new int[numSentences];
if (docInfoFileName == null) {
for (int i = 0; i < numSentences; ++i) docOfSentence[i] = 0;
numDocuments = 1;
} else {
try {
// 4 possible formats (see the examples below):
// 1) List of numbers, one per document, indicating # sentences in each document.
// 2) List of "docName size" pairs, one per document, indicating name of document and # sentences.
// 3) List of docName's, one per sentence, indicating which document each sentence belongs to.
// 4) List of docName_number's, one per sentence, indicating which document each sentence belongs to,
// and its order in that document. (can also use '-' instead of '_')
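// e.g. (hypothetical) for a 5-sentence dev set split into documents "news1" (3 sentences)
// and "news2" (2 sentences), the file would contain, one entry per line:
//   format 1: 3, 2
//   format 2: "news1 3", "news2 2"
//   format 3: news1, news1, news1, news2, news2
//   format 4: news1_1, news1_2, news1_3, news2_1, news2_2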
int docInfoSize = countNonEmptyLines(docInfoFileName);
if (docInfoSize < numSentences) { // format #1 or #2
numDocuments = docInfoSize;
int i = 0;
BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName));
String line = inFile.readLine();
boolean format1 = (!(line.contains(" ")));
for (int doc = 0; doc < numDocuments; ++doc) {
if (doc != 0) line = inFile.readLine();
int docSize = 0;
if (format1) {
docSize = Integer.parseInt(line);
} else {
docSize = Integer.parseInt(line.split("\\s+")[1]);
}
for (int i2 = 1; i2 <= docSize; ++i2) {
docOfSentence[i] = doc;
++i;
}
}
// now i == numSentences
inFile.close();
} else if (docInfoSize == numSentences) { // format #3 or #4
boolean format3 = false;
HashSet<String> seenStrings = new HashSet<String>();
BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName));
for (int i = 0; i < numSentences; ++i) {
// set format3 = true if a duplicate is found
String line = inFile.readLine();
if (seenStrings.contains(line)) format3 = true;
seenStrings.add(line);
}
inFile.close();
HashSet<String> seenDocNames = new HashSet<String>();
HashMap<String,Integer> docOrder = new HashMap<String,Integer>();
// maps a document name to the order (0-indexed) in which it was seen
inFile = new BufferedReader(new FileReader(docInfoFileName));
for (int i = 0; i < numSentences; ++i) {
String line = inFile.readLine();
String docName = "";
if (format3) {
docName = line;
} else {
int sep_i = Math.max(line.lastIndexOf('_'),line.lastIndexOf('-'));
docName = line.substring(0,sep_i);
}
if (!seenDocNames.contains(docName)) {
seenDocNames.add(docName);
docOrder.put(docName,seenDocNames.size()-1);
}
int docOrder_i = docOrder.get(docName);
docOfSentence[i] = docOrder_i;
}
inFile.close();
numDocuments = seenDocNames.size();
} else { // badly formatted: more non-empty lines than the dev set has sentences
println("The docInfo file " + docInfoFileName + " has " + docInfoSize
+ " non-empty lines, but the dev set only has " + numSentences + " sentences.");
System.exit(60);
}
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.processDocInfo(): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.processDocInfo(): " + e.getMessage());
System.exit(99902);
}
}
}
private boolean copyFile(String origFileName, String newFileName)
{
try {
File inputFile = new File(origFileName);
File outputFile = new File(newFileName);
InputStream in = new FileInputStream(inputFile);
OutputStream out = new FileOutputStream(outputFile);
byte[] buffer = new byte[1024];
int len;
while ((len = in.read(buffer)) > 0){
out.write(buffer, 0, len);
}
in.close();
out.close();
/*
InputStream inStream = new FileInputStream(new File(origFileName));
BufferedReader inFile = new BufferedReader(new InputStreamReader(inStream, "utf8"));
FileOutputStream outStream = new FileOutputStream(newFileName, false);
OutputStreamWriter outStreamWriter = new OutputStreamWriter(outStream, "utf8");
BufferedWriter outFile = new BufferedWriter(outStreamWriter);
String line;
while(inFile.ready()) {
line = inFile.readLine();
writeLine(line, outFile);
}
inFile.close();
outFile.close();
*/
return true;
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.copyFile(String,String): " + e.getMessage());
return false;
} catch (IOException e) {
System.err.println("IOException in MertCore.copyFile(String,String): " + e.getMessage());
return false;
}
}
private void renameFile(String origFileName, String newFileName)
{
if (fileExists(origFileName)) {
deleteFile(newFileName);
File oldFile = new File(origFileName);
File newFile = new File(newFileName);
if (!oldFile.renameTo(newFile)) {
println("Warning: attempt to rename " + origFileName + " to " + newFileName + " was unsuccessful!",1);
}
} else {
println("Warning: file " + origFileName + " does not exist! (in MertCore.renameFile)",1);
}
}
private void deleteFile(String fileName)
{
if (fileExists(fileName)) {
File fd = new File(fileName);
if (!fd.delete()) {
println("Warning: attempt to delete " + fileName + " was unsuccessful!",1);
}
}
}
private void writeLine(String line, BufferedWriter writer) throws IOException
{
writer.write(line, 0, line.length());
writer.newLine();
writer.flush();
}
public void finish()
{
if (myDecoder != null) {
myDecoder.cleanUp();
}
// create config file with final values
createConfigFile(lambda, decoderConfigFileName+".ZMERT.final",decoderConfigFileName+".ZMERT.orig");
// delete current decoder config file and decoder output
deleteFile(decoderConfigFileName);
deleteFile(decoderOutFileName);
// restore original name for config file (name was changed
// in initialize() so it doesn't get overwritten)
renameFile(decoderConfigFileName+".ZMERT.orig",decoderConfigFileName);
if (finalLambdaFileName != null) {
try {
PrintWriter outFile_lambdas = new PrintWriter(finalLambdaFileName);
for (int c = 1; c <= numParams; ++c) {
outFile_lambdas.println(paramNames[c] + " ||| " + lambda[c]);
}
outFile_lambdas.close();
} catch (IOException e) {
System.err.println("IOException in MertCore.finish(): " + e.getMessage());
System.exit(99902);
}
}
}
private String[] cfgFileToArgsArray(String fileName)
{
checkFile(fileName);
Vector<String> argsVector = new Vector<String>();
BufferedReader inFile = null;
try {
inFile = new BufferedReader(new FileReader(fileName));
String line, origLine;
do {
line = inFile.readLine();
origLine = line; // for error reporting purposes
if (line != null && line.length() > 0 && line.charAt(0) != '#') {
if (line.indexOf("#") != -1) { // discard comment
line = line.substring(0,line.indexOf("#"));
}
line = line.trim();
// now line should look like "-xxx XXX"
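// Illustrative (hypothetical) config lines this parser accepts:
//   -dir  MERT_example     # anything after '#' was discarded above
//   -N    300
//   -m    BLEU 4 closest   (-m/-docSet/-damianos may carry extra options)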
String[] paramA = line.split("\\s+");
if (paramA.length == 2 && paramA[0].charAt(0) == '-') {
argsVector.add(paramA[0]);
argsVector.add(paramA[1]);
} else if (paramA.length > 2 && (paramA[0].equals("-m") || paramA[0].equals("-docSet") || paramA[0].equals("-damianos"))) {
// -m (metricName), -docSet, and -damianos are allowed to have extra options
for (int opt = 0; opt < paramA.length; ++opt) { argsVector.add(paramA[opt]); }
} else {
println("Malformed line in config file:");
println(origLine);
System.exit(70);
}
}
} while (line != null);
inFile.close();
} catch (FileNotFoundException e) {
println("Z-MERT configuration file " + fileName + " was not found!");
System.err.println("FileNotFoundException in MertCore.cfgFileToArgsArray(String): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.cfgFileToArgsArray(String): " + e.getMessage());
System.exit(99902);
}
String[] argsArray = new String[argsVector.size()];
for (int i = 0; i < argsVector.size(); ++i) {
argsArray[i] = argsVector.elementAt(i);
}
return argsArray;
}
private void processArgsArray(String[] args)
{
processArgsArray(args,true);
}
private void processArgsArray(String[] args, boolean firstTime) {
/* set default values */
// Relevant files
dirPrefix = null;
sourceFileName = null;
refFileName = "reference.txt";
refsPerSen = 1;
textNormMethod = 1;
paramsFileName = "params.txt";
docInfoFileName = null;
finalLambdaFileName = null;
// MERT specs
metricName = "BLEU";
metricName_display = metricName;
metricOptions = new String[2];
metricOptions[0] = "4";
metricOptions[1] = "closest";
docSubsetInfo = new int[7];
docSubsetInfo[0] = 0;
maxMERTIterations = 20;
prevMERTIterations = 20;
minMERTIterations = 5;
stopMinIts = 3;
stopSigValue = -1;
//
// /* possibly other early stopping criteria here */
//
numOptThreads = 1;
saveInterFiles = 3;
compressFiles = 0;
initsPerIt = 20;
oneModificationPerIteration = false;
randInit = false;
seed = System.currentTimeMillis();
// useDisk = 2;
// Decoder specs
decoderCommandFileName = null;
passIterationToDecoder = 0;
decoderOutFileName = "output.nbest";
validDecoderExitValue = 0;
decoderConfigFileName = "dec_cfg.txt";
sizeOfNBest = 100;
fakeFileNameTemplate = null;
fakeFileNamePrefix = null;
fakeFileNameSuffix = null;
// Output specs
verbosity = 1;
decVerbosity = 0;
damianos_method = 0;
damianos_param = 0.0;
damianos_mult = 0.0;
int i = 0;
while (i < args.length) {
String option = args[i];
// Relevant files
if (option.equals("-dir")) { dirPrefix = args[i+1];
} else if (option.equals("-s")) { sourceFileName = args[i+1];
} else if (option.equals("-r")) { refFileName = args[i+1];
} else if (option.equals("-rps")) {
refsPerSen = Integer.parseInt(args[i+1]);
if (refsPerSen < 1) {
println("refsPerSen must be positive.");
System.exit(10);
}
} else if (option.equals("-txtNrm")) {
textNormMethod = Integer.parseInt(args[i+1]);
if (textNormMethod < 0 || textNormMethod > 4) {
println("textNormMethod should be between 0 and 4");
System.exit(10);
}
} else if (option.equals("-p")) {
paramsFileName = args[i+1];
} else if (option.equals("-docInfo")) {
docInfoFileName = args[i+1];
} else if (option.equals("-fin")) { finalLambdaFileName = args[i+1];
// MERT specs
} else if (option.equals("-m")) {
metricName = args[i+1];
metricName_display = metricName;
if (EvaluationMetric.knownMetricName(metricName)) {
int optionCount = EvaluationMetric.metricOptionCount(metricName);
metricOptions = new String[optionCount];
for (int opt = 0; opt < optionCount; ++opt) {
metricOptions[opt] = args[i+opt+2];
}
i += optionCount;
} else {
println("Unknown metric name " + metricName + ".");
System.exit(10);
}
} else if (option.equals("-docSet")) {
String method = args[i+1];
if (method.equals("all")) {
docSubsetInfo[0] = 0;
i += 0;
} else if (method.equals("bottom")) {
String a = args[i+2];
if (a.endsWith("d")) {
docSubsetInfo[0] = 1;
a = a.substring(0,a.indexOf("d"));
} else {
docSubsetInfo[0] = 2;
a = a.substring(0,a.indexOf("%"));
}
docSubsetInfo[5] = Integer.parseInt(a);
i += 1;
} else if (method.equals("top")) {
String a = args[i+2];
if (a.endsWith("d")) {
docSubsetInfo[0] = 3;
a = a.substring(0,a.indexOf("d"));
} else {
docSubsetInfo[0] = 4;
a = a.substring(0,a.indexOf("%"));
}
docSubsetInfo[5] = Integer.parseInt(a);
i += 1;
} else if (method.equals("window")) {
String a1 = args[i+2];
a1 = a1.substring(0,a1.indexOf("d")); // size of window
String a2 = args[i+4];
if (a2.indexOf("p") > 0) {
docSubsetInfo[0] = 5;
a2 = a2.substring(0,a2.indexOf("p"));
} else {
docSubsetInfo[0] = 6;
a2 = a2.substring(0,a2.indexOf("r"));
}
docSubsetInfo[5] = Integer.parseInt(a1);
docSubsetInfo[6] = Integer.parseInt(a2);
i += 3;
} else {
println("Unknown docSet method " + method + ".");
System.exit(10);
}
} else if (option.equals("-maxIt")) {
maxMERTIterations = Integer.parseInt(args[i+1]);
if (maxMERTIterations < 1) {
println("maxMERTIts must be positive.");
System.exit(10);
}
} else if (option.equals("-minIt")) {
minMERTIterations = Integer.parseInt(args[i+1]);
if (minMERTIterations < 1) {
println("minMERTIts must be positive.");
System.exit(10);
}
} else if (option.equals("-prevIt")) {
prevMERTIterations = Integer.parseInt(args[i+1]);
if (prevMERTIterations < 0) {
println("prevMERTIts must be non-negative.");
System.exit(10);
}
} else if (option.equals("-stopIt")) {
stopMinIts = Integer.parseInt(args[i+1]);
if (stopMinIts < 1) {
println("stopMinIts must be positive.");
System.exit(10);
}
} else if (option.equals("-stopSig")) {
stopSigValue = Double.parseDouble(args[i+1]);
}
//
// /* possibly other early stopping criteria here */
//
else if (option.equals("-thrCnt")) {
numOptThreads = Integer.parseInt(args[i+1]);
if (numOptThreads < 1) {
println("threadCount must be positive.");
System.exit(10);
}
} else if (option.equals("-save")) {
saveInterFiles = Integer.parseInt(args[i+1]);
if (saveInterFiles < 0 || saveInterFiles > 3) {
println("save should be between 0 and 3");
System.exit(10);
}
} else if (option.equals("-compress")) {
compressFiles = Integer.parseInt(args[i+1]);
if (compressFiles < 0 || compressFiles > 1) {
println("compressFiles should be either 0 or 1");
System.exit(10);
}
} else if (option.equals("-ipi")) {
initsPerIt = Integer.parseInt(args[i+1]);
if (initsPerIt < 1) {
println("initsPerIt must be positive.");
System.exit(10);
}
} else if (option.equals("-opi")) {
int opi = Integer.parseInt(args[i+1]);
if (opi == 1) {
oneModificationPerIteration = true;
} else if (opi == 0) {
oneModificationPerIteration = false;
} else {
println("oncePerIt must be either 0 or 1.");
System.exit(10);
}
} else if (option.equals("-rand")) {
int rand = Integer.parseInt(args[i+1]);
if (rand == 1) {
randInit = true;
} else if (rand == 0) {
randInit = false;
} else {
println("randInit must be either 0 or 1.");
System.exit(10);
}
} else if (option.equals("-seed")) {
if (args[i+1].equals("time")) {
seed = System.currentTimeMillis();
} else {
seed = Long.parseLong(args[i+1]);
}
}
/*
else if (option.equals("-ud")) {
useDisk = Integer.parseInt(args[i+1]);
if (useDisk < 0 || useDisk > 2) {
println("useDisk should be between 0 and 2");
System.exit(10);
}
}
*/
// Decoder specs
else if (option.equals("-cmd")) {
decoderCommandFileName = args[i+1];
} else if (option.equals("-passIt")) {
passIterationToDecoder = Integer.parseInt(args[i+1]);
if (passIterationToDecoder < 0 || passIterationToDecoder > 1) {
println("passIterationToDecoder should be either 0 or 1");
System.exit(10);
}
} else if (option.equals("-decOut")) {
decoderOutFileName = args[i+1];
} else if (option.equals("-decExit")) {
validDecoderExitValue = Integer.parseInt(args[i+1]);
} else if (option.equals("-dcfg")) {
decoderConfigFileName = args[i+1];
} else if (option.equals("-N")) {
sizeOfNBest = Integer.parseInt(args[i+1]);
if (sizeOfNBest < 1) {
println("N must be positive.");
System.exit(10);
}
}
// Output specs
else if (option.equals("-v")) {
verbosity = Integer.parseInt(args[i+1]);
if (verbosity < 0 || verbosity > 4) {
println("verbosity should be between 0 and 4");
System.exit(10);
}
} else if (option.equals("-decV")) {
decVerbosity = Integer.parseInt(args[i+1]);
if (decVerbosity < 0 || decVerbosity > 1) {
println("decVerbosity should be either 0 or 1");
System.exit(10);
}
} else if (option.equals("-fake")) {
fakeFileNameTemplate = args[i+1];
int QM_i = fakeFileNameTemplate.indexOf("?");
if (QM_i <= 0) {
println("fakeFileNameTemplate must contain '?' to indicate position of iteration number");
System.exit(10);
}
fakeFileNamePrefix = fakeFileNameTemplate.substring(0,QM_i);
fakeFileNameSuffix = fakeFileNameTemplate.substring(QM_i+1);
} else if (option.equals("-damianos")) {
damianos_method = Integer.parseInt(args[i+1]);
if (damianos_method < 0 || damianos_method > 3) {
println("damianos_method should be between 0 and 3");
System.exit(10);
}
damianos_param = Double.parseDouble(args[i+2]);
damianos_mult = Double.parseDouble(args[i+3]);
i += 2;
} else {
println("Unknown option " + option);
System.exit(10);
}
i += 2;
} // while (i)
if (maxMERTIterations < minMERTIterations) {
if (firstTime)
println("Warning: maxMERTIts is smaller than minMERTIts; "
+ "decreasing minMERTIts from " + minMERTIterations + " to maxMERTIts "
+ "(i.e. " + maxMERTIterations + ").",1);
minMERTIterations = maxMERTIterations;
}
if (dirPrefix != null) { // append dirPrefix to file names
refFileName = fullPath(dirPrefix,refFileName);
decoderOutFileName = fullPath(dirPrefix,decoderOutFileName);
paramsFileName = fullPath(dirPrefix,paramsFileName);
decoderConfigFileName = fullPath(dirPrefix,decoderConfigFileName);
if (sourceFileName != null) { sourceFileName = fullPath(dirPrefix,sourceFileName); }
if (docInfoFileName != null) { docInfoFileName = fullPath(dirPrefix,docInfoFileName); }
if (finalLambdaFileName != null) { finalLambdaFileName = fullPath(dirPrefix,finalLambdaFileName); }
if (decoderCommandFileName != null) { decoderCommandFileName = fullPath(dirPrefix,decoderCommandFileName); }
if (fakeFileNamePrefix != null) { fakeFileNamePrefix = fullPath(dirPrefix,fakeFileNamePrefix); }
}
// TODO: make this an argument
// TODO: also use this for the state file? could be tricky, since that file is created by ZMERT.java
// TODO: change name from tmpDirPrefix to tmpFilePrefix?
int k = decoderOutFileName.lastIndexOf("/");
if (k >= 0) {
tmpDirPrefix = decoderOutFileName.substring(0,k+1) + "ZMERT.";
} else {
tmpDirPrefix = "ZMERT.";
}
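// e.g. (hypothetical paths) a decoderOutFileName of "MERT_example/nbest.out" yields
// tmpDirPrefix "MERT_example/ZMERT.", while a bare "nbest.out" yields just "ZMERT."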
println("tmpDirPrefix: " + tmpDirPrefix);
checkFile(paramsFileName);
checkFile(decoderConfigFileName);
boolean canRunCommand = fileExists(decoderCommandFileName);
if (decoderCommandFileName != null && !canRunCommand) {
// i.e. a decoder command file was specified, but it was not found
if (firstTime)
println("Warning: specified decoder command file "
+ decoderCommandFileName + " was not found.",1);
}
boolean canRunJoshua = fileExists(sourceFileName);
if (sourceFileName != null && !canRunJoshua) {
// i.e. a source file was specified, but it was not found
if (firstTime)
println("Warning: specified source file "
+ sourceFileName + " was not found.",1);
}
boolean canRunFake = (fakeFileNameTemplate != null);
if (!canRunCommand && !canRunJoshua) { // can only run fake decoder
if (!canRunFake) {
println("Z-MERT cannot decode; must provide one of: command file (for external decoder),");
println(" source file (for Joshua decoder),");
println(" or prefix for existing output files (for fake decoder).");
System.exit(12);
}
int lastGoodIt = 0;
for (int it = 1; it <= maxMERTIterations; ++it) {
if (fileExists(fakeFileNamePrefix+it+fakeFileNameSuffix)) {
lastGoodIt = it;
} else {
break; // from for (it) loop
}
}
if (lastGoodIt == 0) {
println("Fake decoder cannot find first output file " + (fakeFileNamePrefix+1+fakeFileNameSuffix));
System.exit(13);
} else if (lastGoodIt < maxMERTIterations) {
if (firstTime)
println("Warning: can only run fake decoder; existing output files "
+ "are only available for the first " + lastGoodIt + " iteration(s).",1);
}
}
if (refsPerSen > 1) {
// the provided refFileName might be a prefix
File dummy = new File(refFileName);
if (!dummy.exists()) {
refFileName = createUnifiedRefFile(refFileName,refsPerSen);
}
} else {
checkFile(refFileName);
}
if (firstTime) {
println("Processed the following args array:",1);
print(" ",1);
for (i = 0; i < args.length; ++i) {
print(args[i] + " ",1);
}
println("",1);
println("",1);
}
} // processArgs(String[] args)
private void set_docSubsetInfo(int[] info)
{
/*
1: -docSet bottom 8d                          the bottom 8 documents
2: -docSet bottom 25%                         the bottom ceil(0.25*numDocs) documents
3: -docSet top 8d                             the top 8 documents
4: -docSet top 25%                            the top ceil(0.25*numDocs) documents
5: -docSet window 11d around 90percentile     11 docs centered around the 90th percentile
(complain if not enough docs; don't adjust)
6: -docSet window 11d around 40rank           11 docs centered around the doc ranked 40
(complain if not enough docs; don't adjust)
[0]: method (0-6)
[1]: first (1-indexed)
[2]: last (1-indexed)
[3]: size
[4]: center
[5]: arg1 (0 for method 0)
[6]: arg2 (0 for methods 0-4)
*/
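// Worked (hypothetical) example: "-docSet window 11d around 90percentile" with
// numDocuments = 100 arrives here as info[0]=5, info[5]=11, info[6]=90, and becomes:
//   info[3] = 11                        (size)
//   info[4] = floor(0.90 * 100) = 90    (center)
//   info[1] = 90 - 5 = 85               (first)
//   info[2] = 90 + 5 = 95               (last)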
if (info[0] == 0) { // all
info[1] = 1;
info[2] = numDocuments;
info[3] = numDocuments;
info[4] = (info[1] + info[2]) / 2;
} else if (info[0] == 1) { // bottom d
info[3] = info[5];
info[2] = numDocuments;
info[1] = numDocuments - info[3] + 1;
info[4] = (info[1] + info[2]) / 2;
} else if (info[0] == 2) { // bottom p
info[3] = (int)(Math.ceil((info[5]/100.0) * numDocuments));
info[2] = numDocuments;
info[1] = numDocuments - info[3] + 1;
info[4] = (info[1] + info[2]) / 2;
} else if (info[0] == 3) { // top d
info[3] = info[5];
info[1] = 1;
info[2] = info[3];
info[4] = (info[1] + info[2]) / 2;
} else if (info[0] == 4) { // top p
info[3] = (int)(Math.ceil((info[5]/100.0) * numDocuments));
info[1] = 1;
info[2] = info[3];
info[4] = (info[1] + info[2]) / 2;
} else if (info[0] == 5) { // window around percentile
info[3] = info[5];
info[4] = (int)(Math.floor((info[6]/100.0) * numDocuments));
info[1] = info[4] - ((info[3]-1) / 2);
info[2] = info[4] + ((info[3]-1) / 2);
} else if (info[0] == 6) { // window around rank
info[3] = info[5];
info[4] = info[6];
info[1] = info[4] - ((info[3]-1) / 2);
info[2] = info[4] + ((info[3]-1) / 2);
}
}
private void checkFile(String fileName)
{
if (!fileExists(fileName)) {
println("The file " + fileName + " was not found!");
System.exit(40);
}
}
private boolean fileExists(String fileName)
{
if (fileName == null) return false;
File checker = new File(fileName);
return checker.exists();
}
private void gzipFile(String inputFileName)
{
gzipFile(inputFileName, inputFileName + ".gz");
}
private void gzipFile(String inputFileName, String gzippedFileName)
{
// NOTE: this will delete the original file
try {
FileInputStream in = new FileInputStream(inputFileName);
GZIPOutputStream out = new GZIPOutputStream(new FileOutputStream(gzippedFileName));
byte[] buffer = new byte[4096];
int len;
while ((len = in.read(buffer)) > 0) {
out.write(buffer, 0, len);
}
in.close();
out.finish();
out.close();
deleteFile(inputFileName);
} catch (IOException e) {
System.err.println("IOException in MertCore.gzipFile(String,String): " + e.getMessage());
System.exit(99902);
}
}
private void gunzipFile(String gzippedFileName)
{
if (gzippedFileName.endsWith(".gz")) {
gunzipFile(gzippedFileName, gzippedFileName.substring(0,gzippedFileName.length()-3));
} else {
gunzipFile(gzippedFileName, gzippedFileName + ".dec");
}
}
private void gunzipFile(String gzippedFileName, String outputFileName)
{
// NOTE: this will delete the original file
try {
GZIPInputStream in = new GZIPInputStream(new FileInputStream(gzippedFileName));
FileOutputStream out = new FileOutputStream(outputFileName);
byte[] buffer = new byte[4096];
int len;
while ((len = in.read(buffer)) > 0) {
out.write(buffer, 0, len);
}
in.close();
out.close();
deleteFile(gzippedFileName);
} catch (IOException e) {
System.err.println("IOException in MertCore.gunzipFile(String,String): " + e.getMessage());
System.exit(99902);
}
}
private String createUnifiedRefFile(String prefix, int numFiles)
{
if (numFiles < 2) {
println("Warning: createUnifiedRefFile called with numFiles = " + numFiles + "; "
+ "doing nothing.",1);
return prefix;
} else {
File checker;
checker = new File(prefix+"1");
if (!checker.exists()) {
checker = new File(prefix+".1");
if (!checker.exists()) {
println("Can't find reference files.");
System.exit(50);
} else {
prefix = prefix + ".";
}
}
String outFileName;
if (prefix.endsWith(".")) { outFileName = prefix+"all"; }
else { outFileName = prefix+".all"; }
try {
PrintWriter outFile = new PrintWriter(outFileName);
BufferedReader[] inFile = new BufferedReader[numFiles];
int nextIndex;
checker = new File(prefix+"0");
if (checker.exists()) { nextIndex = 0; }
else { nextIndex = 1; }
int lineCount = countLines(prefix+nextIndex);
for (int r = 0; r < numFiles; ++r) {
if (countLines(prefix+nextIndex) != lineCount) {
println("Line count mismatch in " + (prefix+nextIndex) + ".");
System.exit(60);
}
InputStream inStream = new FileInputStream(new File(prefix+nextIndex));
inFile[r] = new BufferedReader(new InputStreamReader(inStream, "utf8"));
++nextIndex;
}
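// interleave: for each source sentence i, write its numFiles reference
// translations on consecutive lines of the unified file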
String line;
for (int i = 0; i < lineCount; ++i) {
for (int r = 0; r < numFiles; ++r) {
line = inFile[r].readLine();
outFile.println(line);
}
}
outFile.close();
for (int r = 0; r < numFiles; ++r) { inFile[r].close(); }
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.createUnifiedRefFile(String,int): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.createUnifiedRefFile(String,int): " + e.getMessage());
System.exit(99902);
}
return outFileName;
}
} // createUnifiedRefFile(String prefix, int numFiles)
private String normalize(String str, int normMethod)
{
if (normMethod == 0) return str;
// replace HTML/SGML
str = str.replaceAll("&quot;","\"");
str = str.replaceAll("&amp;","&");
str = str.replaceAll("&lt;","<");
str = str.replaceAll("&gt;",">");
str = str.replaceAll("&apos;","'");
// split on these characters:
// ! " # $ % & ( ) * + / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
// i.e. ASCII 33-126, except alphanumeric, and except "," "-" "." "'"
// ! "# $%& ( ) * +/:;<=> ?@ [ \ ] ^_` { | }~
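// Illustrative (hypothetical) behavior of method 1 on a short string:
//   normalize("back-to-back (2,500 plays)", 1)  ->  "back-to-back ( 2,500 plays )"
// i.e. '(' and ')' are padded with spaces, while the ',' between digits and the '-'
// between letters are left alone by the context-sensitive rules below.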
String split_on = "!\"#\\$%&\\(\\)\\*\\+/:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~";
// println("split_on: " + split_on);
for (int k = 0; k < split_on.length(); ++k) {
// for each split character, reprocess the string
String regex = "" + split_on.charAt(k);
if (regex.equals("\\")) {
++k;
regex += split_on.charAt(k);
}
str = str.replaceAll(regex," " + regex + " ");
}
// split on "." and "," and "-", conditioned on proper context
str = " " + str + " ";
str = str.replaceAll("\\s+"," ");
TreeSet<Integer> splitIndices = new TreeSet<Integer>();
for (int i = 0; i < str.length(); ++i) {
char ch = str.charAt(i);
if (ch == '.' || ch == ',') {
// split if either of the previous or next characters is a non-digit
char prev_ch = str.charAt(i-1);
char next_ch = str.charAt(i+1);
if (prev_ch < '0' || prev_ch > '9' || next_ch < '0' || next_ch > '9') {
splitIndices.add(i);
}
} else if (ch == '-') {
// split if preceded by a digit
char prev_ch = str.charAt(i-1);
if (prev_ch >= '0' && prev_ch <= '9') {
splitIndices.add(i);
}
}
}
String str0 = str;
str = "";
for (int i = 0; i < str0.length(); ++i) {
if (splitIndices.contains(i)) {
str += " " + str0.charAt(i) + " ";
} else {
str += str0.charAt(i);
}
}
// rejoin i'm, we're, *'s, won't, don't, etc
str = " " + str + " ";
str = str.replaceAll("\\s+"," ");
str = str.replaceAll(" i 'm "," i'm ");
str = str.replaceAll(" we 're "," we're ");
str = str.replaceAll(" 's ","'s ");
str = str.replaceAll(" 've ","'ve ");
str = str.replaceAll(" 'll ","'ll ");
str = str.replaceAll(" 'd ","'d ");
str = str.replaceAll(" n't ","n't ");
// remove spaces around dashes
if (normMethod == 2 || normMethod == 4) {
TreeSet<Integer> skipIndices = new TreeSet<Integer>();
str = " " + str + " ";
for (int i = 0; i < str.length(); ++i) {
char ch = str.charAt(i);
if (ch == '-') {
// rejoin if surrounded by spaces, and then letters
if (str.charAt(i-1) == ' ' && str.charAt(i+1) == ' ') {
if (Character.isLetter(str.charAt(i-2)) && Character.isLetter(str.charAt(i+2))) {
skipIndices.add(i-1);
skipIndices.add(i+1);
}
}
}
}
str0 = str;
str = "";
for (int i = 0; i < str0.length(); ++i) {
if (!skipIndices.contains(i)) {
str += str0.charAt(i);
}
}
}
// drop non-ASCII characters
if (normMethod == 3 || normMethod == 4) {
str0 = str;
str = "";
for (int i = 0; i < str0.length(); ++i) {
char ch = str0.charAt(i);
if (ch <= 127) { // i.e. if ASCII
str += ch;
}
}
}
str = str.replaceAll("\\s+"," ");
str = str.trim();
return str;
}
private int countLines(String fileName)
{
int count = 0;
try {
BufferedReader inFile = new BufferedReader(new FileReader(fileName));
String line;
do {
line = inFile.readLine();
if (line != null) ++count;
} while (line != null);
inFile.close();
} catch (IOException e) {
System.err.println("IOException in MertCore.countLines(String): " + e.getMessage());
System.exit(99902);
}
return count;
}
private int countNonEmptyLines(String fileName)
{
int count = 0;
try {
BufferedReader inFile = new BufferedReader(new FileReader(fileName));
String line;
do {
line = inFile.readLine();
if (line != null && line.length() > 0) ++count;
} while (line != null);
inFile.close();
} catch (IOException e) {
System.err.println("IOException in MertCore.countNonEmptyLines(String): " + e.getMessage());
System.exit(99902);
}
return count;
}
private String fullPath(String dir, String fileName)
{
File dummyFile = new File(dir,fileName);
return dummyFile.getAbsolutePath();
}
@SuppressWarnings("unused")
private void cleanupMemory()
{
cleanupMemory(100,false);
}
@SuppressWarnings("unused")
private void cleanupMemorySilently()
{
cleanupMemory(100,true);
}
@SuppressWarnings("static-access")
private void cleanupMemory(int reps, boolean silent)
{
int bytesPerMB = 1024 * 1024;
long totalMemBefore = myRuntime.totalMemory();
long freeMemBefore = myRuntime.freeMemory();
long usedMemBefore = totalMemBefore - freeMemBefore;
long usedCurr = usedMemBefore; long usedPrev = usedCurr;
// perform garbage collection repeatedly, until there is no decrease in
// the amount of used memory
for (int i = 1; i <= reps; ++i) {
myRuntime.runFinalization();
myRuntime.gc();
(Thread.currentThread()).yield();
usedPrev = usedCurr;
usedCurr = myRuntime.totalMemory() - myRuntime.freeMemory();
if (usedCurr == usedPrev) break;
}
if (!silent) {
long totalMemAfter = myRuntime.totalMemory();
long freeMemAfter = myRuntime.freeMemory();
long usedMemAfter = totalMemAfter - freeMemAfter;
println("GC: d_used = " + ((usedMemAfter - usedMemBefore) / bytesPerMB) + " MB "
+ "(d_tot = " + ((totalMemAfter - totalMemBefore) / bytesPerMB) + " MB).",2);
}
}
@SuppressWarnings("unused")
private void printMemoryUsage()
{
int bytesPerMB = 1024 * 1024;
long totalMem = myRuntime.totalMemory();
long freeMem = myRuntime.freeMemory();
long usedMem = totalMem - freeMem;
println("Allocated memory: " + (totalMem / bytesPerMB) + " MB "
+ "(of which " + (usedMem / bytesPerMB) + " MB is being used).",2);
}
private void println(Object obj, int priority) { if (priority <= verbosity) println(obj); }
private void print(Object obj, int priority) { if (priority <= verbosity) print(obj); }
private void println(Object obj) { System.out.println(obj); }
private void print(Object obj) { System.out.print(obj); }
private void showProgress()
{
++progress;
if (progress % 100000 == 0) print(".",2);
}
private double[] randomLambda()
{
double[] retLambda = new double[1+numParams];
for (int c = 1; c <= numParams; ++c) {
if (isOptimizable[c]) {
double randVal = randGen.nextDouble(); // number in [0.0,1.0)
++generatedRands;
randVal = randVal * (maxRandValue[c] - minRandValue[c]); // number in [0.0,max-min]
randVal = minRandValue[c] + randVal; // number in [min,max]
retLambda[c] = randVal;
} else {
retLambda[c] = defaultLambda[c];
}
}
return retLambda;
}
private double[] randomPerturbation(double[] origLambda, int i, double method, double param, double mult)
{
double sigma = 0.0;
if (method == 1) {
sigma = 1.0/Math.pow(i,param);
} else if (method == 2) {
sigma = Math.exp(-param*i);
} else if (method == 3) {
sigma = Math.max(0.0 , 1.0 - (i/param));
}
sigma = mult*sigma;
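// Worked (hypothetical) example: method 1, param 0.5, mult 2.0 at iteration i = 4:
//   sigma = 2.0 * (1 / 4^0.5) = 1.0,
// so below, each optimizable lambda[c] is moved to a random value in
// [origLambda[c]*(1-sigma) , origLambda[c]*(1+sigma)] = [0 , 2*origLambda[c]].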
double[] retLambda = new double[1+numParams];
for (int c = 1; c <= numParams; ++c) {
if (isOptimizable[c]) {
double randVal = 2*randGen.nextDouble() - 1.0; // number in [-1.0,1.0]
++generatedRands;
randVal = randVal * sigma; // number in [-sigma,sigma]
randVal = randVal * origLambda[c]; // number in [-sigma*orig[c],sigma*orig[c]]
randVal = randVal + origLambda[c]; // number in [orig[c]-sigma*orig[c],orig[c]+sigma*orig[c]]
// = [orig[c]*(1-sigma),orig[c]*(1+sigma)]
retLambda[c] = randVal;
} else {
retLambda[c] = origLambda[c];
}
}
return retLambda;
}
private int c_fromParamName (String pName)
{
for (int c = 1; c <= numParams; ++c) {
if (paramNames[c].equals(pName)) return c;
}
return 0; // no parameter with that name!
}
private void setFeats(
double[][][] featVal_array, int i, int[] lastUsedIndex,
int[] maxIndex, double[] featVal)
{
int k = lastUsedIndex[i] + 1;
if (k > maxIndex[i]) {
for (int c = 1; c <= numParams; ++c) {
double[] temp = featVal_array[c][i];
featVal_array[c][i] = new double[1+maxIndex[i]+sizeOfNBest];
for (int k2 = 0; k2 <= maxIndex[i]; ++k2) {
featVal_array[c][i][k2] = temp[k2];
}
}
maxIndex[i] += sizeOfNBest;
// cleanupMemorySilently(); // UNCOMMENT THIS if cleaning up memory
}
for (int c = 1; c <= numParams; ++c) {
featVal_array[c][i][k] = featVal[c];
}
lastUsedIndex[i] += 1;
}
@SuppressWarnings("unused")
private HashSet<Integer> indicesToDiscard(double[] slope, double[] offset)
{
// some lines can be eliminated: the ones that have a lower offset
// than some other line with the same slope.
// That is, for any k1 and k2:
// if slope[k1] = slope[k2] and offset[k1] > offset[k2],
// then k2 can be eliminated.
// (This is actually important to do as it eliminates a bug.)
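// Worked (hypothetical) example: slope = {2.0, 2.0, 3.0}, offset = {5.0, 7.0, 1.0}:
// candidates 0 and 1 share slope 2.0 and candidate 0 has the smaller offset, so the
// returned set is {0} and only candidates 1 and 2 survive.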
// print("discarding: ",4);
int numCandidates = slope.length;
HashSet<Integer> discardedIndices = new HashSet<Integer>();
HashMap<Double,Integer> indicesOfSlopes = new HashMap<Double,Integer>();
// maps slope to index of best candidate that has that slope.
// ("best" as in the one with the highest offset)
for (int k1 = 0; k1 < numCandidates; ++k1) {
double currSlope = slope[k1];
if (!indicesOfSlopes.containsKey(currSlope)) {
indicesOfSlopes.put(currSlope,k1);
} else {
int existingIndex = indicesOfSlopes.get(currSlope);
if (offset[existingIndex] > offset[k1]) {
discardedIndices.add(k1);
// print(k1 + " ",4);
} else if (offset[k1] > offset[existingIndex]) {
indicesOfSlopes.put(currSlope,k1);
discardedIndices.add(existingIndex);
// print(existingIndex + " ",4);
}
}
}
// old way of doing it; takes quadratic time (vs. linear time above)
/*
for (int k1 = 0; k1 < numCandidates; ++k1) {
for (int k2 = 0; k2 < numCandidates; ++k2) {
if (k1 != k2 && slope[k1] == slope[k2] && offset[k1] > offset[k2]) {
discardedIndices.add(k2);
// print(k2 + " ",4);
}
}
}
*/
// println("",4);
return discardedIndices;
} // indicesToDiscard(double[] slope, double[] offset)
public static void main(String[] args)
{
MertCore DMC = new MertCore(); // dummy MertCore object
// if bad args[], System.exit(80)
String configFileName = args[0];
String stateFileName = args[1];
int currIteration = Integer.parseInt(args[2]);
int randsToSkip = 0;
int earlyStop = 0;
double FINAL_score = 0.0;
int[] maxIndex = null;
if (currIteration == 1) {
EvaluationMetric.set_knownMetrics();
DMC.processArgsArray(DMC.cfgFileToArgsArray(configFileName),true);
randsToSkip = 0;
DMC.initialize(randsToSkip);
DMC.println("----------------------------------------------------",1);
DMC.println("Z-MERT run started @ " + (new Date()),1);
// DMC.printMemoryUsage();
DMC.println("----------------------------------------------------",1);
DMC.println("",1);
if (DMC.randInit) {
DMC.println("Initializing lambda[] randomly.",1);
// initialize optimizable parameters randomly (sampling uniformly from
// that parameter's random value range)
DMC.lambda = DMC.randomLambda();
}
DMC.println("Initial lambda[]: " + DMC.lambdaToString(DMC.lambda),1);
DMC.println("",1);
FINAL_score = DMC.evalMetric.worstPossibleScore();
maxIndex = new int[DMC.numSentences];
for (int i = 0; i < DMC.numSentences; ++i) { maxIndex[i] = DMC.sizeOfNBest - 1; }
earlyStop = 0;
} else {
EvaluationMetric.set_knownMetrics();
DMC.processArgsArray(DMC.cfgFileToArgsArray(configFileName),false);
double[] serA = null;
try {
ObjectInputStream in = new ObjectInputStream(new FileInputStream(stateFileName));
serA = (double[])in.readObject();
in.close();
// contents of serA[]:
// (*) last iteration
// (*) number of random numbers generated already
// (*) earlyStop
// (*) FINAL_score
// (*) lambda[]
// (*) maxIndex[]
// => length should be 4+numParams+numSentences
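// Illustrative (hypothetical) layout for numParams = 3 and numSentences = 2
// (length 4 + 3 + 2 = 9):
//   serA[0]    last iteration
//   serA[1]    number of random numbers generated so far
//   serA[2]    earlyStop
//   serA[3]    FINAL_score
//   serA[4..6] lambda[1..3]
//   serA[7..8] maxIndex[0..1]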
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.main(String[]): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.main(String[]): " + e.getMessage());
System.exit(99902);
} catch (ClassNotFoundException e) {
System.err.println("ClassNotFoundException in MertCore.main(String[]): " + e.getMessage());
System.exit(99904);
}
if (serA.length < 2) {
DMC.println("State file contains an array of length " + serA.length + "; "
+ "was expecting at least 2");
System.exit(81);
}
if ((int)serA[0] != currIteration-1) {
DMC.println("Iteration in state file is " + (int)serA[0] + "; "
+ "was expecting " + (currIteration-1));
System.exit(82);
}
randsToSkip = (int)serA[1];
DMC.initialize(randsToSkip); // declares lambda[], sets numParams and numSentences
if (serA.length != 4+DMC.numParams+DMC.numSentences) {
DMC.println("State file contains an array of length " + serA.length + "; "
+ "was expecting " + (4+DMC.numParams+DMC.numSentences));
System.exit(83);
}
earlyStop = (int)serA[2];
FINAL_score = serA[3];
for (int c = 1; c <= DMC.numParams; ++c) { DMC.lambda[c] = serA[3+c]; }
maxIndex = new int[DMC.numSentences];
for (int i = 0; i < DMC.numSentences; ++i) { maxIndex[i] = (int)serA[3+DMC.numParams+1+i]; }
}
double[] A = DMC.run_single_iteration(currIteration, DMC.minMERTIterations,
DMC.maxMERTIterations, DMC.prevMERTIterations, earlyStop, maxIndex);
if (A != null) {
FINAL_score = A[0];
earlyStop = (int)A[1];
randsToSkip = DMC.generatedRands;
}
if (A != null && A[2] != 1) {
double[] serA = new double[4+DMC.numParams+DMC.numSentences];
serA[0] = currIteration;
serA[1] = randsToSkip;
serA[2] = earlyStop;
serA[3] = FINAL_score;
for (int c = 1; c <= DMC.numParams; ++c) { serA[3+c] = DMC.lambda[c]; }
for (int i = 0; i < DMC.numSentences; ++i) { serA[3+DMC.numParams+1+i] = maxIndex[i]; }
try {
ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(stateFileName));
out.writeObject(serA);
out.flush();
out.close();
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MertCore.main(String[]): " + e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MertCore.main(String[]): " + e.getMessage());
System.exit(99902);
}
System.exit(91);
} else {
// done
DMC.println("",1);
DMC.println("----------------------------------------------------",1);
DMC.println("Z-MERT run ended @ " + (new Date()),1);
// DMC.printMemoryUsage();
DMC.println("----------------------------------------------------",1);
DMC.println("",1);
DMC.println("FINAL lambda: " + DMC.lambdaToString(DMC.lambda)
+ " (" + DMC.metricName_display + ": " + FINAL_score + ")",1);
// check if a lambda is outside its threshold range
for (int c = 1; c <= DMC.numParams; ++c) {
if (DMC.lambda[c] < DMC.minThValue[c] || DMC.lambda[c] > DMC.maxThValue[c]) {
DMC.println("Warning: after normalization, lambda[" + c + "]=" + f4.format(DMC.lambda[c])
+ " is outside its critical value range.",1);
}
}
DMC.println("",1);
// delete intermediate .temp.*.it* decoder output files
for (int iteration = 1; iteration <= DMC.maxMERTIterations; ++iteration) {
if (DMC.compressFiles == 1) {
DMC.deleteFile(DMC.tmpDirPrefix+"temp.sents.it"+iteration+".gz");
DMC.deleteFile(DMC.tmpDirPrefix+"temp.feats.it"+iteration+".gz");
if (DMC.fileExists(DMC.tmpDirPrefix+"temp.stats.it"+iteration+".copy.gz")) {
DMC.deleteFile(DMC.tmpDirPrefix+"temp.stats.it"+iteration+".copy.gz");
} else {
DMC.deleteFile(DMC.tmpDirPrefix+"temp.stats.it"+iteration+".gz");
}
} else {
DMC.deleteFile(DMC.tmpDirPrefix+"temp.sents.it"+iteration);
DMC.deleteFile(DMC.tmpDirPrefix+"temp.feats.it"+iteration);
if (DMC.fileExists(DMC.tmpDirPrefix+"temp.stats.it"+iteration+".copy")) {
DMC.deleteFile(DMC.tmpDirPrefix+"temp.stats.it"+iteration+".copy");
} else {
DMC.deleteFile(DMC.tmpDirPrefix+"temp.stats.it"+iteration);
}
}
}
DMC.finish();
DMC.deleteFile(stateFileName);
System.exit(90);
}
}
}
// based on:
// http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html?page=4
class StreamGobbler extends Thread {
InputStream istream;
boolean verbose;
StreamGobbler(InputStream is, int p) {
istream = is;
verbose = (p != 0);
}
public void run() {
try {
InputStreamReader isreader = new InputStreamReader(istream);
BufferedReader br = new BufferedReader(isreader);
String line = null;
while ((line = br.readLine()) != null) {
if (verbose) System.out.println(line);
}
} catch (IOException ioe) {
ioe.printStackTrace();
}
}
}
/*
fake:
-----
ex2_N300:
java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s src.txt -r ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_ex2.out -N 300 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 -fake nbest_ex2.out.N300.it > ex2_N300ipi20opi0_300max+defratios.it10.noMemRep.bugFixes.monitored.txt
ex2_N500:
java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s src.txt -r ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_ex2.out -N 500 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 -fake nbest_ex2.out.N500.it > ex2_N500ipi20opi0_300max+defratios.it05.noMemRep.bugFixes.monitored.txt
exL_N300__600max:
java -javaagent:shiftone-jrat.jar -Xmx600m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s mt06_source.txt -r mt06_ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_exL.out -N 300 -p params.txt -maxIt 5 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 -fake nbest_exL.out.it > exL_N300ipi20opi0_600max+defratios.it05.noMemRep.bugFixes.monitored.txt
exL_N300__300max:
java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s mt06_source.txt -r mt06_ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_exL.out -N 300 -p params.txt -maxIt 5 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 -fake nbest_exL.out.it > exL_N300ipi20opi0_300max+defratios.it05.noMemRep.bugFixes.monitored.txt
gen:
----
ex2_N300:
make sure top_n=300 in MERT_example\config_ex2.txt
java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s src.txt -r ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_ex2.out -N 300 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > ex2_N300ipi20opi0_300max+defratios.itxx.monitored.txt.gen
ex2_N500:
make sure top_n=500 in MERT_example\config_ex2.txt
java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s src.txt -r ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_ex2.out -N 500 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > ex2_N500ipi20opi0_300max+defratios.itxx.monitored.txt.gen
exL_N300__600max:
run on CLSP machines only! (e.g. z12)
$JAVA_bin/java -javaagent:shiftone-jrat.jar -Xmx600m -cp bin joshua.ZMERT.ZMERT -dir YOURDIR -s mt06_source.txt -r mt06_ref.all -rps 4 -cmd decoder_command.txt -dcfg config_exL.txt -decOut nbest_exL.out -N 300 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > exL_N300ipi20opi0_600max+defratios.itxx.monitored.txt.gen
exL_N300__300max:
run on CLSP machines only! (e.g. z12)
$JAVA_bin/java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir YOURDIR -s mt06_source.txt -r mt06_ref.all -rps 4 -cmd decoder_command.txt -dcfg config_exL.txt -decOut nbest_exL.out -N 300 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > exL_N300ipi20opi0_600max+defratios.itxx.monitored.txt.gen
*/