/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.discriminative.monolingual_parser; import java.io.BufferedReader; import java.io.IOException; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import joshua.corpus.vocab.SymbolTable; import joshua.decoder.JoshuaConfiguration; import joshua.decoder.chart_parser.Chart; import joshua.decoder.ff.FeatureFunction; import joshua.decoder.ff.tm.Grammar; import joshua.decoder.ff.tm.GrammarFactory; import joshua.decoder.hypergraph.HyperGraph; import joshua.lattice.Lattice; import joshua.util.FileUtility; /** * this class implements: * (1) interact with the chart-parsing functions to do the true decoding * * * @author Zhifei Li, <zhifei.work@gmail.com> * @version $LastChangedDate: 2008-10-20 00:12:30 -0400 $ */ //TODO: known synchronization problem: LM cache; srilm call; public abstract class MonolingualDecoderThread extends Thread { //these variables may be the same across all threads (e.g., just copy from DecoderFactory), or differ from thread to thread private final GrammarFactory[] grammarFactories;// = null; private final boolean haveLMModel;// = false; protected final List<FeatureFunction> featFunctions;// = null; private final List<Integer> defaultNonterminals;// = null; protected final SymbolTable symbolTable;// = null; //more test set specific private final String testFile; private final int startSentID; //start sent id private BufferedReader testReader; private static final Logger logger = Logger.getLogger(MonolingualDecoderThread.class.getName()); public MonolingualDecoderThread(GrammarFactory[] grammarFactories, boolean haveLMModel, List<FeatureFunction> featFunctions, List<Integer> defaultNonterminals, SymbolTable symbolTable, String testFile, int startSentID) throws IOException { this.grammarFactories = grammarFactories; this.haveLMModel = haveLMModel; this.featFunctions = featFunctions; this.defaultNonterminals = defaultNonterminals; this.symbolTable = symbolTable; this.testFile = testFile; this.startSentID = startSentID; this.testReader = FileUtility.getReadFileStream(testFile); } public abstract void postProcessHypergraph(HyperGraph hypergraph, int sentenceID) throws IOException; public abstract void postProcess() throws IOException; // DecoderThread.run() cannot throw anything public void run() { try { decodeFile(); } catch (IOException e) { e.printStackTrace(); System.exit(1); } } // TODO: log file is not properly handled for parallel decoding public void decodeFile() throws IOException { String cn_sent; int sent_id = startSentID; // if no sent tag, then this will be used while ((cn_sent = FileUtility.read_line_lzf(testReader)) != null) { if (logger.isLoggable(Level.FINE)) logger.fine("now translate\n" + cn_sent); int[] tem_id = new int[1]; cn_sent = get_sent_id(cn_sent, tem_id); if (tem_id[0] > 0) { sent_id = tem_id[0]; } translate( this.grammarFactories, this.featFunctions, cn_sent, this.defaultNonterminals, sent_id ); sent_id++; } testReader.close(); postProcess(); } /** * Translate a sentence. * * @param grammars Translation grammars to be used during translation. * @param models Models to be used when scoring rules. * @param sentence The sentence to be translated. * @param defaultNonterminals * @param sentenceID * @param topN * @param diskHyperGraph * @param kbestExtractor */ private void translate(GrammarFactory[] grammarFactories, List<FeatureFunction> models, String sentence, List<Integer> defaultNonterminals, int sentenceID ) throws IOException { long start = System.currentTimeMillis(); int[] sentence_numeric = this.symbolTable.getIDs(sentence); Integer[] input = new Integer[sentence_numeric.length]; for (int i = 0; i < sentence_numeric.length; i++) { input[i] = sentence_numeric[i]; } Lattice<Integer> inputLattice = new Lattice<Integer>(input); Grammar[] grammars = new Grammar[grammarFactories.length]; for (int i = 0; i < grammarFactories.length; i++) { grammars[i] = grammarFactories[i].getGrammarForSentence(null);//?????????????????????????????????????????????????????????????????????? // grammars[i].sortGrammar(models);//TODO: for batch grammar, we do not want to sort it every time } //==========================seeding: the chart only sees the grammars, not the grammarFactories Chart chart = new Chart( inputLattice, models, null, this.symbolTable, sentenceID, grammars, this.haveLMModel, JoshuaConfiguration.goal_symbol, null); if (logger.isLoggable(Level.FINER)) logger.finer("after seed, time: " + (System.currentTimeMillis() - start) / 1000); //=========================parsing HyperGraph p_hyper_graph = chart.expand(); if (logger.isLoggable(Level.FINER)) logger.finer("after expand, time: " + (System.currentTimeMillis() - start) / 1000); postProcessHypergraph(p_hyper_graph, sentenceID); } //return sent without the tag //if no sent id, then return -1 in sent_id[] private static String get_sent_id(String sent, int[] sent_id) { if (sent.matches("^<seg\\s+id=.*$")) { // havd sent id String res_sent = sent.replaceAll("^<seg\\s+id=\"", ""); String str_id = ""; for (int i = 0; i < res_sent.length(); i++) { char cur = res_sent.charAt(i); if (cur != '"') { str_id += cur; } else { break; } } int res_id = Integer.parseInt(str_id); res_sent = res_sent.replaceFirst(str_id + "\">", ""); res_sent = res_sent.replaceAll("</seg>", ""); sent_id[0] = res_id; return res_sent; } else { sent_id[0] = -1; return sent; } } }