/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.decoder.ff.lm.srilm; import java.util.logging.Logger; import joshua.corpus.vocab.SrilmSymbol; import joshua.decoder.ff.lm.AbstractLM; /** * * @author Zhifei Li, <zhifei.work@gmail.com> * @version $LastChangedDate: 2009-05-17 19:17:48 -0500 (Sun, 17 May 2009) $ */ public class LMGrammarSRILM extends AbstractLM { SWIGTYPE_p_Ngram p_srilm; private static final Logger logger = Logger.getLogger(LMGrammarSRILM.class.getName()); public LMGrammarSRILM(SrilmSymbol symbol, int order, String lm_file) { super(symbol, order); logger.info("using local SRILM for the language model"); //p_srilm = srilm.initLM(order_, p_symbol.getLMStartID(), p_symbol.getLMEndID() );//TODO p_srilm = symbol.getSrilmPointer(); read_lm_grammar_from_file(lm_file);//TODO: what about sentence-specific? } // read grammar locally by the Java implementation private void read_lm_grammar_from_file(String grammar_file) { long start_loading_time = System.currentTimeMillis(); logger.info("reading language model with SRILM tool"); srilm.readLM(p_srilm, grammar_file); logger.info("finished reading language model"); //logger.info("##### mem used (kb): " + Support.getMemoryUse()); logger.info("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000); } //note: when using the srilm C interfact, the srilm itself will NOT do the replacement to unk, so it will return a zero-prob for unknown word //however, if using the srilm in the command line, the srilm will do the replacement to unk //since we have trouble to run the replace_with_unk (because we do not know the vocabulary), we will let srilm return a zero-prob, and then replace with the ceiling cost /*note: the mismatch between srilm and our java implemtation is in: when unk words used as context, in java it will be replaced with "<unk>", but srilm will not, therefore the *lm cost by srilm may be smaller than by java, this happens only when the LM file have "<unk>" in backoff state*/ protected double ngramLogProbability_helper(int[] ngram_wrds, int order) { /*int[] ngram_wrds=replace_with_unk(ngram_wrds_in); if(ngram_wrds[ngram_wrds.length-1]==Symbol.UNK_SYM_ID)//TODO: wrong implementation in hiero return -Decoder.lm_ceiling_cost; //TODO: untranslated words*/ int hist_size = ngram_wrds.length-1; double res = 0.0; SWIGTYPE_p_unsigned_int hist; //TODO in principle, there should not have bad left-side state symbols, though need to check hist = srilm.new_unsigned_array(hist_size); for (int i = 0; i < hist_size; i++) { srilm.unsigned_array_setitem(hist, i, ngram_wrds[i]); } res = srilm.getProb_lzf(p_srilm, hist, hist_size, ngram_wrds[hist_size]); srilm.delete_unsigned_array(hist); return res; } protected double logProbabilityOfBackoffState_helper( int[] ngram, int order, int qtyAdditionalBackoffWeight ) { throw new UnsupportedOperationException("probabilityOfBackoffState_helper undefined for srilm"); } }