/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.discriminative.monolingual_parser; import java.io.IOException; import java.util.HashMap; import java.util.logging.Level; import java.util.logging.Logger; import joshua.corpus.vocab.SymbolTable; import joshua.decoder.ff.tm.BatchGrammar; import joshua.decoder.ff.tm.GrammarReader; import joshua.decoder.ff.tm.MonolingualRule; import joshua.decoder.ff.tm.Rule; import joshua.decoder.ff.tm.Trie; import joshua.decoder.ff.tm.hiero.MemoryBasedRuleBin; import joshua.decoder.ff.tm.hiero.MemoryBasedTrie; /** * this class implements MemoryBasedBatchGrammar * * @author Zhifei Li, <zhifei.work@gmail.com> * @version $LastChangedDate: 2009-03-09 12:52:29 -0400 ( 2009) $ */ public class MonolingualGrammar extends BatchGrammar { /*TMGrammar is composed by Trie nodes Each trie node has: (1) RuleBin: a list of rules matching the french sides so far (2) a HashMap of next-layer trie nodes, the next french word used as the key in HashMap */ // =============================================================== // Instance Fields // =============================================================== protected int qtyRulesRead = 0; protected int qtyRuleBins = 0; protected MemoryBasedTrie root = null; boolean addFakeFeatScoreForEM = false;//if this grammar is for EM, we will add a fake feature score for each rule protected int defaultOwner; protected int defaultLHS; protected int goalSymbol; protected int spanLimit = 10; SymbolTable symbolTable = null; protected GrammarReader<MonolingualRule> modelReader; // =============================================================== // Static Fields // =============================================================== public static int OOV_RULE_ID = 0; private static final Logger logger = Logger.getLogger(MonolingualGrammar.class.getName()); static int ruleIDCount =1; //three kinds of rule: regular rule (id>0); oov rule (id=0), and null rule (id=-1) static protected double tem_estcost = 0.0;//debug public MonolingualGrammar(){ //do nothing } public MonolingualGrammar( String formatKeyword, SymbolTable psymbolTable, String grammarFile, String default_owner, String defaultLHSSymbol, String goalSymbol, int span_limit, boolean addFakeFeatScoreForEM_ ) throws IOException { this.symbolTable = psymbolTable; this.defaultOwner = symbolTable.addTerminal(default_owner); this.defaultLHS = this.symbolTable.addNonterminal(defaultLHSSymbol); this.goalSymbol = this.symbolTable.addNonterminal(goalSymbol); this.spanLimit = span_limit; this.addFakeFeatScoreForEM = addFakeFeatScoreForEM_; this.root = new MemoryBasedTrie(); ////==== loading grammar this.modelReader = createReader(formatKeyword, grammarFile, symbolTable); if (modelReader != null) { modelReader.initialize(); for (MonolingualRule rule : modelReader) addRule(rule); } this.printGrammar(); } protected GrammarReader<MonolingualRule> createReader(String formatKeyword, String grammarFile, SymbolTable symbolTable) { if ("monolingual".equals(formatKeyword)) { return new MonolingualGrammarReader(grammarFile, symbolTable, addFakeFeatScoreForEM); } else { logger.severe("wrong grammar formatKeyword: " + formatKeyword); return null; } } // =============================================================== // Methods // =============================================================== public int getNumRules() { return qtyRulesRead; } public Rule constructOOVRule(int num_feats, int sourceWord, int targetWord, boolean have_lm_model) { int[] p_french = new int[1]; p_french[0] = sourceWord; float[] feat_scores; if(addFakeFeatScoreForEM) feat_scores = new float[num_feats+1]; else feat_scores = new float[num_feats]; /**TODO * This is a hack to make the decoding without a LM works * */ if(have_lm_model==false){//no LM is used for decoding, so we should set the stateless cost //this.feat_scores[0]=100.0/((FeatureFunction)p_l_models.get(0)).getWeight();//TODO feat_scores[0]=100;//TODO } return new MonolingualRule(this.defaultLHS, p_french, feat_scores, 0, this.defaultOwner, 0, getOOVRuleID()); } public int getOOVRuleID() { return OOV_RULE_ID; } /** * if the span covered by the chart bin is greater than the limit, * then return false **/ // TODO: catch glue grammar case in glue grammar class? public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) { if (this.spanLimit == -1) { // mono-glue grammar return (startIndex == 0); } else { return (endIndex - startIndex <= this.spanLimit); } } public Trie getTrieRoot() { return this.root; } protected void addRule(MonolingualRule rule) { // TODO: Why two increments? this.qtyRulesRead++; ruleIDCount++; rule.setRuleID(ruleIDCount); rule.setOwner(defaultOwner); // TODO: make sure costs are calculated here or in reader tem_estcost += rule.getEstCost(); // identify the position, and insert the trie nodes as necessary MemoryBasedTrie pos = root; int[] p_french = rule.getFrench(); for (int k = 0; k < p_french.length; k++) { int cur_sym_id = p_french[k]; if (this.symbolTable.isNonterminal(p_french[k])) { cur_sym_id = modelReader.cleanNonTerminal(p_french[k]); } MemoryBasedTrie next_layer = pos.matchOne(cur_sym_id); if (null == next_layer) { next_layer = new MemoryBasedTrie(); if (pos.hasExtensions() == false) { pos.setExtensions( new HashMap<Integer, MemoryBasedTrie>() ); } pos.getExtensionsTable().put(cur_sym_id, next_layer); } pos = next_layer; } this.insertRule(pos, rule); } protected void insertRule(MemoryBasedTrie pos, MonolingualRule rule) { // add the rule into the trie node if (! pos.hasRules()) { pos.setRuleBin( new MemoryBasedRuleBin(rule.getArity(), rule.getFrench()) ); this.qtyRuleBins++; } ((MemoryBasedRuleBin)pos.getRules()).addRule(rule); } protected void printGrammar() { if (logger.isLoggable(Level.INFO)) { logger.info("###########Grammar###########"); logger.info(String.format("####num_rules: %d; num_bins: %d; num_pruned: %d; sumest_cost: %.5f", this.qtyRulesRead, this.qtyRuleBins, 0, tem_estcost)); } /*if(root!=null) root.print_info(Support.DEBUG);*/ } //====================== functions for EM training ========================== /**We use * the last field of featScores to store the posteriorProb collected during E step * the first field of featScores to store the normalized cost in the M step */ public static float incrementRulePosteriorProb(Rule rl, double posteriorProb){ return rl.incrementFeatureScore(rl.getFeatureScores().length-1, posteriorProb); } public static float getRulePosteriorProb(Rule rl){ return rl.getFeatureCost(rl.getFeatureScores().length-1); } public static void resetRulePosteriorProb(Rule rl){ rl.setFeatureCost(rl.getFeatureScores().length-1, 0); } public static float getRuleNormalizedCost(Rule rl){ return rl.getFeatureCost(0); } static float CEILING_COST = 100; public static void setRuleNormalizedCost(Rule rl, float prob){ float cost = (float) -Math.log(prob); if(cost>CEILING_COST) cost = CEILING_COST; rl.setFeatureCost(0, cost); } public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int aritity) { // TODO Auto-generated method stub return null; } }