/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.decoder.ff.tm.hiero; import joshua.decoder.ff.tm.BatchGrammar; import joshua.decoder.ff.tm.Rule; import joshua.decoder.ff.tm.BilingualRule; import joshua.decoder.ff.tm.GrammarReader; import joshua.decoder.ff.tm.Trie; import joshua.corpus.vocab.SymbolTable; import java.io.IOException; import java.util.HashMap; import java.util.logging.Level; import java.util.logging.Logger; /** * This class implements a memory-based bilingual BatchGrammar. * <p> * The rules are stored in a trie. Each trie node has: * (1) RuleBin: a list of rules matching the french sides so far * (2) A HashMap of next-layer trie nodes, the next french word * used as the key in HashMap * * @author Zhifei Li, <zhifei.work@gmail.com> * @version $LastChangedDate: 2010-01-08 11:00:40 -0600 (Fri, 08 Jan 2010) $ */ public class MemoryBasedBatchGrammar extends BatchGrammar { //=============================================================== // Instance Fields //=============================================================== static private double temEstcost = 0.0; private int qtyRulesRead = 0; private int qtyRuleBins = 0; private MemoryBasedTrie root = null; //protected ArrayList<FeatureFunction> featureFunctions = null; private int defaultOwner; private float oovFeatureCost = 100; /** * the OOV rule should have this lhs, this should be grammar * specific as only the grammar knows what LHS symbol can * be combined with other rules */ private int defaultLHS; private int spanLimit = 10; private final SymbolTable symbolTable; private GrammarReader<BilingualRule> modelReader; //=============================================================== // Static Fields //=============================================================== public static final int OOV_RULE_ID = 0; /* Three kinds of rules: * regular rule (id>0) * oov rule (id=0) * null rule (id=-1) */ static int ruleIDCount = 1; /** Logger for this class. */ private static final Logger logger = Logger.getLogger(MemoryBasedBatchGrammar.class.getName()); //=============================================================== // Constructors //=============================================================== public MemoryBasedBatchGrammar() { symbolTable = null; } public MemoryBasedBatchGrammar( String formatKeyword, String grammarFile, SymbolTable symbolTable, String defaultOwner, String defaultLHSSymbol, int spanLimit, float oovFeatureCost_) throws IOException { this.symbolTable = symbolTable; this.defaultOwner = this.symbolTable.addTerminal(defaultOwner); this.defaultLHS = this.symbolTable.addNonterminal(defaultLHSSymbol); this.spanLimit = spanLimit; this.oovFeatureCost = oovFeatureCost_; this.root = new MemoryBasedTrie(); //==== loading grammar this.modelReader = createReader(formatKeyword, grammarFile, symbolTable); if (modelReader != null) { modelReader.initialize(); for (BilingualRule rule : modelReader) if (rule != null) addRule(rule); } else { if (logger.isLoggable(Level.WARNING)) logger.warning("Couldn't create a GrammarReader for file " + grammarFile + " with format " + formatKeyword); } this.printGrammar(); } protected GrammarReader<BilingualRule> createReader(String formatKeyword, String grammarFile, SymbolTable symbolTable){ if ("hiero".equals(formatKeyword)) { return new HieroFormatReader(grammarFile, symbolTable); } else if ("samt".equals(formatKeyword)) { return new SamtFormatReader(grammarFile, symbolTable); } else { // TODO: throw something? // TODO: add special warning if "heiro" mispelling is used if (logger.isLoggable(Level.WARNING)) logger.warning("Unknown GrammarReader format " + formatKeyword); return null; } } //=============================================================== // Methods //=============================================================== public int getNumRules() { return this.qtyRulesRead; } public Rule constructOOVRule(int qtyFeatures, int sourceWord, int targetWord, boolean hasLM) { int[] french = new int[1]; french[0] = sourceWord; int[] english = new int[1]; english[0] = targetWord; float[] feat_scores = new float[qtyFeatures]; // TODO: This is a hack to make the decoding without a LM works /**when a ngram LM is used, the OOV word will have a cost 100. * if no LM is used for decoding, so we should set the cost of some * TM feature to be maximum * */ if ( (!hasLM) && qtyFeatures > 0) { feat_scores[0] = oovFeatureCost; } return new BilingualRule(this.defaultLHS, french, english, feat_scores, 0, this.defaultOwner, 0, getOOVRuleID()); } public int getOOVRuleID() { return OOV_RULE_ID; } public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity) { return new BilingualRule(lhs, sourceWords, targetWords, scores, arity, this.defaultOwner, 0, getOOVRuleID()); } /** * if the span covered by the chart bin is greater than the * limit, then return false */ public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) { if (this.spanLimit == -1) { // mono-glue grammar return (startIndex == 0); } else { return (endIndex - startIndex <= this.spanLimit); } } public Trie getTrieRoot() { return this.root; } protected void addRule(BilingualRule rule) { // TODO: Why two increments? this.qtyRulesRead++; ruleIDCount++; rule.setRuleID(ruleIDCount); rule.setOwner(defaultOwner); // TODO: make sure costs are calculated here or in reader temEstcost += rule.getEstCost(); //=== identify the position, and insert the trie nodes as necessary MemoryBasedTrie pos = root; int[] french = rule.getFrench(); for (int k = 0; k < french.length; k++) { int curSymID = french[k]; /**Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth * like [X,1]), but the symbol in the Trie has to be cleaned, so that the match does * not care about the markup (i.e., [X,1] or [X,2] means the same thing, that is X)*/ if (this.symbolTable.isNonterminal(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); } MemoryBasedTrie nextLayer = pos.matchOne(curSymID); if (null == nextLayer) { nextLayer = new MemoryBasedTrie(); if (pos.hasExtensions() == false) { pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>(); } pos.childrenTbl.put(curSymID, nextLayer); } pos = nextLayer; } //=== add the rule into the trie node if (! pos.hasRules()) { pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench()); this.qtyRuleBins++; } pos.ruleBin.addRule(rule); } // BUG: This always prints 0 for all fields protected void printGrammar() { if (logger.isLoggable(Level.INFO)) { logger.info("###########Grammar###########"); logger.info(String.format( "####num_rules: %d; num_bins: %d; num_pruned: %d; sumest_cost: %.5f", this.qtyRulesRead, this.qtyRuleBins, 0, temEstcost)); } /*if(root!=null) root.print_info(Support.DEBUG);*/ } }