package joshua.discriminative.monolingual_parser;

import java.util.Arrays;
import java.util.logging.Logger;

import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.ff.tm.GrammarReader;
import joshua.decoder.ff.tm.MonolingualRule;
// Retained from the original import list; no longer referenced by code in this class.
import joshua.decoder.ff.tm.hiero.HieroFormatReader;

/**
 * Grammar reader for a three-field, one-rule-per-line text format:
 *
 * <pre>
 * LHS ||| source-side symbols ||| feature scores
 * </pre>
 *
 * Fields are separated by {@code |||} surrounded by whitespace; the symbols and the
 * scores inside a field are separated by whitespace.
 */
public class MonolingualGrammarReader extends GrammarReader<MonolingualRule> {

  /**
   * When {@code true}, every parsed rule gets one extra feature-score slot appended
   * after the scores read from the line. The slot is never written by this class, so
   * it keeps the array default of {@code 0.0f}.
   */
  boolean addFakeFeatScoreForEM = false;

  private static final Logger logger =
      Logger.getLogger(MonolingualGrammarReader.class.getName());

  // Configures format settings that are not declared in this class (presumably static
  // fields inherited from GrammarReader -- confirm against the superclass).
  static {
    fieldDelimiter = "\\s+\\|{3}\\s+";
    nonTerminalRegEx = "^\\[[A-Z]+\\,[0-9]*\\]$";
    // Alternative pattern that was disabled in the original source:
    // nonTerminalCleanRegEx = "[\\[\\]\\,0-9]+";
    nonTerminalCleanRegEx = "[\\,0-9\\s]+";
    description = "Original monolingual format";
  }

  /**
   * Creates a reader over the given grammar file.
   *
   * @param grammarFile grammar file, passed through to the superclass constructor
   * @param vocabulary symbol table, passed through to the superclass constructor
   * @param addFakeFeatScoreForEM_ whether to append one extra, unset feature-score
   *     slot to every parsed rule
   */
  public MonolingualGrammarReader(String grammarFile, SymbolTable vocabulary,
      boolean addFakeFeatScoreForEM_) {
    super(grammarFile, vocabulary);
    this.addFakeFeatScoreForEM = addFakeFeatScoreForEM_;
  }

  /**
   * Parses one grammar line into a rule.
   *
   * <p>A line whose field count is not exactly three is reported at SEVERE level and
   * parsing still proceeds: extra fields are ignored, while a line with fewer than
   * three fields fails with {@link ArrayIndexOutOfBoundsException} as soon as the
   * missing field is accessed.
   *
   * @param line one line of the grammar file
   * @return the rule built from the line's left-hand side, source-side symbols and
   *     feature scores
   * @throws NumberFormatException if a feature score is not a parsable float
   */
  @Override
  protected MonolingualRule parseLine(String line) {
    String[] fields = line.split(fieldDelimiter);
    if (fields.length != 3) {
      logger.severe("Rule line does not have three fields: " + line);
    }

    int lhs = symbolTable.addNonterminal(cleanNonTerminal(fields[0]));

    // Source ("foreign") side: every nonterminal token raises the rule's arity.
    int arity = 0;
    String[] foreignWords = fields[1].split("\\s+");
    int[] french = new int[foreignWords.length];
    for (int i = 0; i < foreignWords.length; i++) {
      String token = foreignWords[i];
      if (isNonTerminal(token)) {
        arity++;
        french[i] = symbolTable.addNonterminal(token);
      } else {
        french[i] = symbolTable.addTerminal(token);
      }
    }

    // Feature scores, plus one trailing unset slot when the fake EM feature is enabled.
    String[] scores = fields[2].split("\\s+");
    int slotCount = addFakeFeatScoreForEM ? scores.length + 1 : scores.length;
    float[] featureScores = new float[slotCount];
    for (int i = 0; i < scores.length; i++) {
      featureScores[i] = Float.parseFloat(scores[i]);
    }

    // NOTE: the original source carries a disabled call at this point,
    // `res.estimateRuleCost(p_l_models)`, annotated "estimate lower-bound, and set
    // statelesscost, this must be called". Neither `res` nor `p_l_models` exists in
    // this method, so it stays disabled here.
    return new MonolingualRule(lhs, french, featureScores, arity);
  }

  /**
   * Renders a rule using numeric symbol ids:
   * {@code LHS ||| [id, id, ...] ||| score score ...}, each score formatted with four
   * decimal places.
   *
   * @param rule the rule to render
   * @return the id-based representation including feature scores
   */
  @Override
  public String toTokenIds(MonolingualRule rule) {
    StringBuilder sb = new StringBuilder();
    sb.append(rule.getLHS());
    sb.append(" ||| ");
    // Arrays.toString prints the ids; appending the array itself would print only its
    // identity string (e.g. "[I@1b6d3586").
    sb.append(Arrays.toString(rule.getFrench()));
    sb.append(" |||");
    appendFeatureScores(sb, rule.getFeatureScores());
    return sb.toString();
  }

  /**
   * Renders a rule using numeric symbol ids, without feature scores:
   * {@code LHS ||| [id, id, ...]}.
   *
   * @param rule the rule to render
   * @return the id-based representation without feature scores
   */
  @Override
  public String toTokenIdsWithoutFeatureScores(MonolingualRule rule) {
    StringBuilder sb = new StringBuilder();
    sb.append(rule.getLHS());
    sb.append(" ||| ");
    sb.append(Arrays.toString(rule.getFrench()));
    return sb.toString();
  }

  /**
   * Renders a rule with its source side converted to words through the symbol table,
   * followed by the feature scores formatted with four decimal places.
   *
   * <p>NOTE(review): the left-hand side is still emitted as its numeric id, and the
   * last delimiter is followed by two spaces before the first score; both are kept
   * exactly as in the original output -- confirm whether that is intended.
   *
   * @param rule the rule to render
   * @return the word-based representation including feature scores
   */
  @Override
  public String toWords(MonolingualRule rule) {
    StringBuilder sb = new StringBuilder();
    sb.append(rule.getLHS());
    sb.append(" ||| ");
    sb.append(symbolTable.getWords(rule.getFrench()));
    sb.append(" ||| ");
    appendFeatureScores(sb, rule.getFeatureScores());
    return sb.toString();
  }

  /**
   * Renders a rule with its source side converted to words through the symbol table,
   * without feature scores. The left-hand side is emitted as its numeric id.
   *
   * @param rule the rule to render
   * @return the word-based representation without feature scores
   */
  @Override
  public String toWordsWithoutFeatureScores(MonolingualRule rule) {
    StringBuilder sb = new StringBuilder();
    sb.append(rule.getLHS());
    sb.append(" ||| ");
    sb.append(symbolTable.getWords(rule.getFrench()));
    return sb.toString();
  }

  /** Appends each score as a space followed by the value with four decimal places. */
  private static void appendFeatureScores(StringBuilder sb, float[] featureScores) {
    for (float score : featureScores) {
      sb.append(String.format(" %.4f", score));
    }
  }
}