package joshua.decoder.ff.tm.hiero; import java.util.Arrays; import java.util.logging.Logger; import joshua.corpus.vocab.SymbolTable; import joshua.decoder.JoshuaConfiguration; import joshua.decoder.ff.tm.BilingualRule; import joshua.decoder.ff.tm.GrammarReader; public class SamtFormatReader extends GrammarReader<BilingualRule> { private static final Logger logger = Logger .getLogger(SamtFormatReader.class.getName()); private static final String samtNonTerminalMarkup; private int[] nonTerminalCache; static { fieldDelimiter = "#"; nonTerminalRegEx = "^@[^\\s]+"; nonTerminalCleanRegEx = "[\\,0-9\\s]+"; samtNonTerminalMarkup = "@"; description = "Original SAMT format"; } public SamtFormatReader(String grammarFile, SymbolTable vocabulary) { super(grammarFile, vocabulary); // TODO: should be limited to maxNTs + 1 if defined in config. // position 0 will never be used nonTerminalCache = new int[30]; } // Format example: // @VZ-HD @APPR-DA+ART-DA minutes#@2 protokoll @1#@PP-MO+VZ-HD#0 1 1 -0 0.5 -0 @Override protected BilingualRule parseLine(String line) { String[] fields = line.split(fieldDelimiter); if (fields.length != 4) { logger.severe("Rule line does not have four fields: " + line); logger.severe("Skipped."); return null; } int lhs = symbolTable.addNonterminal(adaptNonTerminalMarkup(fields[2])); int arity = 0; // foreign side String[] foreignWords = fields[0].split("\\s+"); int[] french = new int[foreignWords.length]; for (int i = 0; i < foreignWords.length; i++) { if (isNonTerminal(foreignWords[i])) { arity++; french[i] = symbolTable.addNonterminal(adaptNonTerminalMarkup(foreignWords[i], arity)); nonTerminalCache[arity] = french[i]; } else { french[i] = symbolTable.addTerminal(foreignWords[i]); } } // HACK: avoid source-side loop rules // TODO: global lookup for goal symbol id would really help here if ((french.length == 1) && (arity == 1) && !adaptNonTerminalMarkup(fields[2]).equals(JoshuaConfiguration.goal_symbol)) { return null; } // english side String[] englishWords = fields[1].split("\\s+"); int[] english = new int[englishWords.length]; for (int i = 0; i < englishWords.length; i++) { if (isNonTerminal(englishWords[i])) { english[i] = nonTerminalCache[Integer. parseInt(cleanSamtNonTerminal(englishWords[i]))]; } else { english[i] = symbolTable.addTerminal(englishWords[i]); } } // feature scores String[] scores = fields[3].split("\\s+"); float[] feature_scores = new float[scores.length]; int i = 0; for (String score : scores) { feature_scores[i++] = Float.parseFloat(score); } return new BilingualRule(lhs, french, english, feature_scores, arity); } protected String cleanSamtNonTerminal(String word) { // changes SAMT markup to Hiero-style return word.replaceAll(samtNonTerminalMarkup, ""); } protected String adaptNonTerminalMarkup(String word) { // changes SAMT markup to Hiero-style return "[" + word.replaceAll(",", "_COMMA_") .replaceAll("\\$", "_DOLLAR_") .replaceAll(samtNonTerminalMarkup, "") + "]"; } protected String adaptNonTerminalMarkup(String word, int ntIndex) { // changes SAMT markup to Hiero-style return "[" + word.replaceAll(",", "_COMMA_") .replaceAll("\\$", "_DOLLAR_") .replaceAll(samtNonTerminalMarkup, "") + "," + ntIndex + "]"; } @Override public String toTokenIds(BilingualRule rule) { StringBuffer sb = new StringBuffer(); sb.append(rule.getLHS()); sb.append(" ||| "); sb.append(Arrays.toString(rule.getFrench())); sb.append(" ||| "); sb.append(Arrays.toString(rule.getEnglish())); sb.append(" |||"); float[] feature_scores = rule.getFeatureScores(); for (int i = 0; i < feature_scores.length; i++) { sb.append(String.format(" %.4f", feature_scores[i])); } return sb.toString(); } @Override public String toTokenIdsWithoutFeatureScores(BilingualRule rule) { StringBuffer sb = new StringBuffer(); sb.append(rule.getLHS()); sb.append(" ||| "); sb.append(Arrays.toString(rule.getFrench())); sb.append(" ||| "); sb.append(Arrays.toString(rule.getEnglish())); return sb.toString(); } @Override public String toWords(BilingualRule rule) { StringBuffer sb = new StringBuffer(); sb.append(symbolTable.getWord(rule.getLHS())); sb.append(" ||| "); sb.append(symbolTable.getWords(rule.getFrench())); sb.append(" ||| "); sb.append(symbolTable.getWords(rule.getEnglish())); sb.append(" |||"); float[] feature_scores = rule.getFeatureScores(); for (int i = 0; i < feature_scores.length; i++) { sb.append(String.format(" %.4f", feature_scores[i])); } return sb.toString(); } @Override public String toWordsWithoutFeatureScores(BilingualRule rule) { StringBuffer sb = new StringBuffer(); sb.append(symbolTable.getWord(rule.getLHS())); sb.append(" ||| "); sb.append(symbolTable.getWords(rule.getFrench())); sb.append(" ||| "); sb.append(symbolTable.getWords(rule.getEnglish())); sb.append(" |||"); return sb.toString(); } }