package joshua.discriminative.monolingual_parser;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import joshua.util.FileUtility;

/**
 * Command-line tool that copies a bilingual grammar file and appends one extra
 * score to the end of every rule line. The extra score is looked up in a
 * monolingual grammar file, using the rule text (with nonterminal markup
 * removed) as the lookup key.
 *
 * <p>Fields on a rule line are separated by {@code |||} surrounded by
 * whitespace:
 * <ul>
 *   <li>monolingual grammar: three fields; the second field is the lookup key
 *       and the first whitespace-separated token of the third field is the
 *       cost;</li>
 *   <li>bilingual grammar: four fields; the third field is matched against the
 *       monolingual lookup keys.</li>
 * </ul>
 */
public class StructureLMFeatureAugmenter {

    /** Separator between the fields of a rule line: {@code |||} with whitespace on both sides. */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile("\\s+\\|{3}\\s+");

    /** Separator between the individual scores inside the score field. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Matches runs of '[', ']', ',' and decimal digits. Note that this matches
     * those characters anywhere in the field, not only inside nonterminal
     * symbols.
     */
    private static final Pattern NONTERMINAL_MARKUP = Pattern.compile("[\\[\\]\\,0-9]+");

    /**
     * Entry point.
     *
     * @param args {@code args[0]} bilingual grammar file to read,
     *             {@code args[1]} monolingual grammar file to read,
     *             {@code args[2]} augmented bilingual grammar file to write
     * @throws IOException if reading or writing any of the files fails
     * @throws NumberFormatException if the first score of a monolingual rule is
     *         not a parsable number
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            System.out.println("Wrong number of parameters, it must have at least three parameters: "
                    + "java StructureLMFeatureAugmenter bilingual_grammar monolingual_grammar new_bilingual_grammar");
            System.exit(1);
        }
        String bilingualGrammarFile = args[0];
        String monolingualGrammarFile = args[1];
        String newbilingualGrammarFile = args[2];

        // All three streams are opened up front (same order as before); the
        // nested try/finally blocks guarantee each one is closed even when a
        // later step throws.
        BufferedReader bilingualReader = FileUtility.getReadFileStream(bilingualGrammarFile);
        try {
            BufferedReader monoReader = FileUtility.getReadFileStream(monolingualGrammarFile);
            try {
                BufferedWriter augmentedWriter = FileUtility.getWriteFileStream(newbilingualGrammarFile);
                try {
                    Map<String, Double> costByRule = readMonolingualCosts(monoReader);
                    writeAugmentedGrammar(bilingualReader, costByRule, augmentedWriter);
                } finally {
                    augmentedWriter.close();
                }
            } finally {
                monoReader.close();
            }
        } finally {
            bilingualReader.close();
        }
    }

    /** Reads the monolingual grammar into a map from stripped rule text to the rule's first score. */
    private static Map<String, Double> readMonolingualCosts(BufferedReader monoReader) throws IOException {
        Map<String, Double> costByRule = new HashMap<String, Double>();
        String line;
        while ((line = FileUtility.read_line_lzf(monoReader)) != null) {
            String[] fds = FIELD_SEPARATOR.split(line); // [x] ||| rule ||| feature-scores
            if (fds.length != 3) {
                System.out.println("rule line does not have three fds; " + line);
                if (fds.length < 3) {
                    // Too few fields to read a key and a score; skip the line
                    // instead of failing on an out-of-range index.
                    continue;
                }
            }
            String[] scores = WHITESPACE.split(fds[2]);
            double ruleCost = Double.parseDouble(scores[0]);
            // Later lines with the same stripped rule text overwrite earlier ones.
            costByRule.put(stripNonterminalMarkup(fds[1]), ruleCost); // TODO: what about LHS
        }
        return costByRule;
    }

    /** Copies every bilingual rule line to {@code out}, followed by a space, the looked-up cost and a newline. */
    private static void writeAugmentedGrammar(BufferedReader bilingualReader,
            Map<String, Double> costByRule, BufferedWriter out) throws IOException {
        String line;
        while ((line = FileUtility.read_line_lzf(bilingualReader)) != null) {
            String[] fds = FIELD_SEPARATOR.split(line); // [x] ||| cn ||| en ||| feature-scores
            if (fds.length != 4) {
                System.out.println("rule line does not have four fds; " + line);
                if (fds.length < 3) {
                    // No third field to look up; skip the line instead of
                    // failing on an out-of-range index.
                    continue;
                }
            }
            Double slmCost = costByRule.get(stripNonterminalMarkup(fds[2])); // TODO: what about LHS
            if (slmCost == null) {
                System.out.println("no slm cost for rule, must be wrong");
                System.out.println(line);
                // NOTE(review): processing continues on purpose (the original
                // had its exit call disabled), so the rule below is written
                // with the literal text "null" as its extra score. Confirm
                // that downstream consumers can cope with that.
            }
            out.write(line + " " + slmCost + "\n");
        }
    }

    /** Removes every '[', ']', ',' and digit from {@code ruleSide}. */
    private static String stripNonterminalMarkup(String ruleSide) {
        return NONTERMINAL_MARKUP.matcher(ruleSide).replaceAll("");
    }
}