package joshua.discriminative.syntax_reorder; import java.io.BufferedReader; import java.io.BufferedWriter; import java.util.Hashtable; import java.util.Vector; import joshua.discriminative.FileUtilityOld; import joshua.util.FileUtility; /*Zhifei Li, <zhifei.work@gmail.com> * Johns Hopkins University */ public class RuleExtraction { public static void main(String[] args) { if(args.length!=4){ System.out.println("java ReorderPreprocess grammar file_parse file_parse_reordered file_flat_reordered"); //System.exit(0); } //BufferedReader f_rules = FileUtility.getReadFileStream("C:\\data_disk\\java_work_space\\PhraseExtraction\\phraseExtraction\\cn_reordering_rules.txt","UTF8"); //BufferedReader f_rules = FileUtility.getReadFileStream(args[0].trim(),"UTF8"); BufferedReader t_reader_tree = FileUtilityOld.getReadFileStream("C:\\data_disk\\java_work_space\\SyntaxMT\\phraseExtraction\\parse.sync.berkeley1","UTF8"); //BufferedReader t_reader_tree = FileUtility.getReadFileStream(args[1].trim(),"UTF8"); BufferedReader t_reader_align = FileUtilityOld.getReadFileStream("C:\\data_disk\\java_work_space\\SyntaxMT\\phraseExtraction\\aligned.ibm","UTF8"); //BufferedReader t_reader_tree = FileUtility.getReadFileStream(args[1].trim(),"UTF8"); BufferedReader t_reader_en = FileUtilityOld.getReadFileStream("C:\\data_disk\\java_work_space\\SyntaxMT\\phraseExtraction\\aligned.en.tmp1","UTF8"); //BufferedReader t_reader_tree = FileUtility.getReadFileStream(args[1].trim(),"UTF8"); BufferedWriter t_writer_rules = FileUtilityOld.getWriteFileStream("C:\\data_disk\\java_work_space\\SyntaxMT\\phraseExtraction\\extract.rule.tbl","UTF8"); //BufferedWriter t_writer_rules = FileUtility.getWriteFileStream(args[2].trim(),"UTF8"); //BufferedWriter t_writer_flat = FileUtility.getWriteFileStream("C:\\data_disk\\java_work_space\\PhraseExtraction\\phraseExtraction\\manual_align.txt.word.cn.flat.reordered","UTF8"); //BufferedWriter t_writer_flat = FileUtility.getWriteFileStream(args[3].trim(),"UTF8"); //read the rules into a hashtable RuleExtraction handler = new RuleExtraction(); //read the parse trees, and word-alignment String parse_str; String align_str; String en_str; int tgt_len=0; int count=0; while((parse_str=FileUtility.read_line_lzf(t_reader_tree))!=null){ //align_str = new String("0-0 1-2 2-3 3-9 4-10 5-11 5-12 6-13 7-14 8-15 9-9 10-4 10-5 11-4 11-5 12-7 13-6 13-8 14-16"); align_str = FileUtility.read_line_lzf(t_reader_align); en_str = FileUtility.read_line_lzf(t_reader_en); String[] t_ens = en_str.split("\\s+"); tgt_len = t_ens.length; if(tgt_len<=1){System.out.println("empty english string"); continue;} Tree t_tree = new Tree(parse_str); Hashtable align_tbl = setup_align_tbl(align_str); System.out.println("##### spans are"); t_tree.derive_span(align_tbl); System.out.println("##### complement spans are"); t_tree.derive_complement_spans(tgt_len); t_tree.tag_frontier_node(); Hashtable out_gr=new Hashtable(); count++; System.out.println("#####Rules from " + count +" sentence: " +parse_str); t_tree.extract_rule(out_gr,tgt_len,t_writer_rules); //if(count>2)break; } FileUtilityOld.closeReadFile(t_reader_tree); FileUtilityOld.closeReadFile(t_reader_align); FileUtilityOld.closeReadFile(t_reader_en); FileUtilityOld.closeWriteFile(t_writer_rules); System.out.println("In total, processed sentences: "+count); } public static Hashtable setup_align_tbl(String align_str){ Hashtable res_tbl = new Hashtable(); String[] links = align_str.split("\\s+"); for(int i=0; i < links.length; i++){ String[] ids = links[i].split("-"); Vector t_aligns =null; if(res_tbl.containsKey(new Integer(ids[0]))){//already have entry t_aligns = (Vector) res_tbl.get(new Integer(ids[0])); }else{ t_aligns = new Vector(); res_tbl.put(new Integer(ids[0]), t_aligns); //System.out.println(ids[0]); } t_aligns.add(new Integer(ids[1])); } //debug for(int i=0; i<15; i++){ if( res_tbl.containsKey(new Integer(i))) { //System.out.println(i + ": " + res_tbl.get(new Integer(i)).toString()); } } return res_tbl; } /*public static Hashtable setup_align_tbl(String align_str){ Hashtable res_tbl = new Hashtable(); String[] links = align_str.split("\\s+"); for(int i=0; i < links.length; i++){ String[] ids = links[i].split("-"); Vector t_aligns =null; if(res_tbl.containsKey(ids[0])){//already have entry t_aligns = (Vector) res_tbl.get(ids[0]); }else{ t_aligns = new Vector(); res_tbl.put(ids[0], t_aligns); //System.out.println(ids[0]); } t_aligns.add(ids[1]); } //debug for(int i=0; i<9 ; i++){ if(res_tbl.containsKey(Integer.toString(i))){ System.out.println(i + ": " + res_tbl.get(Integer.toString(i)).toString()); } } return res_tbl; } */ }