package joshua.discriminative.syntax_reorder;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;
import joshua.discriminative.FileUtilityOld;
import joshua.util.FileUtility;
/**
 * Command-line preprocessing step that applies syntactic reordering rules to parse trees.
 *
 * <p>Reads reordering rules from a rule file, then reads one bracketed parse string per line
 * (for example {@code (FRAG (NR w1) (NT w2) (NN w3))}), reorders each tree using the rules,
 * and writes both the reordered tree and its terminal sequence to separate output files.
 *
 * <p>by zhifei li@jhu, zhifei.work@gmail.com
 *
 * <p>To display Chinese: change the locale (as well as format) to Chinese/PRC in the
 * regional settings.
 */
public class ReorderPreprocess {

    /**
     * Entry point.
     *
     * @param args {@code grammar file_parse file_parse_reordered file_flat_reordered}:
     *             the rule file, the input parse file, the output file for reordered trees,
     *             and the output file for the reordered terminals
     */
    public static void main(String[] args) {
        if (args.length != 4) {
            System.out.println("java ReorderPreprocess grammar file_parse file_parse_reordered file_flat_reordered");
            // Without all four paths the stream setup below would fail with an
            // ArrayIndexOutOfBoundsException, so stop here. Extra arguments are
            // tolerated (only the first four are used), as before.
            if (args.length < 4) {
                return;
            }
        }

        BufferedReader f_rules = FileUtilityOld.getReadFileStream(args[0].trim(), "UTF8");
        BufferedReader t_reader_tree = FileUtilityOld.getReadFileStream(args[1].trim(), "UTF8");
        BufferedWriter t_writer_tree = FileUtilityOld.getWriteFileStream(args[2].trim(), "UTF8");
        BufferedWriter t_writer_flat = FileUtilityOld.getWriteFileStream(args[3].trim(), "UTF8");

        ReorderPreprocess handler = new ReorderPreprocess();
        Hashtable rules_tbl;
        int count = 0;
        try {
            // Read the rules into a table keyed by rule name (this also closes f_rules).
            rules_tbl = handler.setup_rules_tbl(f_rules);

            // Read the parse trees one line at a time and transform them.
            String parse_str;
            while ((parse_str = FileUtility.read_line_lzf(t_reader_tree)) != null) {
                Tree t_tree = new Tree(parse_str);
                handler.reorder_tree(t_tree, rules_tbl);
                t_tree.printTree(t_writer_tree);
                t_tree.printTreeTerminals(t_writer_flat);
                count++;
                if (count % 1000 == 0) {
                    System.out.println("Process lines : " + count);
                }
            }
        } finally {
            // Always release the streams, even if a tree fails to parse or reorder.
            close_streams(t_writer_tree, t_writer_flat, t_reader_tree);
        }

        handler.print_reorder_statistics(rules_tbl);
        System.out.println("In total, processed sentences: " + count);
    }

    /** Closes the two output writers and the parse reader, attempting each one even if an earlier close fails. */
    private static void close_streams(BufferedWriter t_writer_tree, BufferedWriter t_writer_flat, BufferedReader t_reader_tree) {
        try {
            FileUtilityOld.closeWriteFile(t_writer_tree);
        } finally {
            try {
                FileUtilityOld.closeWriteFile(t_writer_flat);
            } finally {
                FileUtilityOld.closeReadFile(t_reader_tree);
            }
        }
    }

    /**
     * Applies the reordering rules to every node of the given tree, starting at its root.
     *
     * @param t_tree    the tree to reorder in place
     * @param rules_tbl table mapping a node name to the {@code Vector} of {@code ReorderRule}s
     *                  registered under that name
     */
    public void reorder_tree(Tree t_tree, Hashtable rules_tbl) {
        reorder_node(t_tree.root, rules_tbl);
    }

    /**
     * Reorders the sub-tree rooted at {@code root}, then recurses into its children.
     *
     * @param root      the root of the current sub-tree
     * @param rules_tbl table mapping a node name to its candidate rules
     */
    private void reorder_node(TreeNode root, Hashtable rules_tbl) {
        // Skip the fake root and unnamed (fake) nodes; a node needs at least two
        // children for a reordering to be attempted.
        boolean is_candidate = !root.name.equals("fake_root")
                && !root.name.equals("")
                && root.children.size() > 1;
        if (is_candidate) {
            // A null result means there are no rules whose left-hand side is root.name.
            Vector t_rules = (Vector) rules_tbl.get(root.name);
            if (t_rules != null) {
                for (int i = 0; i < t_rules.size(); i++) {
                    ReorderRule rule = (ReorderRule) t_rules.get(i);
                    // TODO: now, we just reorder the node based on the first match, need to reconsider
                    if (rule.reorder(root)) {
                        break;
                    }
                }
            }
        }

        // Recursively reorder the children (visited in their order after any reordering above).
        for (int i = 0; i < root.children.size(); i++) {
            reorder_node((TreeNode) root.children.get(i), rules_tbl);
        }
    }

    /**
     * Reads the rules into a hashtable: one rule per line, grouped by rule name.
     * Each rule is printed as it is loaded, and the reader is closed once all lines are read.
     *
     * @param f_rules reader over the rule file
     * @return table mapping each rule name to the {@code Vector} of rules with that name,
     *         in file order
     */
    private Hashtable setup_rules_tbl(BufferedReader f_rules) {
        Hashtable res_tbl = new Hashtable();
        String rule_str;
        while ((rule_str = FileUtility.read_line_lzf(f_rules)) != null) {
            ReorderRule t_r = new ReorderRule(rule_str);
            Vector t_rules = (Vector) res_tbl.get(t_r.name);
            if (t_rules == null) { // first rule seen under this name
                t_rules = new Vector();
                res_tbl.put(t_r.name, t_rules);
            }
            t_rules.add(t_r);
            t_r.print_rule();
        }
        FileUtilityOld.closeReadFile(f_rules);
        return res_tbl;
    }

    /**
     * Prints every rule together with the number of times it was applied, followed by
     * the total across all rules.
     *
     * @param rules_tbl table mapping a rule name to its rules
     */
    private void print_reorder_statistics(Hashtable rules_tbl) {
        System.out.println("------ Statistics of the number of times that a rule get applied -----");
        Set keys = rules_tbl.keySet();
        Iterator iter = keys.iterator();
        int c_total = 0;
        while (iter.hasNext()) {
            String tag = (String) iter.next();
            Vector t_rules = (Vector) rules_tbl.get(tag);
            for (int i = 0; i < t_rules.size(); i++) {
                ReorderRule t_rule = (ReorderRule) t_rules.get(i);
                t_rule.print_rule();
                System.out.println("; Times applied: " + t_rule.count_applied);
                c_total += t_rule.count_applied;
            }
        }
        System.out.println("In totoal, number of times applied: " + c_total);
    }
}