/** * */ package joshua.discriminative.syntax_reorder; import java.io.BufferedWriter; import java.util.ArrayList; import java.util.Hashtable; import java.util.List; import java.util.Map; import java.util.Vector; import joshua.discriminative.FileUtilityOld; public class TreeNode { public String name=""; public String nameAfterReorder=""; public List<TreeNode> children = new ArrayList<TreeNode>(); public TreeNode parent=null; //public int pos=-1;//what is my position in my parent public String terminalSymbol=""; //with alignment information int terminalID=-1; //if this is a pre-terminal, then remember the terminal id (start from 0) Vector span = null;//according to ISI's syntaxMT, this only consider the first/last word index in the english string, e.g., 1-3 or 10-10 Vector complementSpans = null;//according to ISI's syntaxMT, it can be : 1-3,6-8,10-10 boolean isFrontier=false; public TreeNode(){ } public TreeNode(String n){ this.name = n; this.nameAfterReorder = name; } public void setAsPreTerminal(String symbol){ this.terminalSymbol = symbol; } public void replaceContentWith(TreeNode to){ this.name=to.name; this.nameAfterReorder=to.nameAfterReorder; this.children=to.children; this.parent=to.parent; this.terminalSymbol=to.terminalSymbol; } //check whether the "from" is subsumed by myself, from may be taged by X public boolean isSubsume(TreeNode from, Map<String, TreeNode> tag_tbl){ //tag_tbl will remember how the node is taged according rule in "from" boolean res=true; String from_str = new String(from.name); String this_str = new String(this.name); if(from_str.matches("x\\d+\\:.+")){ tag_tbl.put(from_str.substring(0, 2),this);//TODO: assume the d is between [0,9] //System.out.println("from_str: " + from_str + " Size: " +tag_tbl.size()); } from_str = from_str.replaceAll("x\\d+\\:", "");//e.g., skip "x0:" in x0:NP //if(from_str.compareTo(this_str)!=0){//first, see whether the name match int res_match = patternMatch(from_str, this_str); if(res_match==0){//first, see whether the name match res=false; //tag_tbl.clear(); }else if(res_match==1){//name match pass, need recursively "AND" chilren match //System.out.println("AND children match"); if(from.children.size()>0){//if from has no children, then return true if(from.children.size()==this.children.size()){ //System.out.println("chilren size:" + this.l_children.size()); for(int i=0; i<from.children.size();i++){ if(( this.children.get(i)).isSubsume(from.children.get(i), tag_tbl)==false){ res=false; //tag_tbl.clear(); break; } } }else{ res=false; //tag_tbl.clear(); } } }else if(res_match==2){//name match pass, need recursively "OR" chilren match //System.out.println("OR children match"); boolean t_res=false; if(from.children.size()==1){//In "OR" condition, must have one and only one child for(int i=0; i<this.children.size();i++){ /*note that we should not clear the tag_tbl if one of the chilren fails, that's why we delete tag_tbl.clear()*/ if(( this.children.get(i)).isSubsume( from.children.get(0),tag_tbl)==true){//any sucess t_res=true; break; } } } if(t_res==false){ res=false; //tag_tbl.clear(); } }else{ //this should not happen } //System.out.println("res: " + res + " size: " + tag_tbl.size()); return res; } //########################## with alignment information public boolean setFrontierFlag(){ if(span==null){//unaligned source node isFrontier = false; return false; }else if(complementSpans==null || complementSpans.size()<=0){//span over all the english words isFrontier=true; return true; }else{ int span_start=((Integer)span.get(0)).intValue(); int span_end=((Integer)span.get(1)).intValue(); for(int i=0; i< complementSpans.size(); i++){ Vector t_comp = (Vector) complementSpans.get(i); if( ( span_start >= ((Integer)t_comp.get(0)).intValue() && span_start <= ((Integer)t_comp.get(1)).intValue()) || ( span_end >= ((Integer)t_comp.get(0)).intValue() && span_end <= ((Integer)t_comp.get(1)).intValue()) || ( span_start <= ((Integer)t_comp.get(0)).intValue() && span_end >= ((Integer)t_comp.get(1)).intValue())//subsume ){ isFrontier = false; return false; } } isFrontier=true; return true; } } public void deriveRule(Hashtable rule_tbl, int len_tgt, BufferedWriter out){ //NP ||| (x0:DNP (LCP fake) (DEG fake)) (x1:NP fake) ||| x1 x0 ||| 0 0 0 0 0 if(isFrontier==true && children.size()>1){//only extract rule for frontier node with more than one children int[] x_id = new int[1]; x_id[0]=0; String[] v_rhs_frags= new String[len_tgt];//the index is the start pos in the target, value is the rhs symbol String[] v_rhs_unaligned= new String[len_tgt+1];//the index remember how many spans should put before it String[] str_lhs = new String[1]; str_lhs[0]=""; //get the lhs symbols ctrlDeriveSubrule( x_id, str_lhs, v_rhs_frags, v_rhs_unaligned); //now begin to work on the rhs symbols System.out.print(str_lhs[0]+" => "); String str_rhs=""; int num_comsumed_span=0; for(int start_pos = 0; start_pos< v_rhs_frags.length; start_pos++){ if(v_rhs_frags[start_pos]!=null){ //before we print aligned symbol, look at unaligned one for(int n_left = 0; n_left<v_rhs_unaligned.length && n_left<=num_comsumed_span; n_left++){ if(v_rhs_unaligned[n_left]!=null){ //System.out.print(v_rhs_unaligned[n_left] +" "); str_rhs += v_rhs_unaligned[n_left] +" "; v_rhs_unaligned[n_left]=null; } } //print aligned symbols //System.out.print(v_rhs_frags[start_pos] +" "); str_rhs += v_rhs_frags[start_pos] +" "; num_comsumed_span++; } } //print all the remaining unalinged words for(int n_left = 0; n_left<v_rhs_unaligned.length; n_left++){ if(v_rhs_unaligned[n_left]!=null){ //System.out.print(v_rhs_unaligned[n_left] +" "); str_rhs += v_rhs_unaligned[n_left] +" "; v_rhs_unaligned[n_left]=null; } } if(out==null) System.out.print(str_lhs[0].trim()+" => " + str_rhs.trim() +"\n"); else{ FileUtilityOld.writeLzf(out,str_lhs[0].trim()+" ||| " + str_rhs.trim() +" ||| 1\n"); } } } /* return value * 0: "AND"/"OR" name-matches-stage fail * 1: "AND" name-matches-stage susscessful, need to do full-chilren match * 2: "OR" name-matches-stage susscessful, means that for children match, we should return true as long as one match the pattern, do not consider the number of chilren * */ private int patternMatch(String from, String to){ if(from.matches("\\|.+")){ String from2 = from.replaceFirst("\\|", ""); if(to.compareTo(from2)==0) return 2; }else{ if(from.compareTo("*")==0 || to.compareTo("*")==0)//anything will match return 1; else if(from.matches("\\!.+")){ String from2 = from.replaceFirst("\\!", ""); if(to.compareTo(from2)!=0) return 1; }else if(to.matches("\\!.+")){ String to2 = to.replaceFirst("\\!", ""); if(from.compareTo(to2)!=0) return 1; }else if(from.compareTo(to)==0){ return 1; } } return 0; } private void ctrlDeriveSubrule(int[] x_id, String[] str_lhs, String[] v_rhs_frags, String[] v_rhs_unaligned){ //System.out.print("(" + name + " "); str_lhs[0] += "(" + name + " "; for(int i=0; i<children.size(); i++){ TreeNode t_child = (TreeNode) children.get(i); t_child.deriveSubrule(x_id,str_lhs,v_rhs_frags, v_rhs_unaligned); if(i<children.size()-1){ //System.out.print(" "); str_lhs[0] +=" "; } } //System.out.print(")"); str_lhs[0] +=")"; } private void addRhs(String[] v_rhs_unaligned, int pos, String sym){ if(v_rhs_unaligned[pos]==null) v_rhs_unaligned[pos] = sym; else v_rhs_unaligned[pos] +=" "+ sym; } private void deriveSubrule(int[] x_id,String[] str_lhs, String[] v_rhs_frags, String[] v_rhs_unaligned){ if(this.isFrontier==true){//note: pre-terminal can be frontier node //System.out.print("(x"+x_id[0]+":"+name + " f)"); str_lhs[0]+="(x"+x_id[0]+":"+name + " f)"; addRhs(v_rhs_frags, (Integer)span.get(0), "x"+x_id[0]); x_id[0]++; }else if(terminalSymbol != ""){//pre-terminal, chilren are special if(span==null){ //System.out.print("(" + name + " ("+terminal_symbol+" n))"); str_lhs[0]+="(" + name + " ("+terminalSymbol+" n))"; addRhs(v_rhs_unaligned, v_rhs_frags.length, terminalSymbol);//remember how many continuos span before me }else{//non-frontier pre-terminal /*TODO: now, we sort the words according to their start span, this may not be good for non-coninous translation * for example, a prominent rile <=> ����(prominent) ����(a role), may need to a rule "(NP (ADJP (JJ (���� f))) (NP (NN (���� f)))) => ���� ����" * problem is due to 1-to-m non-continuous translation*/ //System.out.print("(" + name + " ("+terminal_symbol+" f))"); str_lhs[0]+="(" + name + " ("+terminalSymbol+" f))"; addRhs(v_rhs_frags, (Integer)span.get(0), terminalSymbol); } }else{//call my children ctrlDeriveSubrule( x_id, str_lhs, v_rhs_frags,v_rhs_unaligned); } } }