/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package joshua.aligner;

import java.util.*;

/**
 * A tree parsed from a parenthesised derivation string of the form
 * <code>(LABEL{l-r} child child ...)</code>, where each child is either a
 * nested parenthesised node or a run of plain text (a leaf holding one or
 * more whitespace-separated target words).
 *
 * <p>All spans are half-open: a node covers source positions
 * <code>leftSrcIndex .. rightSrcIndex-1</code> and target positions
 * <code>leftTgtIndex .. rightTgtIndex-1</code>.
 *
 * <p>Instances created with the no-argument constructor have no fields
 * populated; the query methods assume a tree built by
 * {@link #JoshuaDerivationTree(String, int)}.
 */
public class JoshuaDerivationTree {

  /** Node label for inner nodes (text before '{'); the raw text for leaves. */
  public String root;

  /** Children in left-to-right order; null for leaves. */
  public Vector<JoshuaDerivationTree> branches;

  /** Number of children; 0 for leaves. */
  public int numBranches;

  /** True only if some child is a leaf. */
  public boolean isPre;

  /** True only if branches is null (equivalently, numBranches is 0). */
  public boolean isLeaf;

  /** Number of target words in all leaves of this tree. */
  public int numTgtWords;

  /** Inclusive start of the source span; -1 (unknown) for leaves. */
  public int leftSrcIndex;

  /** Exclusive end of the source span; -1 (unknown) for leaves. */
  public int rightSrcIndex;

  /** Inclusive start of the target span. */
  public int leftTgtIndex;

  /** Exclusive end of the target span. */
  public int rightTgtIndex;

  /** Creates an empty, unpopulated node. */
  public JoshuaDerivationTree() {}

  /**
   * Parses a derivation (sub)string into a tree.
   *
   * @param str either plain text (a leaf) or a parenthesised node
   *        <code>(LABEL{l-r} children...)</code>; must be non-empty
   * @param seenWords number of target words that precede this subtree; used
   *        as the subtree's {@link #leftTgtIndex}
   * @throws IndexOutOfBoundsException if <code>str</code> is empty or is not
   *         well formed (missing span, unbalanced parentheses, no children)
   * @throws NumberFormatException if the span bounds are not integers
   */
  public JoshuaDerivationTree(String str, int seenWords) {
    if (str.charAt(0) != '(') {
      // Leaf: the whole string is target text.
      root = str;
      branches = null;
      numBranches = 0;
      isPre = false;
      isLeaf = true;
      leftSrcIndex = -1; // unknown
      rightSrcIndex = -1; // unknown
      numTgtWords = root.split("\\s+").length;
      leftTgtIndex = seenWords;
      rightTgtIndex = leftTgtIndex + numTgtWords;
      return;
    }

    // Inner node: strip the enclosing parentheses, then split "LABEL{l-r}"
    // from the children that follow the first space.
    str = str.substring(1, str.length() - 1);
    int firstSpace = str.indexOf(' ');
    String rootInfo = str.substring(0, firstSpace);
    int brace = rootInfo.indexOf('{');
    int dash = rootInfo.indexOf('-');
    root = rootInfo.substring(0, brace);
    leftSrcIndex = Integer.parseInt(rootInfo.substring(brace + 1, dash));
    rightSrcIndex = Integer.parseInt(rootInfo.substring(dash + 1, rootInfo.indexOf('}')));
    str = str.substring(firstSpace).trim();

    // Sanity check: re-rendering the parsed header should reproduce it.
    if (!rootInfo.equals(root + "{" + leftSrcIndex + "-" + rightSrcIndex + "}")) {
      println("ROOT MISMATCH!!!");
    }

    branches = new Vector<JoshuaDerivationTree>();
    numBranches = 0;
    numTgtWords = 0;

    int len = str.length();
    int i = 0;
    while (i < len) {
      ++numBranches;
      int start = i;
      if (str.charAt(start) == '(') {
        // Nested node: advance just past the matching close parenthesis.
        int open = 1;
        ++i;
        while (open != 0) {
          char c = str.charAt(i);
          if (c == '(') {
            ++open;
          } else if (c == ')') {
            --open;
          }
          ++i;
        }
      } else {
        // Leaf text: runs up to the next parenthesis or the end of input.
        while (i < len && str.charAt(i) != '(' && str.charAt(i) != ')') {
          ++i;
        }
        if (i != len) {
          --i; // step back onto the separator preceding the parenthesis
        }
      }

      JoshuaDerivationTree child =
          new JoshuaDerivationTree(str.substring(start, i).trim(), seenWords);
      branches.add(child);
      seenWords += child.numTgtWords;
      numTgtWords += child.numTgtWords;

      // Skip the blanks separating this child from the next one.
      while (i < len && str.charAt(i) == ' ') {
        ++i;
      }
    }

    leftTgtIndex = branches.elementAt(0).leftTgtIndex;
    rightTgtIndex = branches.elementAt(branches.size() - 1).rightTgtIndex;

    isPre = false;
    for (JoshuaDerivationTree b : branches) {
      isPre = isPre || b.isLeaf;
    }
    isLeaf = false;
  }

  /**
   * Renders the tree back into its parenthesised form; a leaf renders as its
   * raw text.
   */
  public String toString() {
    if (isLeaf) {
      return root;
    }
    StringBuilder sb = new StringBuilder();
    sb.append("(").append(root).append("{").append(leftSrcIndex)
        .append("-").append(rightSrcIndex).append("}");
    for (JoshuaDerivationTree b : branches) {
      sb.append(" ").append(b.toString());
    }
    sb.append(")");
    return sb.toString();
  }

  /** Same as {@link #toString()}. */
  public String toTree() {
    return toString();
  }

  /** Returns the text of all leaves, left to right, separated by single spaces. */
  public String toSentence() {
    if (isLeaf) {
      return root;
    }
    StringBuilder sb = new StringBuilder();
    for (JoshuaDerivationTree b : branches) {
      sb.append(" ").append(b.toSentence());
    }
    return sb.toString().trim();
  }

  /**
   * Returns the alignment groups implied by this tree as space-separated
   * entries of the form <code>s1,s2,...--t1,t2,...</code>.
   *
   * <ul>
   * <li>A leaf yields the empty string.</li>
   * <li>A node with no leaf children yields its children's alignments.</li>
   * <li>A node whose only child is a leaf aligns its source span with its
   *     target span.</li>
   * <li>Otherwise the non-leaf children's alignments come first, followed by
   *     one entry pairing the source and target positions not covered by any
   *     non-leaf child. That entry is omitted when either side has no
   *     uncovered position, since there is then nothing to pair.</li>
   * </ul>
   */
  public String alignments() {
    if (isLeaf) {
      return "";
    }

    if (!isPre) {
      StringBuilder sb = new StringBuilder();
      for (JoshuaDerivationTree b : branches) {
        appendEntry(sb, b.alignments());
      }
      return sb.toString();
    }

    if (numBranches == 1) {
      return joinRange(leftSrcIndex, rightSrcIndex) + "--"
          + joinRange(leftTgtIndex, rightTgtIndex);
    }

    // First, the alignments contributed by non-leaf children.
    StringBuilder sb = new StringBuilder();
    for (JoshuaDerivationTree b : branches) {
      if (!b.isLeaf) {
        appendEntry(sb, b.alignments());
      }
    }

    // Then whatever this node covers that its non-leaf children do not.
    TreeSet<Integer> availableSrcIndices = new TreeSet<Integer>();
    TreeSet<Integer> availableTgtIndices = new TreeSet<Integer>();
    for (int i = leftSrcIndex; i < rightSrcIndex; ++i) {
      availableSrcIndices.add(i);
    }
    for (int i = leftTgtIndex; i < rightTgtIndex; ++i) {
      availableTgtIndices.add(i);
    }
    for (JoshuaDerivationTree b : branches) {
      if (!b.isLeaf) {
        for (int i = b.leftSrcIndex; i < b.rightSrcIndex; ++i) {
          availableSrcIndices.remove(i);
        }
        for (int i = b.leftTgtIndex; i < b.rightTgtIndex; ++i) {
          availableTgtIndices.remove(i);
        }
      }
    }

    // Guard against an empty side: previously this hit substring(1) on an
    // empty string and threw StringIndexOutOfBoundsException.
    if (!availableSrcIndices.isEmpty() && !availableTgtIndices.isEmpty()) {
      appendEntry(sb, joinIndices(availableSrcIndices) + "--" + joinIndices(availableTgtIndices));
    }
    return sb.toString();
  }

  /** Appends a non-empty entry to sb, preceded by a space if sb already has content. */
  private static void appendEntry(StringBuilder sb, String entry) {
    if (entry.length() == 0) {
      return;
    }
    if (sb.length() > 0) {
      sb.append(' ');
    }
    sb.append(entry);
  }

  /** Comma-joins first, first+1, ..., end-1; first is always emitted, even if end <= first. */
  private static String joinRange(int first, int end) {
    StringBuilder sb = new StringBuilder();
    sb.append(first);
    for (int i = first + 1; i < end; ++i) {
      sb.append(',').append(i);
    }
    return sb.toString();
  }

  /** Comma-joins the given indices in iteration order. */
  private static String joinIndices(Collection<Integer> indices) {
    StringBuilder sb = new StringBuilder();
    for (Integer index : indices) {
      if (sb.length() > 0) {
        sb.append(',');
      }
      sb.append(index);
    }
    return sb.toString();
  }

  static private void println(Object obj) {
    System.out.println(obj);
  }

  @SuppressWarnings("unused")
  static private void print(Object obj) {
    System.out.print(obj);
  }
}