/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package joshua.aligner;
import java.util.*;
/**
 * A node of a bracketed parse tree, e.g. {@code (S (NP (DT the) (NN dog)) (VP (VBZ barks)))}.
 *
 * <p>Each node is either a leaf (a word, no branches) or an inner node (a label plus one or
 * more branches). Inner nodes carry a {@link #nodeScore} that encodes a judgment
 * (0.0 = NO, 1.0 = YES, 0.5 = default/NOTSURE). Several methods only consider
 * "distinct" inner nodes: those that either have more than one branch or have a leaf child.
 * A non-preterminal with exactly one branch spans the same words as its child and is skipped.
 *
 * <p>The class is mutable and exposes its state through public fields.
 */
public class ParseTree
{
  /** Label of an inner node, or the word text of a leaf. */
  public String root;
  /** Child subtrees in left-to-right order; {@code null} for a leaf. */
  public Vector<ParseTree> branches;
  /** Parent node; {@code null} for the root of the whole tree (set by the parent's constructor). */
  public ParseTree parent;
  /** Number of children; 0 for a leaf. */
  public int numBranches;
  public boolean isPre; // true only if some child is a leaf
  public boolean isLeaf; // true only if branches is null (equiv. numBranches is 0)
  public int numWords; // i.e. # leaves (all leaves) of this tree
  public int leftIndex; // *inner* index, *not* word index
  public int rightIndex; // *inner* index, *not* word index
  public double nodeScore; // score at the root; 0.0 for NO, 1.0 for YES, 0.5 for default/NOTSURE
  // (if leaf, score is 0.0, if single branch and not preterminal, score is 0.0,
  // if it maps to an empty string in the candidate, score is 0.5)

  /** Creates an empty node with all fields at their Java defaults. */
  public ParseTree() {}

  /**
   * Parses a bracketed tree string into a tree of {@code ParseTree} nodes.
   *
   * <p>A string that does not start with {@code '('} is treated as a leaf. Otherwise the
   * outer parentheses are stripped, the text up to the first space becomes the label, and
   * the remainder is split into children: balanced parenthesized groups become inner nodes,
   * and runs of text outside parentheses become leaves.
   *
   * <p>Malformed input (empty string, a label with no following space, unbalanced
   * parentheses, an inner node without children) is not validated and surfaces as an
   * unchecked index exception.
   *
   * @param str bracketed tree string, or a bare word for a leaf
   * @param seenWords number of words to the left of this subtree; becomes the
   *        {@link #leftIndex} of the leftmost leaf
   */
  public ParseTree(String str, int seenWords)
  {
    if (str.charAt(0) != '(') { // leaf
      root = str;
      branches = null;
      numBranches = 0;
      isPre = false;
      isLeaf = true;
      nodeScore = 0.0;
      numWords = root.split("\\s+").length; // usually 1
      leftIndex = seenWords;
      rightIndex = leftIndex + numWords; // exclusive end
    } else { // inner node
      parent = null;
      // if a parent does exist, the parent will set this after the ParseTree is created
      str = str.substring(1, str.length() - 1); // strip parentheses
      int firstSpace = str.indexOf(' ');
      root = str.substring(0, firstSpace);
      str = str.substring(firstSpace).trim();
      branches = new Vector<ParseTree>();
      numBranches = 0;
      numWords = 0;
      int len = str.length();
      int i = 0;
      while (i < len) {
        ++numBranches;
        int i_init = i;
        int i_fin;
        if (str.charAt(i_init) == '(') {
          // child is a parenthesized group: advance to just past its matching ')'
          int open = 1;
          ++i;
          while (open != 0) {
            char c = str.charAt(i);
            if (c == '(') ++open;
            else if (c == ')') --open;
            ++i;
          }
          i_fin = i;
        } else {
          // child is bare text: it runs up to the next parenthesis or the end of the string
          while (i < len && str.charAt(i) != '(' && str.charAt(i) != ')') ++i;
          if (i != len) --i; // step back over the separator preceding the parenthesis
          i_fin = i;
        }
        ParseTree child = new ParseTree(str.substring(i_init, i_fin).trim(), seenWords);
        branches.add(child);
        child.parent = this;
        seenWords += child.numWords;
        numWords += child.numWords;
        while (i < len && str.charAt(i) == ' ') ++i; // skip separators before the next child
      }
      leftIndex = branches.elementAt(0).leftIndex;
      rightIndex = branches.elementAt(branches.size() - 1).rightIndex;
      isPre = false;
      for (ParseTree b : branches) { isPre = isPre || b.isLeaf; }
      isLeaf = false;
      if (numBranches == 1 && !isPre) { nodeScore = 0; }
      else { nodeScore = 0.5; }
    }
  }

  /** Returns the bracketed form of this tree, e.g. {@code (NP (DT the) (NN dog))}. */
  @Override
  public String toString()
  {
    if (isLeaf) {
      return root;
    }
    StringBuilder sb = new StringBuilder("(").append(root);
    for (ParseTree b : branches) { sb.append(' ').append(b.toString()); }
    return sb.append(')').toString();
  }

  /** Same as {@link #toString()}. */
  public String toTree() { return toString(); }

  /**
   * Returns the bracketed form annotated with indices: inner nodes are printed as
   * {@code label{leftIndex-rightIndex}} and leaves as {@code leftIndex_word_rightIndex}.
   */
  public String toVerboseTree()
  {
    if (isLeaf) {
      return "" + leftIndex + "_" + root + "_" + rightIndex;
    }
    StringBuilder sb = new StringBuilder("(");
    sb.append(root).append('{').append(leftIndex).append('-').append(rightIndex).append('}');
    for (ParseTree b : branches) { sb.append(' ').append(b.toVerboseTree()); }
    return sb.append(')').toString();
  }

  /** Returns the leaves of this tree, left to right, joined by single spaces (then trimmed). */
  public String toSentence()
  {
    if (isLeaf) {
      return root;
    }
    StringBuilder sb = new StringBuilder();
    for (ParseTree b : branches) { sb.append(' ').append(b.toSentence()); }
    return sb.toString().trim();
  }

  /** Returns the number of inner (non-leaf) nodes in this tree, this node included. */
  public int numNodes()
  {
    if (isLeaf) {
      return 0;
    }
    int retVal = 1; // for root
    for (ParseTree b : branches) { retVal += b.numNodes(); }
    return retVal;
  }

  /**
   * Concatenates {@link #frontierRanges_str(int)} for every length in {@code maxLenA},
   * in array order, separated by single spaces.
   *
   * @param maxLenA maximum lengths to query; {@code null} or empty yields {@code ""}
   */
  public String frontierRanges_str(int[] maxLenA)
  {
    StringBuilder sb = new StringBuilder();
    if (maxLenA != null) {
      for (int k = 0; k < maxLenA.length; ++k) {
        appendToken(sb, frontierRanges_str(maxLenA[k]));
      }
    }
    return sb.toString();
  }

  /**
   * Walks the tree breadth-first from this node and collects every node spanning at most
   * {@code maxLen} words whose ancestors (within this tree) all span more than that.
   *
   * <p>A leaf that itself spans more than {@code maxLen} words cannot be split further and
   * is left out of the result.
   *
   * @param maxLen maximum number of words a reported node may span
   * @return space-separated {@code left_right} ranges (both ends inclusive), in
   *         breadth-first order; {@code ""} if no node qualifies
   */
  public String frontierRanges_str(int maxLen)
  {
    List<ParseTree> frontierSet = new ArrayList<ParseTree>();
    List<ParseTree> currNodes = new ArrayList<ParseTree>();
    currNodes.add(this); // initialize at ROOT
    while (!currNodes.isEmpty()) {
      List<ParseTree> newNodes = new ArrayList<ParseTree>();
      for (ParseTree node : currNodes) {
        if (node.numWords <= maxLen) {
          frontierSet.add(node);
        } else if (!node.isLeaf) { // a leaf has no branches to descend into
          newNodes.addAll(node.branches);
        }
      }
      currNodes = newNodes;
    }
    StringBuilder sb = new StringBuilder();
    for (ParseTree node : frontierSet) {
      appendToken(sb, node.leftIndex + "_" + (node.rightIndex - 1));
    }
    return sb.toString();
  }

  /**
   * Counts inner nodes like {@link #numNodes()}, but excludes nodes that are not
   * preterminals yet have exactly one child.
   */
  public int numDistinctRanges()
  {
    if (isLeaf) {
      return 0;
    }
    int retVal = isDistinct() ? 1 : 0;
    for (ParseTree b : branches) { retVal += b.numDistinctRanges(); }
    return retVal;
  }

  /**
   * Counts like {@link #numDistinctRanges()}, but only nodes whose {@link #nodeScore} is
   * exactly 1.0 or 0.0.
   */
  public int numScoredDistinctRanges()
  {
    if (isLeaf) {
      return 0;
    }
    int retVal = 0;
    if ((nodeScore == 0.0 || nodeScore == 1.0) && isDistinct()) { retVal = 1; }
    for (ParseTree b : branches) { retVal += b.numScoredDistinctRanges(); }
    return retVal;
  }

  /** Counts inner nodes that have a leaf child and whose {@link #nodeScore} is still 0.5. */
  public int numUnscoredPreTerminals()
  {
    if (isLeaf) {
      return 0;
    }
    int retVal = 0;
    if (isPre && nodeScore == 0.5) { retVal = 1; }
    for (ParseTree b : branches) { retVal += b.numUnscoredPreTerminals(); }
    return retVal;
  }

  /**
   * Assigns {@link #nodeScore} to every distinct inner node of this tree by looking up a
   * judgment for the candidate substring aligned to the node's span.
   *
   * <p>For each distinct inner node the lookup key is
   * {@code "i j_k ||| candidate substring |||"}, where {@code j_k} is the node's inclusive
   * span and the candidate substring consists of the candidate words linked to that span,
   * with each run of unlinked words in between replaced by a single {@code [GAP]}.
   * {@code "YES"} gives 1.0, {@code "NO"} gives 0.0, and {@code "NOTSURE"} or a missing key
   * gives 0.5. Any other judgment value leaves the score untouched. A node with no linked
   * candidate words gets 0.5.
   *
   * @param i first token of the lookup key (presumably a sentence index; confirm with callers)
   * @param candWords candidate words, indexed by candidate position
   * @param linksSrcCand alignment links, each of the form {@code "srcI-candI"}
   * @param judgments map from lookup key to {@code "YES"}, {@code "NO"} or {@code "NOTSURE"}
   */
  public void setNodeScores(int i, String[] candWords, String[] linksSrcCand, HashMap<String,String> judgments)
  {
    if (isLeaf) {
      return;
    }
    if (isDistinct()) {
      String candSubstring = alignedCandidateSubstring(candWords, linksSrcCand);
      if (!candSubstring.equals("")) {
        String key = "" + i + " " + leftIndex + "_" + (rightIndex - 1) + " ||| " + candSubstring + " |||";
        String judge = judgments.get(key);
        if (judge == null) {
          nodeScore = 0.5;
        } else if (judge.equals("YES")) {
          nodeScore = 1.0;
        } else if (judge.equals("NO")) {
          nodeScore = 0.0;
        } else if (judge.equals("NOTSURE")) {
          nodeScore = 0.5;
        }
      } else {
        // maps to an empty string in the candidate
        nodeScore = 0.5;
      }
    }
    for (ParseTree b : branches) { b.setNodeScores(i, candWords, linksSrcCand, judgments); }
  }

  /**
   * Returns true if this tree (this node included) contains at least one distinct inner
   * node judged NO. Must be false for YES to be percolated down onto this node.
   */
  public boolean has_NO_node_below()
  {
    if (!isLeaf) {
      if (nodeScore == 0.0 && isDistinct()) return true;
      for (ParseTree b : branches) { if (b.has_NO_node_below()) return true; }
    }
    return false; // either a leaf, or no branch has a NO node
  }

  /**
   * Returns true if this node or any of its ancestors has a YES score (1.0).
   * Must be false for a NO at this node to be percolated up.
   */
  public boolean has_YES_node_above()
  {
    for (ParseTree node = this; node != null; node = node.parent) {
      if (node.nodeScore == 1.0) return true;
    }
    return false;
  }

  /**
   * For every distinct inner node judged NO that has no YES node on its path to the root,
   * marks each consecutive ancestor whose score is still 0.5 as NO, stopping at the first
   * ancestor with any other score.
   *
   * @return true if at least one node's score was changed, false otherwise
   */
  public boolean percolateNO_up()
  {
    boolean changed = false;
    for (ParseTree n : extract_NO_nodes()) {
      if (!n.has_YES_node_above()) {
        // percolate NO to all ancestors with nodeScore = 0.5
        ParseTree p = n.parent;
        while (p != null && p.nodeScore == 0.5) {
          p.nodeScore = 0.0;
          changed = true;
          p = p.parent;
        }
      }
    }
    return changed;
  }

  /** Returns all distinct inner nodes of this tree judged NO (0.0), in pre-order. */
  public Vector<ParseTree> extract_NO_nodes()
  {
    Vector<ParseTree> NO_nodes = new Vector<ParseTree>();
    if (!isLeaf) {
      if (nodeScore == 0.0 && isDistinct()) {
        NO_nodes.add(this);
      }
      for (ParseTree b : branches) {
        NO_nodes.addAll(b.extract_NO_nodes());
      }
    }
    return NO_nodes;
  }

  /**
   * Finds inner nodes judged YES and propagates YES to their unjudged (0.5) inner children,
   * provided the child's subtree contains no NO node; recurses through the whole tree.
   *
   * @return true if any node's score was changed to YES, false otherwise
   */
  public boolean percolateYES_down()
  {
    if (isLeaf) {
      return false;
    }
    boolean retVal = false;
    if (nodeScore == 1.0) {
      for (ParseTree b : branches) {
        if (b.isLeaf) {
          continue;
        }
        if (b.nodeScore == 1.0) {
          boolean b_retVal = b.percolateYES_down();
          retVal = retVal || b_retVal;
        } else if (b.nodeScore == 0.5) { // unknown
          if (!b.has_NO_node_below()) {
            b.nodeScore = 1.0;
            retVal = true;
          }
          boolean b_retVal = b.percolateYES_down();
          retVal = retVal || b_retVal;
        } else if (b.nodeScore == 0.0) {
          if (!b.has_NO_node_below()) {
            // a non-distinct inner node (otherwise the test would have returned true):
            // let YES pass through it without changing its own score
            b.nodeScore = 1.0; // temporarily
            boolean b_retVal = b.percolateYES_down();
            retVal = retVal || b_retVal;
            b.nodeScore = 0.0; // reverse
          } else { // distinct node with NO judgment, or a NO node further down
            boolean b_retVal = b.percolateYES_down();
            retVal = retVal || b_retVal;
          }
        }
      }
    } else {
      for (ParseTree b : branches) {
        boolean b_retVal = b.percolateYES_down();
        retVal = retVal || b_retVal;
      }
    }
    return retVal;
  }

  /**
   * Resets every node's score to its initial value: 0.0 for leaves and for non-preterminals
   * with exactly one branch, 0.5 for all other inner nodes.
   */
  public void resetNodeScores()
  {
    if (isLeaf) {
      nodeScore = 0.0;
    } else {
      nodeScore = isDistinct() ? 0.5 : 0.0;
      for (ParseTree b : branches) { b.resetNodeScores(); }
    }
  }

  /** Returns the sum of {@link #nodeScore} over all nodes of this tree, leaves included. */
  public double nodeScoreSum()
  {
    double retVal = nodeScore; // for root
    if (!isLeaf) {
      for (ParseTree b : branches) { retVal += b.nodeScoreSum(); }
    }
    return retVal;
  }

  /**
   * Returns the inclusive {@code left_right} spans of all distinct inner nodes in
   * post-order (children before their parent), separated by single spaces.
   * Returns {@code ""} for a leaf.
   */
  public String distinctRanges_str()
  {
    if (isLeaf) {
      return "";
    }
    StringBuilder sb = new StringBuilder();
    for (ParseTree b : branches) { appendToken(sb, b.distinctRanges_str()); }
    if (isDistinct()) {
      appendToken(sb, leftIndex + "_" + (rightIndex - 1)); // add own range
    }
    return sb.toString();
  }

  /** Returns {@link #distinctRanges_str()} converted to index sets by {@link #strToRanges(String)}. */
  public Vector<TreeSet<Integer>> distinctRanges()
  {
    return strToRanges(distinctRanges_str());
  }

  /** Returns {@link #frontierRanges_str(int)} converted to index sets. */
  public Vector<TreeSet<Integer>> frontierRanges(int maxLen)
  {
    return frontierRanges(new int[] { maxLen });
  }

  /** Returns {@link #frontierRanges_str(int[])} converted to index sets. */
  public Vector<TreeSet<Integer>> frontierRanges(int[] maxLenA)
  {
    return strToRanges(frontierRanges_str(maxLenA));
  }

  /**
   * Converts a whitespace-separated list of {@code left_right} ranges into a Vector of
   * ranges, where each range is a TreeSet of the word indices in the range
   * (e.g. 1_3 becomes {1,2,3}, 1_1 becomes {1}).
   *
   * @param RS range string; {@code null} or blank yields an empty Vector
   * @return one set per range, in input order
   */
  public Vector<TreeSet<Integer>> strToRanges(String RS)
  {
    Vector<TreeSet<Integer>> retRanges = new Vector<TreeSet<Integer>>();
    String trimmed = (RS == null) ? "" : RS.trim();
    if (trimmed.length() == 0) {
      return retRanges;
    }
    for (String sp : trimmed.split("\\s+")) {
      int sep = sp.indexOf('_');
      int spL = Integer.parseInt(sp.substring(0, sep));
      int spR = Integer.parseInt(sp.substring(sep + 1));
      TreeSet<Integer> range = new TreeSet<Integer>();
      for (int j = spL; j <= spR; ++j) range.add(j);
      retRanges.add(range);
    }
    return retRanges;
  }

  /** True for an inner node that has more than one branch or has a leaf child. */
  private boolean isDistinct()
  {
    return !isLeaf && (numBranches != 1 || isPre);
  }

  /**
   * Builds the candidate-side string aligned to this node's span: linked candidate words in
   * candidate order, with each run of unlinked positions in between shown as one [GAP].
   * Returns "" when no link starts inside the span.
   */
  private String alignedCandidateSubstring(String[] candWords, String[] linksSrcCand)
  {
    TreeSet<Integer> candIndices = new TreeSet<Integer>();
    for (String link : linksSrcCand) {
      // for each srcI-candI link, if srcI is inside this node's span, keep candI
      int dash = link.indexOf("-");
      int srcI = Integer.parseInt(link.substring(0, dash));
      if (srcI >= leftIndex && srcI <= rightIndex - 1) {
        candIndices.add(Integer.parseInt(link.substring(dash + 1)));
      }
    }
    if (candIndices.isEmpty()) {
      return "";
    }
    StringBuilder sb = new StringBuilder();
    for (int candI = candIndices.first(); candI <= candIndices.last(); ++candI) {
      sb.append(' ').append(candIndices.contains(candI) ? candWords[candI] : "[Skip]");
    }
    return Skip_to_GAP(sb.toString().trim());
  }

  /** Appends a token to a space-separated list, ignoring empty tokens. */
  private static void appendToken(StringBuilder sb, String token)
  {
    if (token.length() == 0) {
      return;
    }
    if (sb.length() > 0) sb.append(' ');
    sb.append(token);
  }

  /** Collapses each run of space-separated [Skip] markers into a single [GAP]. */
  private static String Skip_to_GAP(String str)
  {
    while (str.indexOf("[Skip] [Skip]") >= 0) {
      str = str.replace("[Skip] [Skip]", "[Skip]");
    }
    return str.replace("[Skip]", "[GAP]");
  }
}