package fr.orsay.lri.varna.models.treealign; import java.util.*; /** * Tree alignment algorithm. * This class implements the tree alignment algorithm * for ordered trees explained in article: * T. Jiang, L. Wang, K. Zhang, * Alignment of trees - an alternative to tree edit, * Theoret. Comput. Sci. 143 (1995). * Other references: * - Claire Herrbach, Alain Denise and Serge Dulucq. * Average complexity of the Jiang-Wang-Zhang pairwise tree alignment * algorithm and of a RNA secondary structure alignment algorithm. * Theoretical Computer Science 411 (2010) 2423-2432. * * Our implementation supposes that the trees will never have more * than 32000 nodes and that the total distance will never require more * significant digits that a float (single precision) has. * * @author Raphael Champeimont * @param <ValueType1> The type of values on nodes in the first tree. * @param <ValueType2> The type of values on nodes in the second tree. */ public class TreeAlign<ValueType1, ValueType2> { private class TreeData<ValueType> { /** * The tree. */ public Tree<ValueType> tree; /** * The tree size (number of nodes). */ public int size = -1; /** * The number of children of a node is called the node degree. * This variable is the maximum node degree in the tree. */ public int degree = -1; /** * The number of children of a node is called the node degree. * degree[i] is the degree of node i, with i being an index in nodes. */ public int[] degrees; /** * The trees as an array of its nodes (subtrees rooted at each node * in fact), in postorder. */ public Tree<ValueType>[] nodes; /** * children[i] is the array of children (as indexes in nodes) * of i (an index in nodes) */ public int[][] children; /** * Values of nodes. */ public ValueType[] values; } /** * The distance function between labels. */ private TreeAlignLabelDistanceAsymmetric<ValueType1,ValueType2> labelDist; /** * Create a TreeAlignSymmetric object, which can align trees. * The distance function will be called only once on every pair * of nodes. The result is then kept in a matrix, so you need not manage * yourself a cache of f(value1, value2). * Note that it is permitted to have null values on nodes, * so comparing a node with a non-null value with a node with a null * value will give the same cost as to insert the first node. * This can be useful if you tree has "fake" nodes. * @param labelDist The label distance. */ public TreeAlign(TreeAlignLabelDistanceAsymmetric<ValueType1,ValueType2> labelDist) { this.labelDist = labelDist; } private class ConvertTreeToArray<ValueType> { private int nextNodeIndex = 0; private TreeData<ValueType> treeData; public ConvertTreeToArray(TreeData<ValueType> treeData) { this.treeData = treeData; } private void convertTreeToArrayAux( Tree<ValueType> subtree, int[] siblingIndexes, int siblingNumber) throws TreeAlignException { // We want it in postorder, so first we put the children List<Tree<ValueType>> children = subtree.getChildren(); int numberOfChildren = children.size(); int[] childrenIndexes = new int[numberOfChildren]; int myIndex = -1; { int i = 0; for (Tree<ValueType> child: children) { convertTreeToArrayAux(child, childrenIndexes, i); i++; } } // Compute the maximum degree if (numberOfChildren > treeData.degree) { treeData.degree = numberOfChildren; } // Now we add the node (root of the given subtree). myIndex = nextNodeIndex; nextNodeIndex++; treeData.nodes[myIndex] = subtree; // Record how many children I have treeData.degrees[myIndex] = numberOfChildren; // Store my value in an array ValueType v = subtree.getValue(); treeData.values[myIndex] = v; // Tell the caller my index siblingIndexes[siblingNumber] = myIndex; // Record my children indexes treeData.children[myIndex] = childrenIndexes; } /** * Reads: treeData.tree * Computes: treeData.nodes, treeData.degree, treeData.degrees * treeData.fathers, treeData.children, treeData.size, * treeData.values * Converts a tree to an array of nodes, in postorder. * We also compute the maximum node degree in the tree. * @throws TreeAlignException */ @SuppressWarnings("unchecked") public void convert() throws TreeAlignException { treeData.degree = 0; treeData.size = treeData.tree.countNodes(); // we didn't write new Tree<ValueType>[treeData.size] because // java does not support generics with arrays treeData.nodes = new Tree[treeData.size]; treeData.children = new int[treeData.size][]; treeData.degrees = new int[treeData.size]; treeData.values = (ValueType[]) new Object[treeData.size]; int rootIndex[] = new int[1]; convertTreeToArrayAux(treeData.tree, rootIndex, 0); } } /** * For arrays that take at least O(|T1|*|T2|) we take care * not to use too big data types. */ private class Aligner { /** * The first tree. */ private TreeData<ValueType1> treeData1; /** * The second tree. */ private TreeData<ValueType2> treeData2; /** * DF1[i][j_t] is DFL for (i,j,s,t) with s=0. * See description of DFL in Aligner.computeAlignmentP1(). * DF1 and DF2 are the "big" arrays, ie. those that may the space * complexity what it is. */ private float[][][][] DF1; /** * DF2[j][i_s] is DFL for (i,j,s,t) with t=0. * See description of DFL in Aligner.computeAlignmentP1(). */ private float[][][][] DF2; /** * This arrays have the same shape as respectively DF1. * They are used to remember which term in the minimum won, so that * we can compute the alignment. * Decision1 is a case number (< 10) * and Decision2 is a child index, hence the types. */ private byte[][][][] DF1Decisions1; private short[][][][] DF1Decisions2; /** * This arrays have the same shape as respectively DF2. * They are used to remember which term in the minimum won, so that * we can compute the alignment. */ private byte[][][][] DF2Decisions1; private short[][][][] DF2Decisions2; /** * Distances between subtrees. * DT[i][j] is the distance between the subtree rooted at i in the first tree * and the subtree rooted at j in the second tree. */ private float[][] DT; /** * This array has the same shape as DT, but is used to remember which * case gave the minimum, so that we can later compute the alignment. */ private byte[][] DTDecisions1; private short[][] DTDecisions2; /** * Distances between labels. * DL[i][j] is the distance labelDist.f(value(T1[i]), value(T2[i])). * By convention, we say that value(T1[|T1|]) = null * and value(T2[|T2|]) = null */ private float[][] DL; /** * DET1[i] is the distance between the empty tree and T1[i] * (the subtree rooted at node i in the first tree). */ private float[] DET1; /** * Same as DET1, but for second tree. */ private float[] DET2; /** * DEF1[i] is the distance between the empty forest and F1[i] * (the forest of children of node i in the first tree). */ private float[] DEF1; /** * Same as DEF1, but for second tree. */ private float[] DEF2; /** * @param i node in T1 * @param s number of first child of i to consider * @param m_i degree of i * @param j node in T2 * @param t number of first child of j to consider * @param n_j degree of j * @param DFx which array to fill (DF1 or DF2) */ private void computeAlignmentP1(int i, int s, int m_i, int j, int t, int n_j, int DFx) { /** * DFL[pr][qr] is D(F1[i_s, i_p], F2[j_t, j_q]) * where p=s+pr-1 and q=t+qr-1 (ie. pr=p-s+1 and qr=q-t+1) * By convention, F1[i_s, i_{s-1}] and F2[j_t, j_{t-1}] are the * empty forests. * Said differently, DFL[pr][qr] is the distance between the forest * of the pr first children of i, starting with child s * (first child is s = 0), and the forest of the qr first children * of j, starting with child t (first child is t = 0). * This array is allocated for a fixed value of (i,j,s,t). */ float[][] DFL; /** * Same shape as DFL, but to remember which term gave the min, * so that we can later compute the alignment. */ byte[][] DFLDecisions1; short[][] DFLDecisions2; DFL = new float[m_i-s+2][n_j-t+2]; DFL[0][0] = 0; // D(empty forest, empty forest) = 0 DFLDecisions1 = new byte[m_i-s+2][n_j-t+2]; DFLDecisions2 = new short[m_i-s+2][n_j-t+2]; // Compute indexes of i_s and j_t because we will need them int i_s = m_i != 0 ? treeData1.children[i][s] : -1; int j_t = n_j != 0 ? treeData2.children[j][t] : -1; for (int p=s; p<m_i; p++) { DFL[p-s+1][0] = DFL[p-s][0] + DET1[treeData1.children[i][p]]; } for (int q=t; q<n_j; q++) { DFL[0][q-t+1] = DFL[0][q-t] + DET2[treeData2.children[j][q]]; } for (int p=s; p<m_i; p++) { int i_p = treeData1.children[i][p]; for (int q=t; q<n_j; q++) { int j_q = treeData2.children[j][q]; float min = Float.POSITIVE_INFINITY; int decision1 = -1; int decision2 = -1; // Lemma 3 - Case: We delete the rightmost tree of T1 { float minCandidate = DFL[p-s][q-t+1] + DET1[i_p]; if (minCandidate < min) { min = minCandidate; decision1 = 1; } } // Lemma 3 - Case: We insert the rightmost tree of T2 (symmetric of previous case) { float minCandidate = DFL[p-s+1][q-t] + DET2[j_q]; if (minCandidate < min) { min = minCandidate; decision1 = 2; } } // Lemma 3 - Case: Align rightmost trees with each other { float minCandidate = DFL[p-s][q-t] + DT [i_p] [j_q]; if (minCandidate < min) { min = minCandidate; decision1 = 3; } } // Lemma 3 - Case: We cut the T1 forest and match the first part // with the T2 forest except the rightmost tree, and we match the second // part with the T2 rightmost tree's forest of children { float minCandidate = Float.POSITIVE_INFINITY; int best_k = -1; for (int k=s; k<p; k++) { float d = DFL[k-s][q-t] + DF2 [j_q] [treeData1.children[i][k]] [p-k+1] [treeData2.degrees[j_q]]; if (d < minCandidate) { minCandidate = d; best_k = k; } } minCandidate += DL[treeData1.size][j_q]; if (minCandidate < min) { min = minCandidate; decision1 = 4; decision2 = best_k; } } // Lemma 3 - Case: Syemmetric of preivous case { float minCandidate = Float.POSITIVE_INFINITY; int best_k = -1; for (int k=t; k<q; k++) { float d = DFL[p-s][k-t] + DF1 [i_p] [treeData2.children[j][k]] [treeData1.degrees[i_p]] [q-k+1]; if (d < minCandidate) { minCandidate = d; best_k = k; } } minCandidate += DL[i_p][treeData2.size]; if (minCandidate < min) { min = minCandidate; decision1 = 5; decision2 = best_k; } } DFL[p-s+1][q-t+1] = min; DFLDecisions1[p-s+1][q-t+1] = (byte) decision1; DFLDecisions2[p-s+1][q-t+1] = (short) decision2; } } // Copy references to DFL to persistent arrays if (DFx == 2) { DF2[j][i_s] = DFL; DF2Decisions1[j][i_s] = DFLDecisions1; DF2Decisions2[j][i_s] = DFLDecisions2; } else { DF1[i][j_t] = DFL; DF1Decisions1[i][j_t] = DFLDecisions1; DF1Decisions2[i][j_t] = DFLDecisions2; } } public float align() throws TreeAlignException { (new ConvertTreeToArray<ValueType1>(treeData1)).convert(); (new ConvertTreeToArray<ValueType2>(treeData2)).convert(); // Allocate necessary arrays DT = new float[treeData1.size][treeData2.size]; DTDecisions1 = new byte[treeData1.size][treeData2.size]; DTDecisions2 = new short[treeData1.size][treeData2.size]; DL = new float[treeData1.size+1][treeData2.size+1]; DET1 = new float[treeData1.size]; DET2 = new float[treeData2.size]; DEF1 = new float[treeData1.size]; DEF2 = new float[treeData2.size]; DF1 = new float[treeData1.size][treeData2.size][][]; DF1Decisions1 = new byte[treeData1.size][treeData2.size][][]; DF1Decisions2 = new short[treeData1.size][treeData2.size][][]; DF2 = new float[treeData2.size][treeData1.size][][]; DF2Decisions1 = new byte[treeData2.size][treeData1.size][][]; DF2Decisions2 = new short[treeData2.size][treeData1.size][][]; DL[treeData1.size][treeData2.size] = (float) labelDist.f(null, null); for (int i=0; i<treeData1.size; i++) { int m_i = treeData1.degrees[i]; DEF1[i] = 0; for (int k=0; k<m_i; k++) { DEF1[i] += DET1[treeData1.children[i][k]]; } DL[i][treeData2.size] = (float) labelDist.f((ValueType1) treeData1.values[i], null); DET1[i] = DEF1[i] + DL[i][treeData2.size]; } for (int j=0; j<treeData2.size; j++) { int n_j = treeData2.degrees[j]; DEF2[j] = 0; for (int k=0; k<n_j; k++) { DEF2[j] += DET2[treeData2.children[j][k]]; } DL[treeData1.size][j] = (float) labelDist.f(null, (ValueType2) treeData2.values[j]); DET2[j] = DEF2[j] + DL[treeData1.size][j]; } for (int i=0; i<treeData1.size; i++) { int m_i = treeData1.degrees[i]; for (int j=0; j<treeData2.size; j++) { int n_j = treeData2.degrees[j]; // Precompute f(value(i), value(j)) and keep the result // to avoid calling f on the same values several times. // This is important in case the computation of f takes // long. DL[i][j] = (float) labelDist.f((ValueType1) treeData1.values[i], (ValueType2) treeData2.values[j]); for (int s=0; s<m_i; s++) { computeAlignmentP1(i, s, m_i, j, 0, n_j, 2); } for (int t=0; t<n_j; t++) { computeAlignmentP1(i, 0, m_i, j, t, n_j, 1); } DT[i][j] = Float.POSITIVE_INFINITY; // Lemma 2 - Case: Root is (blank, j) { float minCandidate = Float.POSITIVE_INFINITY; int best_r = -1; for (int r=0; r<n_j; r++) { float d = DT[i][treeData2.children[j][r]] - DET2[treeData2.children[j][r]]; if (d < minCandidate) { minCandidate = d; best_r = r; } } minCandidate += DET2[j]; if (minCandidate < DT[i][j]) { DT[i][j] = minCandidate; DTDecisions1[i][j] = 1; DTDecisions2[i][j] = (short) best_r; } } // Lemma 2 - Case: Root is (i, blank) { float minCandidate = Float.POSITIVE_INFINITY; int best_r = -1; for (int r=0; r<m_i; r++) { float d = DT[treeData1.children[i][r]][j] - DET1[treeData1.children[i][r]]; if (d < minCandidate) { minCandidate = d; best_r = r; } } minCandidate += DET1[i]; if (minCandidate < DT[i][j]) { DT[i][j] = minCandidate; DTDecisions1[i][j] = 2; DTDecisions2[i][j] = (short) best_r; } } // Lemma 2 - Case: Root is (i,j) { float minCandidate; if (n_j != 0) { minCandidate = DF1 [i] [treeData2.children[j][0]] [m_i] [n_j]; } else { if (m_i != 0) { minCandidate = DF2 [j] [treeData1.children[i][0]] [m_i] [n_j]; } else { minCandidate = 0; // D(empty forest, empty forest) = 0 } } minCandidate += DL[i][j]; if (minCandidate < DT[i][j]) { DT[i][j] = minCandidate; DTDecisions1[i][j] = 3; } } } } // We return the distance beetween T1[root] and T2[root]. return DT[treeData1.size-1][treeData2.size-1]; } public Aligner(Tree<ValueType1> T1, Tree<ValueType2> T2) { treeData1 = new TreeData<ValueType1>(); treeData1.tree = T1; treeData2 = new TreeData<ValueType2>(); treeData2.tree = T2; } /** Align F1[i_s,i_p] with F2[j_t,j_q]. * If p = s-1, by convention it means F1[i_s,i_p] = empty forest. * Idem for q=t-1. */ private List<Tree<AlignedNode<ValueType1,ValueType2>>> computeForestAlignment(int i, int s, int p, int j, int t, int q) { if (p == s-1) { // left forest is the empty forest List<Tree<AlignedNode<ValueType1,ValueType2>>> result = new ArrayList<Tree<AlignedNode<ValueType1,ValueType2>>>(); for (int k=t; k<=q; k++) { result.add(treeInserted(treeData2.children[j][k])); } return result; } else { if (q == t-1) { // right forest is the empty forest List<Tree<AlignedNode<ValueType1,ValueType2>>> result = new ArrayList<Tree<AlignedNode<ValueType1,ValueType2>>>(); for (int k=s; k<=p; k++) { result.add(treeDeleted(treeData1.children[i][k])); } return result; } else { // both forests are non-empty int decision1, k; if (s == 0) { decision1 = DF1Decisions1 [i] [treeData2.children[j][t]] [p-s+1] [q-t+1]; k = DF1Decisions2 [i] [treeData2.children[j][t]] [p-s+1] [q-t+1]; } else if (t == 0) { decision1 = DF2Decisions1 [j] [treeData1.children[i][s]] [p-s+1] [q-t+1]; k = DF2Decisions2 [j] [treeData1.children[i][s]] [p-s+1] [q-t+1]; } else { throw (new Error("TreeAlignSymmetric bug: both s and t are non-zero")); } switch (decision1) { case 1: { List<Tree<AlignedNode<ValueType1,ValueType2>>> result; result = computeForestAlignment(i, s, p-1, j, t, q); result.add(treeDeleted(treeData1.children[i][p])); return result; } case 2: { List<Tree<AlignedNode<ValueType1,ValueType2>>> result; result = computeForestAlignment(i, s, p, j, t, q-1); result.add(treeInserted(treeData2.children[j][q])); return result; } case 3: { List<Tree<AlignedNode<ValueType1,ValueType2>>> result; result = computeForestAlignment(i, s, p-1, j, t, q-1); result.add(computeTreeAlignment(treeData1.children[i][p], treeData2.children[j][q])); return result; } case 4: { List<Tree<AlignedNode<ValueType1,ValueType2>>> result; result = computeForestAlignment(i, s, k-1, j, t, q-1); int j_q = treeData2.children[j][q]; Tree<AlignedNode<ValueType1,ValueType2>> insertedNode = new Tree<AlignedNode<ValueType1,ValueType2>>(); AlignedNode<ValueType1,ValueType2> insertedNodeValue = new AlignedNode<ValueType1,ValueType2>(); insertedNodeValue.setLeftNode(null); insertedNodeValue.setRightNode((Tree<ValueType2>) treeData2.nodes[j_q]); insertedNode.setValue(insertedNodeValue); insertedNode.replaceChildrenListBy(computeForestAlignment(i, k, p, j_q, 0, treeData2.degrees[j_q]-1)); result.add(insertedNode); return result; } case 5: { List<Tree<AlignedNode<ValueType1,ValueType2>>> result; result = computeForestAlignment(i, s, p-1, j, t, k-1); int i_p = treeData1.children[i][p]; Tree<AlignedNode<ValueType1,ValueType2>> deletedNode = new Tree<AlignedNode<ValueType1,ValueType2>>(); AlignedNode<ValueType1,ValueType2> deletedNodeValue = new AlignedNode<ValueType1,ValueType2>(); deletedNodeValue.setLeftNode((Tree<ValueType1>) treeData1.nodes[i_p]); deletedNodeValue.setRightNode(null); deletedNode.setValue(deletedNodeValue); deletedNode.replaceChildrenListBy(computeForestAlignment(i_p, 0, treeData1.degrees[i_p]-1, j, k, q)); result.add(deletedNode); return result; } default: throw (new Error("TreeAlign: decision1 = " + decision1)); } } } } /** * Align T1[i] with the empty tree. * @return the alignment */ private Tree<AlignedNode<ValueType1,ValueType2>> treeDeleted(int i) { Tree<AlignedNode<ValueType1,ValueType2>> root = new Tree<AlignedNode<ValueType1,ValueType2>>(); AlignedNode<ValueType1,ValueType2> alignedNode = new AlignedNode<ValueType1,ValueType2>(); alignedNode.setLeftNode(treeData1.nodes[i]); alignedNode.setRightNode(null); root.setValue(alignedNode); for (int r = 0; r<treeData1.degrees[i]; r++) { root.getChildren().add(treeDeleted(treeData1.children[i][r])); } return root; } /** * Align the empty tree with T2[j]. * @return the alignment */ private Tree<AlignedNode<ValueType1,ValueType2>> treeInserted(int j) { Tree<AlignedNode<ValueType1,ValueType2>> root = new Tree<AlignedNode<ValueType1,ValueType2>>(); AlignedNode<ValueType1,ValueType2> alignedNode = new AlignedNode<ValueType1,ValueType2>(); alignedNode.setLeftNode(null); alignedNode.setRightNode(treeData2.nodes[j]); root.setValue(alignedNode); for (int r = 0; r<treeData2.degrees[j]; r++) { root.getChildren().add(treeInserted(treeData2.children[j][r])); } return root; } private Tree<AlignedNode<ValueType1,ValueType2>> computeTreeAlignment(int i, int j) { switch (DTDecisions1[i][j]) { case 1: { Tree<AlignedNode<ValueType1,ValueType2>> root = new Tree<AlignedNode<ValueType1,ValueType2>>(); // Compute the value of the node AlignedNode<ValueType1,ValueType2> alignedNode = new AlignedNode<ValueType1,ValueType2>(); alignedNode.setLeftNode(null); alignedNode.setRightNode(treeData2.nodes[j]); root.setValue(alignedNode); // Compute the children for (int r = 0; r<treeData2.degrees[j]; r++) { if (r == DTDecisions2[i][j]) { root.getChildren().add(computeTreeAlignment(i, treeData2.children[j][r])); } else { root.getChildren().add(treeInserted(treeData2.children[j][r])); } } return root; } case 2: { Tree<AlignedNode<ValueType1,ValueType2>> root = new Tree<AlignedNode<ValueType1,ValueType2>>(); // Compute the value of the node AlignedNode<ValueType1,ValueType2> alignedNode = new AlignedNode<ValueType1,ValueType2>(); alignedNode.setLeftNode(treeData1.nodes[i]); alignedNode.setRightNode(null); root.setValue(alignedNode); // Compute the children for (int r = 0; r<treeData1.degrees[i]; r++) { if (r == DTDecisions2[i][j]) { root.getChildren().add(computeTreeAlignment(treeData1.children[i][r], j)); } else { root.getChildren().add(treeDeleted(treeData1.children[i][r])); } } return root; } case 3: { Tree<AlignedNode<ValueType1,ValueType2>> root = new Tree<AlignedNode<ValueType1,ValueType2>>(); // Compute the value of the node AlignedNode<ValueType1,ValueType2> alignedNode = new AlignedNode<ValueType1,ValueType2>(); alignedNode.setLeftNode(treeData1.nodes[i]); alignedNode.setRightNode(treeData2.nodes[j]); root.setValue(alignedNode); // Compute the children List<Tree<AlignedNode<ValueType1,ValueType2>>> children = computeForestAlignment(i, 0, treeData1.degrees[i]-1, j, 0, treeData2.degrees[j]-1); root.replaceChildrenListBy(children); return root; } default: throw (new Error("TreeAlign: DTDecisions1[i][j] = " + DTDecisions1[i][j])); } } public Tree<AlignedNode<ValueType1,ValueType2>> computeAlignment() { return computeTreeAlignment(treeData1.size-1, treeData2.size-1); } } /** * Align T1 with T2, computing both the distance and the alignment. * Time: O(|T1|*|T2|*(deg(T1)+deg(T2))^2) * Space: O(|T1|*|T2|*(deg(T1)+deg(T2))) * Average (over possible trees) time: O(|T1|*|T2|) * @param T1 The first tree. * @param T2 The second tree. * @return The distance and the alignment. * @throws TreeAlignException */ public TreeAlignResult<ValueType1, ValueType2> align(Tree<ValueType1> T1, Tree<ValueType2> T2) throws TreeAlignException { TreeAlignResult<ValueType1, ValueType2> result = new TreeAlignResult<ValueType1, ValueType2>(); Aligner aligner = new Aligner(T1, T2); result.setDistance(aligner.align()); result.setAlignment(aligner.computeAlignment()); return result; } /** * Takes a alignment, and compute the distance between the two * original trees. If you have called align(), the result object already * contains the distance D and the alignment A. If you call * distanceFromAlignment on the alignment A it will compute the distance D. */ public float distanceFromAlignment(Tree<AlignedNode<ValueType1,ValueType2>> alignment) { Tree<ValueType1> originalT1Node; Tree<ValueType2> originalT2Node; originalT1Node = alignment.getValue().getLeftNode(); originalT2Node = alignment.getValue().getRightNode(); float d = (float) labelDist.f( originalT1Node != null ? originalT1Node.getValue() : null, originalT2Node != null ? originalT2Node.getValue() : null); for (Tree<AlignedNode<ValueType1,ValueType2>> child: alignment.getChildren()) { d += distanceFromAlignment(child); } return d; } }