package com.interview.suffixprefix;

import java.util.ArrayList;
import java.util.List;

/**
 * Date 06/01/2015
 *
 * Construct suffix tree using Ukkonen's algorithm.
 *
 * Solution
 * Rule 1: For phase i+1 if S[j..i] ends at last character of leaf edge then add S[i+1] at
 * the end.
 * Rule 2: For phase i+1 if S[j..i] ends somewhere in middle of edge and next character is
 * not S[i+1] then a new leaf edge with label S[i+1] should be created.
 * Rule 3: For phase i+1 if S[j..i] ends somewhere in middle of edge and next character is
 * S[i+1] then do nothing (resulting in implicit tree).
 *
 * Suffix Link:
 * For every node with label x@ where x is a single character and @ is a possibly empty substring
 * there is another node with label @. That node is the suffix link of the first node. If @ is
 * empty then the suffix link is root.
 *
 * Trick1
 * Skip/Count trick
 * While traveling down, if the number of characters on the edge is less than the number of
 * characters to traverse then skip directly to the end of the edge. If the number of characters
 * on the label is more than the number of characters to traverse then go directly to the
 * character we care about.
 *
 * Edge-label compression
 * Instead of storing actual characters on the path store start and end indices on the
 * path.
 *
 * Trick2 - Stop the phase as soon as you hit rule 3. Rule 3 is a show stopper.
 *
 * Trick3 - Keep a global end on leaves to do rule 1 extension.
 *
 * Active point - It is the point from which traversal starts for the next extension or next phase.
 * Active point always starts from root. Other extensions will get the active point set up
 * correctly by the previous extension.
 *
 * Active node - Node from which the active point will start.
 * Active edge - It is used to choose the edge from the active node. It holds the index of a character.
 * Active length - How far to go on the active edge.
 *
 * Active point rules
 * 1) If rule 3 extension is applied then active length will increment by 1 if active length is
 *    not greater than the length of the path on the edge.
 * 2) If rule 3 extension is applied and active length gets greater than the length of the path
 *    on the edge then change active node, active edge and active length.
 * 3) If active length is 0 then always start looking for the character from root.
 * 4) If rule 2 extension is applied and active node is root then active edge is active edge + 1
 *    and active length is active length - 1.
 * 5) If rule 2 extension is applied and active node is not root then follow the suffix link,
 *    make the suffix link the active node and do not change anything else.
 *
 * Test cases
 * adeacdade
 * abcabxabcd
 * abcdefabxybcdmnabcdex
 * abcadak
 * dedododeodo
 * abcabxabcd
 * mississippi
 * banana
 * ooooooooo
 *
 * References
 * http://web.stanford.edu/~mjkay/gusfield.pdf
 * http://www.geeksforgeeks.org/ukkonens-suffix-tree-construction-part-6/
 * https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
 * https://gist.github.com/axefrog/2373868
 *
 * @author tusroy
 */
public class SuffixTree {

    public static void main(String[] args) {
        SuffixTree st = new SuffixTree("mississippi".toCharArray());
        st.build();
        st.dfsTraversal();
        System.out.println(st.validate());
    }

    /** Terminal character appended to the input so that no suffix is a prefix of another. */
    private static final char UNIQUE_CHAR = '$';

    /** Sentinel returned by {@link #nextChar(int)}; never equal to any {@code char} value. */
    private static final int END_OF_PATH = -1;

    private SuffixNode root;
    private Active active;
    private int remainingSuffixCount;
    private End end;
    private final char[] input;

    /**
     * Copies {@code input} and appends the terminal character {@code '$'}.
     *
     * @param input text to index; it is not modified
     * @throws IllegalArgumentException if {@code input} already contains {@code '$'} (the
     *         terminal would not be unique, leaving some suffixes without a leaf) or a character
     *         that does not fit the per-node child table
     * @throws NullPointerException if {@code input} is null
     */
    public SuffixTree(char[] input) {
        for (int i = 0; i < input.length; i++) {
            char c = input[i];
            if (c == UNIQUE_CHAR) {
                throw new IllegalArgumentException(
                        "input contains the reserved terminal character '" + UNIQUE_CHAR
                                + "' at index " + i);
            }
            if (c >= SuffixNode.TOTAL) {
                throw new IllegalArgumentException(
                        "character code " + (int) c + " at index " + i
                                + " is outside the supported range [0, " + SuffixNode.TOTAL + ")");
            }
        }
        this.input = new char[input.length + 1];
        System.arraycopy(input, 0, this.input, 0, input.length);
        this.input[input.length] = UNIQUE_CHAR;
    }

    /**
     * Builds the suffix tree, running one phase per character of the terminated input, and then
     * assigns the suffix start index to every leaf.
     *
     * @throws IllegalStateException if suffixes are still pending after the last phase
     */
    public void build() {
        root = SuffixNode.createNode(1, new End(0));
        root.index = -1;
        active = new Active(root);
        this.end = new End(-1);
        // Reset so that build() can be invoked again on the same instance.
        remainingSuffixCount = 0;

        // loop through string to start new phase
        for (int i = 0; i < input.length; i++) {
            startPhase(i);
        }

        if (remainingSuffixCount != 0) {
            throw new IllegalStateException(
                    "suffix tree construction left " + remainingSuffixCount
                            + " suffix(es) without a leaf");
        }
        // finally walk the tree again and set up the index.
        setIndexUsingDfs(root, 0, input.length);
    }

    /**
     * Runs phase {@code i}: extends the tree with character {@code input[i]}.
     */
    private void startPhase(int i) {
        // set lastCreatedInternalNode to null before start of every phase.
        SuffixNode lastCreatedInternalNode = null;
        // global end for leaf. Does rule 1 extension as per trick 3 by incrementing end.
        end.end++;
        // these many suffixes need to be created.
        remainingSuffixCount++;

        while (remainingSuffixCount > 0) {
            // if active length is 0 then look for current character from root.
            if (active.activeLength == 0) {
                SuffixNode existing = selectNode(i);
                if (existing != null) {
                    // current character already starts an edge: rule 3 extension and
                    // trick 2 (show stopper).
                    active.activeEdge = existing.start;
                    active.activeLength++;
                    break;
                }
                // create a new leaf node with current character. This is rule 2 extension.
                root.child[input[i]] = SuffixNode.createNode(i, end);
                remainingSuffixCount--;
                continue;
            }

            // active length is not 0, so we are somewhere in the middle of a path. Check whether
            // the next character is the same as the current character.
            int ch = nextChar(i);

            if (ch == END_OF_PATH) {
                // We are looking for the new character from the end of the current edge. The
                // internal node already exists, so only a leaf is added below it before moving
                // on to the next suffix.
                SuffixNode node = selectNode();
                node.child[input[i]] = SuffixNode.createNode(i, end);
                if (lastCreatedInternalNode != null) {
                    lastCreatedInternalNode.suffixLink = node;
                }
                lastCreatedInternalNode = node;
                advanceToNextSuffix();
            } else if (ch == input[i]) {
                // next character is same as current character: rule 3 extension and trick 2
                // (show stopper).
                // If lastCreatedInternalNode is not null a rule 2 extension happened before this;
                // point its suffix link to the node selected by the active point.
                // TODO - Could be wrong here. Do we only do this when walk down goes past a node
                // or do we do it every time.
                if (lastCreatedInternalNode != null) {
                    lastCreatedInternalNode.suffixLink = selectNode();
                }
                // walk down and update active point as per the rules for rule 3 extension.
                walkDown(i);
                break;
            } else {
                // next character differs from current character, so split the edge with a new
                // internal node as per rule 2 extension.
                SuffixNode node = selectNode();
                int oldStart = node.start;
                node.start = node.start + active.activeLength;
                // create new internal node covering the matched prefix of the old edge
                SuffixNode newInternalNode =
                        SuffixNode.createNode(oldStart, new End(oldStart + active.activeLength - 1));
                // create new leaf node
                SuffixNode newLeafNode = SuffixNode.createNode(i, this.end);

                // The internal node's children are the shortened old node and the new leaf.
                // node.start is now oldStart + activeLength, i.e. the first unmatched character.
                newInternalNode.child[input[node.start]] = node;
                newInternalNode.child[input[i]] = newLeafNode;
                newInternalNode.index = -1;
                active.activeNode.child[input[newInternalNode.start]] = newInternalNode;

                // if another internal node was created in the last extension of this phase then
                // the suffix link of that node will be this node.
                if (lastCreatedInternalNode != null) {
                    lastCreatedInternalNode.suffixLink = newInternalNode;
                }
                // Remember this node; if a new internal node is created in the next extension of
                // this phase its predecessor's link is re-pointed. Meanwhile link to root.
                lastCreatedInternalNode = newInternalNode;
                newInternalNode.suffixLink = root;
                advanceToNextSuffix();
            }
        }
    }

    /**
     * Moves the active point to the next shorter suffix after a rule 2 extension and records
     * that one pending suffix has been inserted.
     */
    private void advanceToNextSuffix() {
        if (active.activeNode != root) {
            // if active node is not root then follow suffix link
            active.activeNode = active.activeNode.suffixLink;
        } else {
            // if active node is root then increase active edge by one and decrease active
            // length by 1
            active.activeEdge = active.activeEdge + 1;
            active.activeLength--;
        }
        remainingSuffixCount--;
    }

    /**
     * Updates the active point after a rule 3 extension in phase {@code index}.
     */
    private void walkDown(int index) {
        SuffixNode node = selectNode();
        // active length is greater than path edge length.
        // walk past current node so change active point.
        // This is as per rules of walk down for rule 3 extension.
        if (diff(node) < active.activeLength) {
            active.activeNode = node;
            active.activeLength = active.activeLength - diff(node);
            active.activeEdge = node.child[input[index]].start;
        } else {
            active.activeLength++;
        }
    }

    /**
     * Finds the next character to be compared with the current phase character, hopping the
     * active point over whole edges when the active length exceeds the edge (skip/count trick).
     *
     * @return the character following the active point, or {@link #END_OF_PATH} when the active
     *         point sits exactly at the end of an edge whose node has no child for
     *         {@code input[i]}
     */
    private int nextChar(int i) {
        SuffixNode node = selectNode();
        int span = diff(node);
        if (span >= active.activeLength) {
            return input[node.start + active.activeLength];
        }
        if (span + 1 == active.activeLength) {
            if (node.child[input[i]] != null) {
                return input[i];
            }
            return END_OF_PATH;
        }
        active.activeNode = node;
        active.activeLength = active.activeLength - span - 1;
        active.activeEdge = active.activeEdge + span + 1;
        return nextChar(i);
    }

    /** Child of the active node on the active edge. */
    private SuffixNode selectNode() {
        return active.activeNode.child[input[active.activeEdge]];
    }

    /** Child of the active node keyed by {@code input[index]}. */
    private SuffixNode selectNode(int index) {
        return active.activeNode.child[input[index]];
    }

    /** Edge length minus one: distance between the node's start and end indices. */
    private int diff(SuffixNode node) {
        return node.end.end - node.start;
    }

    /**
     * Assigns to every leaf the start index of the suffix it represents.
     *
     * @param node subtree to process; null is ignored
     * @param val number of characters on the path above {@code node}
     * @param size length of the terminated input
     */
    private void setIndexUsingDfs(SuffixNode node, int val, int size) {
        if (node == null) {
            return;
        }

        val += node.end.end - node.start + 1;
        // Root and internal nodes carry index -1; anything else is a leaf.
        if (node.index != -1) {
            node.index = size - val;
            return;
        }

        for (SuffixNode child : node.child) {
            setIndexUsingDfs(child, val, size);
        }
    }

    /**
     * Do a DFS traversal of the tree, printing every suffix followed by its start index.
     *
     * @throws IllegalStateException if {@link #build()} has not been called
     */
    public void dfsTraversal() {
        requireBuilt();
        List<Character> result = new ArrayList<>();
        for (SuffixNode node : root.child) {
            dfsTraversal(node, result);
        }
    }

    private void dfsTraversal(SuffixNode node, List<Character> result) {
        if (node == null) {
            return;
        }
        int added = 0;
        for (int i = node.start; i <= node.end.end; i++) {
            result.add(input[i]);
            added++;
        }
        if (node.index != -1) {
            // leaf: the accumulated path is a complete suffix
            result.forEach(System.out::print);
            System.out.println(" " + node.index);
        } else {
            for (SuffixNode child : node.child) {
                dfsTraversal(child, result);
            }
        }
        // drop this edge's label again before returning to the parent
        result.subList(result.size() - added, result.size()).clear();
    }

    /**
     * Do validation of the tree by comparing all suffixes and their index at leaf node.
     */
    private boolean validate(SuffixNode node, char[] text, int index, int curr) {
        if (node == null) {
            System.out.println("Failed at " + curr + " for index " + index);
            return false;
        }

        if (node.index != -1) {
            if (node.index != index) {
                System.out.println("Index not same. Failed at " + curr + " for index " + index);
                return false;
            } else {
                return true;
            }
        }
        if (curr >= text.length) {
            System.out.println("Index not same. Failed at " + curr + " for index " + index);
            return false;
        }

        SuffixNode next = node.child[text[curr]];
        if (next == null) {
            System.out.println("Failed at " + curr + " for index " + index);
            return false;
        }
        int j = 0;
        for (int i = next.start; i <= next.end.end; i++) {
            if (text[curr + j] != text[i]) {
                System.out.println("Mismatch found " + text[curr + j] + " " + text[i]);
                return false;
            }
            j++;
        }
        curr += next.end.end - next.start + 1;
        return validate(next, text, index, curr);
    }

    /**
     * Checks that every suffix of the terminated input can be followed from the root to a leaf
     * carrying that suffix's start index.
     *
     * @return true if all suffixes are found; false (after printing a diagnostic) otherwise
     * @throws IllegalStateException if {@link #build()} has not been called
     */
    public boolean validate() {
        requireBuilt();
        for (int i = 0; i < this.input.length; i++) {
            if (!validate(this.root, this.input, i, i)) {
                System.out.println("Failed validation");
                return false;
            }
        }
        return true;
    }

    /** Fails fast with a clear message instead of a NullPointerException on an unbuilt tree. */
    private void requireBuilt() {
        if (root == null) {
            throw new IllegalStateException("build() must be called before using the tree");
        }
    }
}

/**
 * Tree node. The edge leading into the node is labelled by the input range
 * {@code [start, end.end]}.
 */
class SuffixNode {

    /** Size of the per-node child table; child slots are indexed by character code. */
    static final int TOTAL = 256;

    SuffixNode[] child = new SuffixNode[TOTAL];

    int start;
    End end;
    int index;

    SuffixNode suffixLink;

    private SuffixNode() {
    }

    public static SuffixNode createNode(int start, End end) {
        SuffixNode node = new SuffixNode();
        node.start = start;
        node.end = end;
        return node;
    }

    @Override
    public String toString() {
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < child.length; i++) {
            if (child[i] != null) {
                buffer.append((char) i).append(' ');
            }
        }
        return "SuffixNode [start=" + start + "]" + " " + buffer.toString();
    }
}

/** Mutable end index; one instance is shared by all leaves so a single increment extends them. */
class End {
    public End(int end) {
        this.end = end;
    }

    int end;
}

/** Active point: node, edge (as an index into the input) and length along that edge. */
class Active {
    Active(SuffixNode node) {
        activeLength = 0;
        activeNode = node;
        activeEdge = -1;
    }

    @Override
    public String toString() {
        return "Active [activeNode=" + activeNode + ", activeIndex="
                + activeEdge + ", activeLength=" + activeLength + "]";
    }

    SuffixNode activeNode;
    int activeEdge;
    int activeLength;
}