package com.jwetherell.algorithms.data_structures;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import com.jwetherell.algorithms.data_structures.interfaces.ISuffixTree;
/**
 * A suffix tree is a data structure that presents the suffixes of a given
 * string in a way that allows for a particularly fast implementation of many
 * important string operations. This implementation is based on Ukkonen's
 * algorithm.
 * <p>
 * Nodes are identified by integers (the root is node {@code 0}). Every edge
 * is labelled with an inclusive range of indices into the indexed string and
 * is stored in a sorted map under a key derived from its start node and the
 * first character of its label.
 *
 * http://en.wikipedia.org/wiki/Suffix_tree
 *
 * @author Justin Wetherell <phishman3579@gmail.com>
 */
public class SuffixTree<C extends CharSequence> implements ISuffixTree<C> {

    private static final char DEFAULT_END_SEQ_CHAR = '$';

    /** The indexed text, including the end-of-sequence character. */
    private final String string;
    /** {@link #string} as an array, for cheap per-character access. */
    private final char[] characters;
    /** Character that marks the end of the indexed sequence. */
    private final char endSeqChar;
    /** Suffix links, keyed by node id. Only nodes created by an edge split have one. */
    private final Map<Integer, Link> linksMap = new HashMap<Integer, Link>();
    /** All edges, keyed by {@link Edge#key(int, char)} so that the edges of one node are adjacent. */
    private final TreeMap<Long, Edge<C>> edgeMap = new TreeMap<Long, Edge<C>>();

    /** Id handed to the next node that is created; node 0 is the root. */
    private int nextNode = 1;

    // Active point used while building the tree: the origin node plus the
    // inclusive range of characters that still has to be walked down from it.
    private int currentNode = 0;
    private int firstCharIndex = 0;
    private int lastCharIndex = -1;

    /**
     * Create suffix tree with sequence and default end sequence.
     *
     * @param seq
     *            to create a suffix tree with.
     */
    public SuffixTree(C seq) {
        this(seq, DEFAULT_END_SEQ_CHAR);
    }

    /**
     * Create suffix tree with sequence and end sequence parameter. The end
     * sequence character is appended to the sequence unless the sequence
     * already contains it somewhere.
     *
     * @param seq
     *            to create a suffix tree with.
     * @param endSeq
     *            which defines the end of a sequence.
     */
    public SuffixTree(C seq, char endSeq) {
        endSeqChar = endSeq;
        StringBuilder builder = new StringBuilder(seq);
        if (builder.indexOf(String.valueOf(endSeqChar)) < 0) {
            builder.append(endSeqChar);
        }
        string = builder.toString();
        characters = string.toCharArray();
        for (int i = 0; i < characters.length; i++) {
            addPrefix(i);
        }
    }

    /**
     * Does the sub-sequence exist in the suffix tree.
     * <p>
     * The sub-sequence exists when every one of its characters can be matched
     * while walking down from the root. An empty sub-sequence is therefore
     * always reported as present.
     *
     * @param sub
     *            sub-sequence to locate in the tree.
     * @return True if the sub-sequence exist in the tree.
     */
    @Override
    public boolean doesSubStringExist(C sub) {
        return matchLength(sub) == sub.length();
    }

    /**
     * Get all the suffixes in the tree.
     *
     * @return set of suffixes in the tree.
     */
    @Override
    public Set<String> getSuffixes() {
        return getSuffixes(0);
    }

    /**
     * Get all suffixes at starting node.
     *
     * @param start
     *            node.
     * @return set of suffixes in the tree at start node, with the end
     *         sequence character (and anything after it) stripped.
     */
    private Set<String> getSuffixes(int start) {
        Set<String> suffixes = new TreeSet<String>();
        for (Edge<C> edge : childrenOf(start)) {
            String label = string.substring(edge.firstCharIndex, edge.lastCharIndex + 1);
            if (linksMap.get(edge.endNode) == null) {
                // No suffix link: the end node was never created by a split,
                // so this edge ends the path.
                suffixes.add(stripEndSeq(label));
            } else {
                for (String tail : getSuffixes(edge.endNode)) {
                    suffixes.add(label + stripEndSeq(tail));
                }
            }
        }
        return suffixes;
    }

    /**
     * Cut the given text off at the first end sequence character, if any.
     *
     * @param text
     *            to strip.
     * @return text up to, but excluding, the end sequence character.
     */
    private String stripEndSeq(String text) {
        int index = text.indexOf(endSeqChar);
        return (index >= 0) ? text.substring(0, index) : text;
    }

    /**
     * Get all edges in the table
     *
     * @return debug string.
     */
    public String getEdgesTable() {
        StringBuilder builder = new StringBuilder();
        if (edgeMap.isEmpty()) {
            return builder.toString();
        }

        int lastValidIndex = characters.length - 1;
        builder.append("Edge\tStart\tEnd\tSuf\tfirst\tlast\tString\n");
        for (Edge<C> edge : edgeMap.values()) {
            Link link = linksMap.get(edge.endNode);
            int suffix = (link != null) ? link.suffixNode : -1;
            builder.append("\t").append(edge.startNode);
            builder.append("\t").append(edge.endNode);
            builder.append("\t").append(suffix);
            builder.append("\t").append(edge.firstCharIndex);
            builder.append("\t").append(edge.lastCharIndex);
            builder.append("\t");
            // Clamp to the last valid index so substring() can never overrun.
            int end = Math.min(edge.lastCharIndex, lastValidIndex);
            builder.append(string.substring(edge.firstCharIndex, end + 1));
            builder.append("\n");
        }
        builder.append("Link\tStart\tEnd\n");
        for (Link link : linksMap.values()) {
            builder.append("\t").append(link.node).append("\t").append(link.suffixNode).append("\n");
        }
        return builder.toString();
    }

    /**
     * Add prefix at index, i.e. extend the tree so that it also covers the
     * character at the given index.
     *
     * @param index
     *            to add prefix at.
     */
    private void addPrefix(int index) {
        int parentNode = 0;
        int lastParentNode = -1;
        while (true) {
            parentNode = currentNode;
            if (isExplicit()) {
                if (Edge.find(this, currentNode, characters[index]) != null) {
                    // Edge already exists
                    break;
                }
            } else {
                // Implicit node, a little more complicated
                Edge<C> edge = Edge.find(this, currentNode, characters[firstCharIndex]);
                int span = lastCharIndex - firstCharIndex;
                if (characters[edge.firstCharIndex + span + 1] == characters[index]) {
                    // The next character on the edge already matches, don't split
                    break;
                }
                parentNode = edge.split(currentNode, firstCharIndex, lastCharIndex);
            }

            // New leaf edge running from the (possibly new) parent to the end of the text.
            Edge.create(this, index, characters.length - 1, parentNode);
            linkSuffix(lastParentNode, parentNode);
            lastParentNode = parentNode;

            if (currentNode == 0) {
                firstCharIndex++;
            } else {
                // Current node is not root, follow link
                currentNode = linksMap.get(currentNode).suffixNode;
            }
            if (!isExplicit()) {
                canonize();
            }
        }
        linkSuffix(lastParentNode, parentNode);
        lastCharIndex++; // Now the endpoint is the next active point
        if (!isExplicit()) {
            canonize();
        }
    }

    /**
     * Point the suffix link of one node at another node. Nothing happens when
     * the source node is the root or unset, because only non-root nodes carry
     * a link.
     *
     * @param from
     *            node whose suffix link is updated.
     * @param to
     *            node the suffix link should point at.
     */
    private void linkSuffix(int from, int to) {
        if (from > 0) {
            linksMap.get(from).suffixNode = to;
        }
    }

    /**
     * Is the tree explicit
     *
     * @return True if explicit, i.e. the active point has no pending
     *         characters and so sits exactly on a node.
     */
    private boolean isExplicit() {
        return firstCharIndex > lastCharIndex;
    }

    /**
     * Canonize the tree: move the active point down over every edge that is
     * completely covered by the pending characters.
     */
    private void canonize() {
        Edge<C> edge = Edge.find(this, currentNode, characters[firstCharIndex]);
        int edgeSpan = edge.lastCharIndex - edge.firstCharIndex;
        while (edgeSpan <= (lastCharIndex - firstCharIndex)) {
            firstCharIndex += edgeSpan + 1;
            currentNode = edge.endNode;
            if (firstCharIndex <= lastCharIndex) {
                edge = Edge.find(this, currentNode, characters[firstCharIndex]);
                edgeSpan = edge.lastCharIndex - edge.firstCharIndex;
            }
        }
    }

    /**
     * Walk down from the root along the given query and count how many of its
     * leading characters can be matched.
     * <p>
     * The count is taken from the query position rather than from indices
     * into the indexed string, because consecutive edges on one path do not
     * necessarily cover adjacent index ranges.
     *
     * @param query
     *            characters to match.
     * @return number of leading query characters found in the tree.
     */
    private int matchLength(CharSequence query) {
        int length = query.length();
        int matched = 0;
        int node = 0;
        while (matched < length) {
            Edge<C> edge = Edge.find(this, node, query.charAt(matched));
            if (edge == null) {
                return matched;
            }
            for (int i = edge.firstCharIndex; i <= edge.lastCharIndex && matched < length; i++) {
                if (characters[i] != query.charAt(matched)) {
                    return matched;
                }
                matched++;
            }
            node = edge.endNode;
        }
        return matched;
    }

    /**
     * Edges leaving the given node, ordered by the first character of their
     * label.
     *
     * @param node
     *            start node of the wanted edges.
     * @return view of the edges starting at the node.
     */
    private Iterable<Edge<C>> childrenOf(int node) {
        long from = Edge.key(node, Character.MIN_VALUE);
        long to = Edge.key(node, Character.MAX_VALUE);
        return edgeMap.subMap(from, true, to, true).values();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("String = ").append(string).append("\n");
        builder.append("End of word character = ").append(endSeqChar).append("\n");
        builder.append(TreePrinter.getString(this));
        return builder.toString();
    }

    /**
     * Suffix link of a node: {@code suffixNode} is the node the link points
     * at, or {@code -1} while it has not been set.
     */
    private static class Link implements Comparable<Link> {

        private final int node;
        private int suffixNode = -1;

        public Link(int node) {
            this.node = node;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("node=").append(node).append("\n");
            builder.append("suffixNode=").append(suffixNode).append("\n");
            return builder.toString();
        }

        /**
         * Orders links by node and then by suffix node. A {@code null}
         * argument sorts after this link.
         */
        @Override
        public int compareTo(Link link) {
            if (link == null)
                return -1;
            if (node != link.node)
                return (node < link.node) ? -1 : 1;
            if (suffixNode != link.suffixNode)
                return (suffixNode < link.suffixNode) ? -1 : 1;
            return 0;
        }
    }

    /**
     * Edge between two nodes, labelled with the inclusive character range
     * {@code [firstCharIndex, lastCharIndex]} of the owning tree's text.
     */
    private static class Edge<C extends CharSequence> implements Comparable<Edge<C>> {

        private final SuffixTree<C> tree;
        private final int endNode;
        private final int lastCharIndex;
        private int startNode;
        private int firstCharIndex;

        private Edge(SuffixTree<C> tree, int first, int last, int parent, int child) {
            this.tree = tree;
            this.firstCharIndex = first;
            this.lastCharIndex = last;
            this.startNode = parent;
            this.endNode = child;
        }

        /**
         * Create an edge that ends in a brand new node of the given tree and
         * register it in the tree's edge map.
         *
         * @param tree
         *            owning tree.
         * @param first
         *            index of the first label character.
         * @param last
         *            index of the last label character (inclusive).
         * @param parent
         *            node the edge starts at.
         * @return the registered edge.
         */
        private static <T extends CharSequence> Edge<T> create(SuffixTree<T> tree, int first, int last, int parent) {
            // The node id comes from the tree itself, so numbering is not
            // shared between different trees.
            Edge<T> edge = new Edge<T>(tree, first, last, parent, tree.nextNode++);
            edge.insert();
            return edge;
        }

        /**
         * Map key of the edge that leaves {@code node} with {@code c} as its
         * first label character. The node occupies the high bits and the
         * character the low 16 bits, so two different (node, character) pairs
         * never share a key and keys sort by node first.
         */
        private static long key(int node, char c) {
            return ((long) node << 16) | c;
        }

        private char firstChar() {
            return tree.characters[firstCharIndex];
        }

        private long getKey() {
            return key(startNode, firstChar());
        }

        private void insert() {
            tree.edgeMap.put(getKey(), this);
        }

        private void remove() {
            tree.edgeMap.remove(getKey());
        }

        /**
         * Look up the edge that leaves the given node with the given first
         * character.
         *
         * @return the edge, or null if the node has no such edge.
         */
        private static <T extends CharSequence> Edge<T> find(SuffixTree<T> tree, int node, char c) {
            return tree.edgeMap.get(key(node, c));
        }

        /**
         * Split this edge after {@code lastIndex - firstIndex + 1}
         * characters. A new edge from {@code originNode} takes over the first
         * part of the label and ends in a new node; this edge is shortened
         * and re-attached below that new node. The new node receives a suffix
         * link that initially points at {@code originNode}.
         *
         * @param originNode
         *            node the new leading edge starts at.
         * @param firstIndex
         *            first index of the active range.
         * @param lastIndex
         *            last index of the active range (inclusive).
         * @return id of the node created in the middle of the old edge.
         */
        private int split(int originNode, int firstIndex, int lastIndex) {
            int span = lastIndex - firstIndex;
            // Must be unregistered before the key-defining fields change.
            remove();
            Edge<C> head = create(tree, firstCharIndex, firstCharIndex + span, originNode);
            int middleNode = head.endNode;

            Link link = tree.linksMap.get(middleNode);
            if (link == null) {
                link = new Link(middleNode);
                tree.linksMap.put(middleNode, link);
            }
            link.suffixNode = originNode;

            firstCharIndex += span + 1;
            startNode = middleNode;
            insert();
            return middleNode;
        }

        /**
         * Hash of the start node and the first label character, the same
         * properties {@link #equals(Object)} compares.
         */
        @Override
        public int hashCode() {
            return 31 * startNode + firstChar();
        }

        /**
         * Two edges are equal when they leave the same node with the same
         * first label character.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (!(obj instanceof Edge))
                return false;
            Edge<?> other = (Edge<?>) obj;
            // Each edge resolves its first character against its own tree.
            return startNode == other.startNode && firstChar() == other.firstChar();
        }

        /**
         * Orders edges by start node, end node, first and last character
         * index. A {@code null} argument sorts after this edge.
         */
        @Override
        public int compareTo(Edge<C> edge) {
            if (edge == null)
                return -1;
            if (startNode != edge.startNode)
                return (startNode < edge.startNode) ? -1 : 1;
            if (endNode != edge.endNode)
                return (endNode < edge.endNode) ? -1 : 1;
            if (firstCharIndex != edge.firstCharIndex)
                return (firstCharIndex < edge.firstCharIndex) ? -1 : 1;
            if (lastCharIndex != edge.lastCharIndex)
                return (lastCharIndex < edge.lastCharIndex) ? -1 : 1;
            return 0;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("startNode=").append(startNode).append("\n");
            builder.append("endNode=").append(endNode).append("\n");
            builder.append("firstCharIndex=").append(firstCharIndex).append("\n");
            builder.append("lastCharIndex=").append(lastCharIndex).append("\n");
            String s = tree.string.substring(firstCharIndex, lastCharIndex + 1);
            builder.append("string=").append(s).append("\n");
            return builder.toString();
        }
    }

    /**
     * Renders a suffix tree as indented text, one line per node.
     */
    protected static class TreePrinter {

        /**
         * Print the tree to standard output.
         *
         * @param tree
         *            to print.
         */
        public static <C extends CharSequence> void printNode(SuffixTree<C> tree) {
            System.out.println(getString(tree, null, "", true));
        }

        /**
         * Render the tree, starting at the root.
         *
         * @param tree
         *            to render.
         * @return textual representation of the tree.
         */
        public static <C extends CharSequence> String getString(SuffixTree<C> tree) {
            return getString(tree, null, "", true);
        }

        /**
         * Render the node an edge leads to, followed by everything below it.
         *
         * @param tree
         *            the edge belongs to.
         * @param e
         *            edge leading to the node, or null for the root.
         * @param prefix
         *            text put in front of the node's line.
         * @param isTail
         *            True if the node is the last child of its parent.
         * @return textual representation of the sub-tree.
         */
        private static <C extends CharSequence> String getString(SuffixTree<C> tree, Edge<C> e, String prefix,
                boolean isTail) {
            StringBuilder builder = new StringBuilder();
            builder.append(prefix).append(isTail ? "└── " : "├── ");

            int value = 0;
            if (e != null) {
                value = e.endNode;
                String label = tree.string.substring(e.firstCharIndex, e.lastCharIndex + 1);
                int index = label.indexOf(tree.endSeqChar);
                if (index >= 0) {
                    // Keep the end sequence character but nothing after it.
                    label = label.substring(0, index + 1);
                }
                builder.append("(").append(value).append(") ").append(label).append("\n");
            } else {
                builder.append("(").append(0).append(")").append("\n");
            }

            List<Edge<C>> children = new LinkedList<Edge<C>>();
            for (Edge<C> child : tree.childrenOf(value)) {
                children.add(child);
            }

            String childPrefix = prefix + (isTail ? " " : "│ ");
            int remaining = children.size();
            for (Edge<C> child : children) {
                remaining--;
                // Only the last child is drawn as a tail.
                builder.append(getString(tree, child, childPrefix, remaining == 0));
            }
            return builder.toString();
        }
    }
}