/*
* Created on Jan 22, 2008
*/
package org.seqcode.gseutils.strings;
import java.io.PrintStream;
import java.util.*;
import java.util.logging.*;
/**
* @author Timothy Danford
*
* An implementation of suffix trees, and of Ukkonen's Algorithm for
* building a suffix tree in linear time, adapted from Chapter 6 of
* Gusfield, "Algorithms on Strings, Trees, and Sequences."
*/
public class UkkonenSuffixTree {

    /**
     * Command-line driver: indexes the first argument, prints the resulting tree,
     * and reports the suffixes matched (fully, then partially) by the second argument.
     *
     * @param args {@code args[0]} is the string to index, {@code args[1]} is the query
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     */
    public static void main(String[] args) {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Usage: UkkonenSuffixTree <target> <query>");
        }
        String target = args[0];
        String query = args[1];
        UkkonenSuffixTree suffixTree = new UkkonenSuffixTree();
        suffixTree.addString(target);
        System.out.println("Test1 Tree:");
        suffixTree.print(System.out);
        System.out.println(); System.out.flush();
        System.err.println();
        System.err.flush();
        assert suffixTree.check();
        Set<StringSuffix> matches = suffixTree.matchString(query);
        System.out.println("Full matches are " + matches);
        matches = suffixTree.matchStringPartial(query);
        System.out.println("Partial matches are " + matches);
        assert suffixTree.check();
    }

    // The strings that have been added to this tree, in insertion order.
    private Vector<TreeString> strings;
    // Allocated by the constructor; not otherwise referenced in this class.
    private Vector<TreeEdge> totalStringEdges;
    private TreeNode root;
    // The character reported by TreeString.getChar() for the position just past the real characters.
    private char terminal;
    private UkkonenState extState; // this is a field, because TreeEdge depends on it.
    private Logger logger;
    // Read by LoggingFilter: records pass only if isLogging is true and their level is
    // at least minLogLevel (in LoggingFilter's own ordering).
    private boolean isLogging;
    private Level minLogLevel;

    /**
     * Creates an empty suffix tree whose terminal character is '$', and configures
     * the class logger to write records of level SEVERE to System.err.
     */
    public UkkonenSuffixTree() {
        logger = Logger.getLogger("org.seqcode.gse.projects.chipseq.assembler.UkkonenSuffixTree");
        isLogging = true;
        minLogLevel = Level.SEVERE;
        logger.setFilter(new LoggingFilter());
        // Logger.getLogger() hands back the same named logger to every instance, so
        // only attach a LoggingHandler if an earlier instance has not already done so;
        // otherwise every record would be printed once per tree ever constructed.
        boolean handlerInstalled = false;
        for (Handler existing : logger.getHandlers()) {
            if (existing instanceof LoggingHandler) {
                handlerInstalled = true;
                break;
            }
        }
        if (!handlerInstalled) {
            logger.addHandler(new LoggingHandler(System.err, false));
        }
        logger.setUseParentHandlers(false);
        logger.setLevel(Level.SEVERE);
        logger.log(Level.FINE, "Logger setup complete.");
        strings = new Vector<TreeString>();
        terminal = '$';
        root = new TreeNode(null);
        totalStringEdges = new Vector<TreeEdge>();
        extState = null;
    }

    /** Returns the i-th string that was added to this tree. */
    public TreeString getString(int i) { return strings.get(i); }

    /** Returns the number of strings that have been added to this tree. */
    public int size() { return strings.size(); }

    /** Returns true if {@code c} is this tree's terminal character. */
    public boolean isTerminal(char c) { return c==terminal; }

    /** Prints the tree, starting at the root, to the given stream. */
    public void print(PrintStream ps) {
        root.print(0, ps);
    }

    /**
     * Adds a string to the tree by running the Ukkonen extension over it.
     *
     * @param str the string to index; the terminal character is implicit and is
     *            not expected to be appended by the caller
     */
    public void addString(String str) {
        char[] array = str.toCharArray();
        strings.add(new TreeString(strings.size(), array));
        logger.log(Level.INFO, String.format("Adding string \"%s\"", str));
        ukkonenExtendSuffixTree(strings.size()-1);
        //naiveExtendSuffixTree(strings.size()-1);
    }

    /**
     * Finds the suffixes below the point reached by matching all of {@code str}
     * from the root.
     *
     * @return the suffixes collected under the last matched edge if the whole query
     *         was matched; an empty set otherwise (including for an empty query)
     */
    public Set<StringSuffix> matchString(String str) {
        char[] array = str.toCharArray();
        EdgeMatch m = findEdge(root, array, 0, array.length, false);
        if(m.completedMatch()) {
            return collectSuffixes(m.lastEdge.tailNode);
        } else {
            return new TreeSet<StringSuffix>();
        }
    }

    /**
     * Like {@link #matchString(String)}, but accepts a match of any prefix of the
     * query: the suffixes under the last edge that was entered are returned even if
     * the query was not consumed completely.
     *
     * @return the collected suffixes, or an empty set if no edge out of the root
     *         starts with the query's first character
     */
    public Set<StringSuffix> matchStringPartial(String str) {
        char[] array = str.toCharArray();
        EdgeMatch m = findEdge(root, array, 0, array.length, false);
        if (m.lastEdge != null) {
            return collectSuffixes(m.lastEdge.tailNode);
        } else {
            return new TreeSet<StringSuffix>();
        }
    }

    /**
     * Debugging aid: walks the tree verifying suffix links and that no edge still
     * has a symbolic (null) end.
     */
    public boolean check() {
        return root.check();
    }

    /** Internal Methods ************************************************************/

    // Matches array[start,end) downward from currentNode and returns the match state.
    private EdgeMatch findEdge(TreeNode currentNode, char[] array,
            int start, int end, boolean skipcount) {
        EdgeMatch em = new EdgeMatch(array, start, end);
        em.matchFrom(currentNode, skipcount);
        return em;
    }

    // Matches string[start,end) downward from currentNode and returns the match state.
    private EdgeMatch findEdge(TreeNode currentNode, TreeString string,
            int start, int end, boolean skipcount) {
        EdgeMatch em = new EdgeMatch(string, start, end);
        em.matchFrom(currentNode, skipcount);
        return em;
    }

    /*
     * Quadratic-time alternative to the Ukkonen construction: inserts each suffix
     * of the string separately. Currently unused (see the commented-out call in
     * addString).
     */
    private void naiveExtendSuffixTree(int arrayIdx) {
        TreeString string = strings.get(arrayIdx);
        // The length()-1 bound, instead of length(), is because TreeString.length()
        // already counts the terminal character, and we don't want to *just* add
        // the suffix that is that character. (Going all the way to length() would
        // additionally try to insert an empty suffix, i.e. a zero-length edge.)
        for(int i = 0; i < string.length()-1; i++) {
            logger.log(Level.FINEST, String.format("Naive Extension: \"%s\"",
                    string.substring(i, string.length())));
            naiveExtendSuffix(string, i);
        }
    }

    // Inserts the single suffix string[start, length) by matching from the root and
    // hanging a new leaf edge wherever the match stops.
    private void naiveExtendSuffix(TreeString string, int start) {
        EdgeMatch em = findEdge(root, string, start, string.length(), false);
        StringSuffix stringSuffix = new StringSuffix(string, start);
        TreeEdge leafEdge = null;
        if(em.completedMatch()) {
            leafEdge = em.lastEdge;
        } else {
            if(em.lastEdge == null) {
                leafEdge = new TreeEdge(string, start, string.length(), root);
                root.addEdge(leafEdge);
            } else {
                leafEdge = new TreeEdge(string, em.matchedTo, string.length(), em.lastEdge.tailNode);
                if(em.inEdgeMiddle()) {
                    int offset = em.lastMatchLength();
                    em.lastEdge.split(offset);
                }
                // addEdge() re-points leafEdge.headNode at the (possibly new) tail node.
                em.lastEdge.tailNode.addEdge(leafEdge);
            }
        }
        leafEdge.tailNode.suffixes.add(stringSuffix);
    }

    /*
     * Runs every phase of the Ukkonen algorithm for strings[arrayIdx], then
     * replaces the symbolic ends of the leaf edges created along the way with the
     * final value of lastE. The tree is printed to System.err after each phase.
     */
    private void ukkonenExtendSuffixTree(int arrayIdx) {
        logger.entering("UkkonenSuffixTree", "ukkonenExtendSuffixTree");
        logger.log(Level.FINEST, String.format("Ukkonen Algorithm String #%d", arrayIdx));
        TreeString string = strings.get(arrayIdx);
        extState = new UkkonenState(string);
        logger.log(Level.FINEST, String.format(
                "Ukkonen: (%d,%d)", extState.nextPhaseStart, extState.string.length()));
        for(int phase = extState.nextPhaseStart; phase < extState.string.length(); phase++) {
            ukkonenSPA(phase);
            System.err.println(String.format("Phase %d results: ", phase));
            print(System.err); System.err.println(); System.err.flush();
        }
        logger.log(Level.FINEST, String.format("Finishing edges: %d", extState.lastE));
        extState.finishFinalEdges();
        System.err.println(String.format("Finished results: "));
        print(System.err); System.err.println(); System.err.flush();
        logger.exiting("UkkonenSuffixTree", "ukkonenExtendSuffixTree");
    }

    /* ukkonenSPA(i) performs phase i of Ukkonen's algorithm. This
     * means that we're making sure that array[0,i] (note the inclusivity!)
     * is a part of the current suffix tree.
     *
     * Original Description: pg. 106 of Gusfield
     */
    private void ukkonenSPA(int i) {
        logger.entering("UkkonenSuffixTree", "ukkonenSPA");
        logger.log(Level.FINEST, String.format("i=%d", i));
        assert i >= 0;
        /*
         * SPA Step 1:
         * "Increment index e to i+1"
         *
         * The equivalent of Gusfield's i+1 is, in our situation, just i.
         * However, the coordinates are inclusive in Gusfield,
         * and exclusive in our case (along the tree edges). Therefore,
         * lastE should be updated to be i+1, exactly.
         */
        extState.lastE = i+1;
        logger.log(Level.FINEST, String.format("e=%d", extState.lastE));
        /*
         * SPA Step 2:
         * "Explicitly compute successive extensions, using the SEA algorithm,
         * starting at j_i + 1 until reaching the first extension j* where rule3
         * applies or until all extensions are done in this phase."
         *
         * extState.nextExtStart encodes the (j_i)+1 value. We start there, and
         * iterate forward until all extensions have been performed, or until
         * ukkonenSEA returns false (ukkonenSEA returns a true if rule 1 or rule 2
         * applies in its extension).
         *
         * We extend until j==i, because the last extension of each phase is
         * the extension that *just* adds the new character into the tree.
         */
        logger.log(Level.FINEST, String.format("jstart=%d", extState.nextExtStart));
        boolean keepExtending = true;
        int j = extState.nextExtStart;
        while(keepExtending && j <= i) {
            if(ukkonenSEA(i, j)) {
                j++;
                // we don't want to just put in the terminal character.
                if(i == extState.string.length()-1 && j == i) {
                    keepExtending = false;
                }
            } else {
                keepExtending = false;
            }
            System.out.println(String.format("Phase %d, Extension %d tree: ", i, j));
            print(System.out);
            System.out.println(); System.out.flush();
            System.err.println(); System.err.flush();
        }
        /*
         * SPA Step 3:
         * "Set j_{i+1} to j*-1, to prepare for the next phase."
         */
        extState.nextExtStart = j;
        logger.log(Level.FINEST, String.format("j*=%d", extState.nextExtStart));
        logger.exiting("UkkonenSuffixTree", "ukkonenSPA");
    }

    /*
     * ukkonenSEA(i, j) performs extension j of phase i of Ukkonen's algorithm.
     * This means that we're making sure that array[j,i] (note the inclusivity!)
     * is a part of the current suffix tree.
     *
     * Returns true if the extension added a new edge or otherwise did not hit
     * rule 3; returns false when rule 3 applied (the phase must stop).
     *
     * Original Description: pg. 100 of Gusfield
     */
    private boolean ukkonenSEA(int i, int j) {
        logger.entering("UkkonenSuffixTree", "ukkonenSEA");
        logger.log(Level.FINEST, String.format("j=%d", j));
        assert j <= i;
        boolean rule3 = false;
        TreeNode newRule2Node = null;
        EdgeMatch m = extState.matcher;
        char lastChar = extState.string.getChar(i);
        boolean lastCharIsTerminal = isTerminal(lastChar);
        /*
         * SEA Step 1:
         * "Find the first node v at or above the end of S[j-1,i] that either
         * has a suffix link from it or is the root. This requires walking up
         * at most one edge from the end of S[j-1,i] in the current tree. Let
         * \gamma (possibly empty) denote the string between v and the
         * end of S[j-1,i]."
         */
        /*
         * SEA Step 2:
         * "If v is not the root, traverse the suffix link from v to node
         * s(v) and then walk down from s(v) following the path for string
         * gamma. If v is the root, then follow the path for S[j,i] from the
         * root (as in the naive algorithm)."
         */
        int gammaEnd = i;
        int gammaStart = gammaEnd - extState.gammaLength;
        if(extState.nextNode == null || extState.nextNode.isRoot()) {
            String beta = extState.string.substring(j,i);
            logger.log(Level.FINEST, String.format("beta: %d,%d <%s>%c", j, i, beta, lastChar));
            m.reset(j, i);
            m.matchFrom(root, true);
        } else {
            logger.log(Level.FINEST, String.format("gammaLength:%d", extState.gammaLength));
            String gamma = extState.string.substring(gammaStart, gammaEnd);
            logger.log(Level.FINEST, String.format("gamma: %d,%d <%s>%c", gammaStart, gammaEnd, gamma, lastChar));
            m.reset(gammaStart, gammaEnd);
            m.matchFrom(extState.nextNode, true);
        }
        /*
         * SEA Step 3:
         * "Using the extension rules, ensure that the string S[j,i]S(i+1) is
         * in the tree."
         *
         * In our coordinates, this is array[j,i)+array[i]
         * \beta = array[j,i)
         *
         * Rule 1: the path \beta ends at a leaf. (we shouldn't see this case).
         * Rule 2: the path \beta is not continued by array[i]. That is, \beta
         *         ends either at a node (in which case, no child of the node
         *         starts with array[i]), or in an edge (in which case, the edge
         *         doesn't continue with array[i]). Either way, we create a new
         *         edge that is labeled with array[i] (coordinates: [i,i+1) ).
         * Rule 3: \beta+array[i] is already in the tree -- either \beta ends in
         *         an edge that continues with array[i], or at a node that has
         *         a child under array[i]. Either way, return false (break!).
         */
        TreeEdge newEdge = null;
        if(m.lastEdge == null) {
            logger.log(Level.FINEST, String.format("Found root."));
            // the \beta string matched to the root (was empty). So we need
            // to simply check the children of the root.
            boolean foundLastChar = !lastCharIsTerminal ?
                    root.childEdges.containsKey(lastChar) :
                    root.terminalEdges.containsKey(extState.string.getIndex());
            if(foundLastChar) {
                // Rule 3
                rule3 = true;
                logger.log(Level.FINEST, "Rule #3, Root");
                extState.nextNode = null;
                extState.gammaLength = 0;
                logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
            } else {
                // Rule 2
                logger.log(Level.FINEST, "Rule #2, Root");
                newEdge = new TreeEdge(extState.string, i, null, root);
                root.addEdge(newEdge);
                extState.nextNode = null;
                extState.gammaLength = 0;
                logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
            }
        } else if(m.inEdgeMiddle()) {
            int offset = m.lastMatchLength();
            logger.log(Level.FINEST, String.format("Found edge middle: %d", offset));
            boolean foundLastChar = !lastCharIsTerminal ?
                    m.lastEdge.getChar(offset) == lastChar :
                    (m.lastEdge.string.getIndex() == extState.string.getIndex() &&
                     offset == m.lastEdge.length()-1);
            logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar));
            if(foundLastChar) {
                // Rule 3
                rule3 = true;
                logger.log(Level.FINEST, "Rule #3, Edge");
                extState.nextNode = m.lastEdge.headNode;
                //extState.gammaLength = m.lastMatchLength() + 1;
                extState.gammaLength = m.lastMatchLength() + (j==i ? 1 : 0);
                assert extState.gammaLength >= 0;
                logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
            } else {
                // Rule 2
                logger.log(Level.FINEST, "Rule #2, Edge");
                TreeEdge newLowerEdge = m.lastEdge.split(offset);
                extState.edgesWithE.add(newLowerEdge);
                newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode);
                m.lastEdge.tailNode.addEdge(newEdge);
                newRule2Node = m.lastEdge.tailNode;
                extState.nextNode = m.lastEdge.headNode;
                //extState.gammaLength = m.lastEdge.length() + 1;
                extState.gammaLength = m.lastEdge.length() + (j==i ? 1 : 0);
                assert extState.gammaLength >= 0;
                logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
            }
            if(extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) {
                logger.log(Level.FINEST, String.format("Walking up edge: %d",
                        extState.nextNode.parentEdge.length()));
                extState.gammaLength += extState.nextNode.parentEdge.length();
                extState.nextNode = extState.nextNode.parentEdge.headNode;
            }
        } else {
            logger.log(Level.FINEST, String.format("Found node."));
            boolean foundLastChar = !lastCharIsTerminal ?
                    m.lastEdge.tailNode.childEdges.containsKey(lastChar) :
                    m.lastEdge.tailNode.terminalEdges.containsKey(extState.string.getIndex());
            logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar));
            if(foundLastChar) {
                // Rule 3
                rule3 = true;
                logger.log(Level.FINEST, "Rule #3, Node");
                extState.nextNode = m.lastEdge.headNode;
                //extState.gammaLength = m.lastEdge.length() + 1;
                extState.gammaLength = m.lastEdge.length() + (j==i ? 1 : 0);
                assert extState.gammaLength >= 0;
                logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
            } else {
                // Rule 2
                logger.log(Level.FINEST, "Rule #2, Node");
                newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode);
                m.lastEdge.tailNode.addEdge(newEdge);
                extState.nextNode = m.lastEdge.headNode;
                //extState.gammaLength = m.lastEdge.length() + 1;
                extState.gammaLength = m.lastEdge.length() + (j==i ? 1 : 0);
                logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
                assert extState.gammaLength >= 0;
            }
            if(extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) {
                logger.log(Level.FINEST, String.format("Walking up edge: %d",
                        extState.nextNode.parentEdge.length()));
                extState.gammaLength += extState.nextNode.parentEdge.length();
                extState.nextNode = extState.nextNode.parentEdge.headNode;
            }
        }
        if(extState.nextNode != null) {
            logger.log(Level.FINEST, "Following suffix link.");
            extState.nextNode = extState.nextNode.suffixLink;
        } else {
            logger.log(Level.FINEST, "Suffix link not found.");
        }
        if(newEdge != null) {
            newEdge.tailNode.suffixes.add(extState.currentSuffix);
            extState.nextSuffix();
            extState.edgesWithE.add(newEdge);
            logger.log(Level.FINEST, String.format("Added suffix: %d", j));
        }
        /*
         * SEA Step 4:
         * "If a new internal node w was created in extension j-1 (by extension rule 2)
         * then by Lemma 6.1.1 string alpha must end at node s(w), the end node for the
         * suffix link from w. Create the suffix link (w, s(w)) from w to s(w)."
         *
         * This wording is confusing -- is there a typo in Gusfield? I'm not sure where
         * the 'w' comes from.
         */
        if(extState.rule2Node != null) {
            if(m.lastEdge != null) {
                extState.rule2Node.suffixLink = m.lastEdge.tailNode;
                logger.log(Level.FINEST, "Adding suffix link --> internal node.");
            } else {
                extState.rule2Node.suffixLink = root;
                logger.log(Level.FINEST, "Adding suffix link --> root.");
            }
        }
        /*
         * Update any state that will be needed in the next extension.
         */
        extState.rule2Node = newRule2Node;
        logger.exiting("UkkonenSuffixTree", "ukkonenSEA");
        // "Rule 3 is a show stopper" means that, if we encounter rule 3,
        // we *don't* continue.
        return !rule3;
    }

    // Gathers, into a fresh sorted set, every suffix stored at or below tn.
    private Set<StringSuffix> collectSuffixes(TreeNode tn) {
        TreeSet<StringSuffix> set = new TreeSet<StringSuffix>();
        tn.collectSuffixes(set);
        return set;
    }

    /** Helper Classes **************************************************************/

    /**
     * One of the strings indexed by the tree. The terminal character is not stored
     * in the backing array; it is implied at position {@code array.length}, which
     * is why {@link #length()} is one greater than the array length.
     */
    public class TreeString {
        private int index;
        private char[] array;

        private TreeString(int idx, char[] a) {
            index = idx;
            array = a;
        }

        /** Returns this string's position in the order strings were added to the tree. */
        public int getIndex() { return index; }

        /** Returns the number of characters, counting the implicit terminal. */
        public int length() { return array.length+1; }

        /** Returns the character at {@code i}, or the terminal for any index past the array. */
        public char getChar(int i) { return i < array.length ? array[i] : terminal; }

        /**
         * Compares one position of this string against one position of another.
         * A terminal position only matches the terminal position of the same
         * string (same index); terminals of different strings are distinct.
         */
        public boolean matches(int offset, TreeString str, int strOffset) {
            assert str != null;
            assert offset >= 0;
            assert offset <= array.length;
            assert strOffset >= 0;
            assert strOffset <= str.array.length;
            if(offset==array.length || strOffset == str.array.length) {
                return index==str.index && offset==array.length && strOffset==str.array.length;
            } else {
                return array[offset] == str.array[strOffset];
            }
        }

        /**
         * Compares one position of this string against one position of a raw
         * character array. The terminal position never matches, nor does any
         * position at or past the end of {@code str}.
         */
        public boolean matches(int offset, char[] str, int strOffset) {
            if(offset == array.length || strOffset >= str.length) {
                return false;
            } else {
                return array[offset]==str[strOffset];
            }
        }

        /** Returns the characters in [start, end), using the terminal for positions past the array. */
        public String substring(int start, int end) {
            StringBuilder sb = new StringBuilder();
            for(int i = start; i < end; i++) {
                sb.append(getChar(i));
            }
            return sb.toString();
        }

        @Override
        public int hashCode() {
            int code = 17;
            code += index; code *= 37;
            return code;
        }

        /** Two TreeStrings are equal when they have the same index. */
        @Override
        public boolean equals(Object o) {
            if(!(o instanceof TreeString)) { return false; }
            TreeString ts = (TreeString)o;
            return ts.index == index;
        }

        @Override
        public String toString() {
            return String.format("#%d:%s", index, new String(array));
        }
    }

    /**
     * Identifies a suffix of an indexed string by (string, start offset).
     * Ordered by string index, then by offset.
     */
    public class StringSuffix implements Comparable<StringSuffix> {
        private int offset;
        private TreeString string;

        private StringSuffix(TreeString ts, int off) {
            string = ts;
            offset = off;
        }

        /** Returns the index of the string this suffix belongs to. */
        public int getStringIndex() { return string.getIndex(); }

        /** Returns the position within the string at which this suffix starts. */
        public int getOffset() { return offset; }

        /**
         * Returns the text of the suffix, ending with exactly one terminal character.
         */
        public String getSuffixString() {
            // string.length() already counts the terminal, so the half-open range
            // [offset, length()) yields the suffix followed by a single terminal.
            return string.substring(offset, string.length());
        }

        @Override
        public String toString() {
            return String.format("(#%d,+%d)", getStringIndex(), offset);
        }

        @Override
        public int hashCode() {
            int code = 17;
            code += string.hashCode(); code *= 37;
            code += offset; code *= 37;
            return code;
        }

        @Override
        public int compareTo(StringSuffix ss) {
            int stringID = getStringIndex();
            if(stringID < ss.getStringIndex()) { return -1; }
            if(stringID > ss.getStringIndex()) { return 1; }
            if(offset < ss.offset) { return -1; }
            if(offset > ss.offset) { return 1; }
            return 0;
        }

        @Override
        public boolean equals(Object o) {
            if(!(o instanceof StringSuffix)) { return false; }
            StringSuffix ss = (StringSuffix)o;
            if(!ss.string.equals(string)) { return false; }
            return offset==ss.offset;
        }
    }

    /** Internal Classes ************************************************************/

    /*
     * A node of the tree. Outgoing edges are kept in two maps: childEdges, keyed
     * by the first character of the edge label, and terminalEdges, keyed by
     * string index, for edges that begin at a string's terminal position.
     */
    private class TreeNode {
        // The edge whose tailNode is this node; null for the root.
        public TreeEdge parentEdge;
        public TreeNode suffixLink;
        public Set<StringSuffix> suffixes;
        private Map<Character,TreeEdge> childEdges;
        private Map<Integer,TreeEdge> terminalEdges;

        public TreeNode(TreeEdge p) {
            parentEdge = p;
            suffixLink = null;
            suffixes = new TreeSet<StringSuffix>();
            childEdges = new TreeMap<Character,TreeEdge>();
            terminalEdges = new TreeMap<Integer,TreeEdge>();
        }

        // Walks the tree, checking to make sure that we haven't violated any
        // logical constraints. This is a method for debugging.
        public boolean check() {
            String path = pathLabel();
            if(suffixLink != null) {
                // A suffix link must lead to the node spelling this node's path
                // label minus its first character.
                String suffixPath = suffixLink.pathLabel();
                if(!path.substring(1, path.length()).equals(suffixPath)) {
                    logger.log(Level.SEVERE, String.format("Suffix Link for node (%s) didn't match: %s", path, suffixPath));
                    return false;
                }
            }
            for(char c : childEdges.keySet()) {
                TreeEdge e = childEdges.get(c);
                if(!e.check()) {
                    return false;
                }
            }
            for(int k : terminalEdges.keySet()) {
                TreeEdge e = terminalEdges.get(k);
                if(!e.check()) {
                    return false;
                }
            }
            return true;
        }

        // Returns the concatenation of the edge labels on the path from the root to this node.
        public String pathLabel() {
            if(parentEdge != null) {
                StringBuilder sb = new StringBuilder(parentEdge.headNode.pathLabel());
                for(int i = 0; i < parentEdge.length(); i++) {
                    sb.append(parentEdge.getChar(i));
                }
                return sb.toString();
            } else {
                return "";
            }
        }

        // Prints the subtree rooted here: a leaf prints its suffix list, any other
        // node prints each outgoing edge (character edges first, then terminal edges).
        public void print(int indent, PrintStream ps) {
            if(isLeaf()) {
                printSuffixes(ps);
                ps.println();
            } else {
                int i = 0;
                for(char c : childEdges.keySet()) {
                    TreeEdge edge = childEdges.get(c);
                    edge.print(indent, i!=0, ps);
                    i++;
                }
                for(int k : terminalEdges.keySet()) {
                    TreeEdge edge = terminalEdges.get(k);
                    edge.print(indent, i!=0, ps);
                    i++;
                }
            }
        }

        public void printSuffixes(PrintStream ps) {
            ps.print(" [");
            int i = 0;
            for(StringSuffix ss : suffixes) {
                ps.print((i == 0 ? "" : ",") + ss.toString());
                i++;
            }
            ps.print("]");
        }

        public boolean isRoot() { return parentEdge == null; }

        // A node is a leaf only if it has no outgoing edges of either kind. A node
        // whose only children hang off terminal edges is still internal, and must
        // not be printed as a leaf (that would hide its subtree).
        public boolean isLeaf() { return childEdges.isEmpty() && terminalEdges.isEmpty(); }

        // Adds to 'suffices' every suffix stored at this node or anywhere below it.
        public void collectSuffixes(Set<StringSuffix> suffices) {
            suffices.addAll(suffixes);
            for(char key : childEdges.keySet()) {
                childEdges.get(key).tailNode.collectSuffixes(suffices);
            }
            for(int key : terminalEdges.keySet()) {
                terminalEdges.get(key).tailNode.collectSuffixes(suffices);
            }
        }

        // Attaches e as an outgoing edge of this node (and sets e.headNode to this).
        // Throws IllegalArgumentException if an edge with the same key already exists.
        public void addEdge(TreeEdge e) {
            if(e.start() == e.string.length()-1) {
                // The edge begins at the string's terminal position: key by string index.
                int key = e.string.getIndex();
                if(terminalEdges.containsKey(key)) { throw new IllegalArgumentException(); }
                e.headNode = this;
                terminalEdges.put(key, e);
            } else {
                char key = e.getChar(0);
                if(childEdges.containsKey(key)) { throw new IllegalArgumentException(); }
                e.headNode = this;
                childEdges.put(key, e);
            }
        }
    }

    /*
     * This is the state that we need to carry through, from extension to
     * extension and from phase to phase, of the Ukkonen extension algo-
     * rithm. Particularly important is the "lastE" field, which is
     * globally referenced by a certain subset of the TreeEdges (the 'leaf'
     * edges) during the execution of the Ukkonen Algorithm. As a result,
     * there is a global field ('extState') that is maintained during any
     * run of the algorithm.
     */
    private class UkkonenState {
        public TreeString string;
        // this is set for the very first phase, when we're
        // adding a string that shares a prefix with a string
        // already in the tree.
        public int nextPhaseStart;
        // these are updated once a phase.
        public int lastE;
        public int nextExtStart;
        // these are updated each extension (potentially).
        public EdgeMatch matcher;
        public TreeNode rule2Node;
        public TreeNode nextNode;
        public int gammaLength;
        public LinkedList<TreeEdge> edgesWithE;
        public StringSuffix currentSuffix;

        public UkkonenState(TreeString str) {
            string = str;
            lastE = 0;
            edgesWithE = new LinkedList<TreeEdge>();
            nextPhaseStart = 0;
            nextExtStart = 0;
            matcher = null;
            nextNode = root;
            gammaLength = 0;
            rule2Node = null;
            if(string.getIndex() > 0) {
                // Not the first string: match it against the existing tree so that
                // the phases can begin where the match stopped.
                matcher = findEdge(root, string, 0, string.length(), false);
                nextPhaseStart = matcher.matchedTo;
                nextExtStart = 0;
                lastE = matcher.matchedTo;
                logger.log(Level.FINEST, String.format(
                        "String %s can start at phase %d (E:%d)",
                        string.toString(), nextPhaseStart, lastE));
            } else {
                matcher = new EdgeMatch(string, 0, string.length());
            }
            currentSuffix = new StringSuffix(string, 0);
        }

        // Advances currentSuffix to the suffix starting one position later.
        public void nextSuffix() {
            currentSuffix = new StringSuffix(string, currentSuffix.getOffset() + 1);
        }

        // Replaces every still-symbolic edge end with the current lastE, then
        // forgets the recorded edges.
        public void finishFinalEdges() {
            for(TreeEdge edge : edgesWithE) {
                if(edge.isEndSymbolic()) {
                    edge.setEnd(lastE);
                }
            }
            edgesWithE.clear();
        }
    }

    /*
     * We can't just match into the tree in a naive way -- we need to
     * store some state about how *far* we matched into the tree, where
     * we ended up, etc. As a result, this EdgeMatch class implements
     * tree descent -- it walks as far down into the tree as possible
     * and then (upon return from the matchFrom() method) contains all the
     * state we need about where that match ended.
     */
    private class EdgeMatch {
        // only one of these is non-null.
        public char[] array;
        public TreeString string;
        // The query range being matched: [start, end).
        public int start, end;
        // The last edge entered during the descent; null if no edge was entered.
        public TreeEdge lastEdge;
        // matchingFrom: query position at which lastEdge was entered.
        // matchedTo: query position just past the last matched character.
        public int matchingFrom, matchedTo;

        public EdgeMatch(char[] a, int st, int ed) {
            array = a;
            string = null;
            assert string != null || array != null;
            start = st;
            end = ed;
            lastEdge = null;
            matchingFrom = matchedTo = -1;
        }

        public EdgeMatch(TreeString s, int st, int ed) {
            array = null;
            string = s;
            assert string != null || array != null;
            start = st;
            end = ed;
            lastEdge = null;
            matchingFrom = matchedTo = -1;
        }

        public char getChar(int i) {
            return array != null ? array[i] : string.getChar(i);
        }

        // Re-targets this matcher at a new query range and clears the previous result.
        public void reset(int st, int ed) {
            start = st;
            end = ed;
            lastEdge = null;
            matchingFrom = matchedTo = -1;
        }

        public String matchingString() {
            if(array != null) {
                return new String(array, start, end-start);
            } else {
                return string.substring(start,end);
            }
        }

        public String matchedString() {
            if(array != null) {
                return new String(array, start, matchedTo-start);
            } else {
                return string.substring(start,matchedTo);
            }
        }

        public String currentMatchString() {
            if(array != null) {
                return new String(array, matchingFrom, end-matchingFrom);
            } else {
                return string.substring(matchingFrom,end);
            }
        }

        @Override
        public String toString() {
            return String.format("EdgeMatch (%d,%d-->%d,%d) '%s' in '%s'",
                    start, matchingFrom, matchedTo, end, matchedString(), matchingString());
        }

        /*
         * Descends from startNode along the query range [start, end), following
         * only childEdges (terminalEdges are never entered here).
         *
         * With skipcount false, each edge label is compared character by character.
         * With skipcount true, edge labels are not compared: the match advances by
         * min(edge length, remaining query length) per edge, and the absence of a
         * child edge for the next character raises IllegalArgumentException.
         *
         * Also writes a debug line to System.err for every non-empty query.
         */
        public void matchFrom(TreeNode startNode, boolean skipcount) {
            assert string != null || array != null;
            assert lastEdge == null;
            matchingFrom = start;
            matchedTo = start;
            // a base case.
            if(end-start <= 0) { return; }
            char nextChar = getChar(matchingFrom);
            assert startNode != null;
            System.err.println("startNode is " + startNode + " and nextChar is " + nextChar);
            if(startNode.childEdges.containsKey(nextChar)) {
                lastEdge = startNode.childEdges.get(nextChar);
            } else if(skipcount) {
                System.err.println("Failure Node: ");
                startNode.print(0, System.err);
                System.err.println(); System.err.flush();
                throw new IllegalArgumentException();
            }
            boolean keepMatching = lastEdge != null;
            while(keepMatching) {
                int remaining = end-matchingFrom;
                int matching = skipcount ?
                        Math.min(lastEdge.length(), remaining) :
                        (string != null ? lastEdge.countMatches(string, matchingFrom, end) :
                            lastEdge.countMatches(array, matchingFrom, end));
                matchedTo = matchingFrom + matching;
                if(matching < lastEdge.length()) {
                    // we either matched all the way into the middle of the
                    // current edge, or we found a mismatch in the current edge.
                    // either way, update the matchedTo variable and we're done.
                    keepMatching = false;
                } else {
                    if(matching < remaining) {
                        nextChar = getChar(matchedTo);
                        if(lastEdge.tailNode.childEdges.containsKey(nextChar)) {
                            lastEdge = lastEdge.tailNode.childEdges.get(nextChar);
                            matchingFrom += matching;
                        } else if(skipcount) {
                            System.err.println("ERROR TREE: ");
                            print(System.err); System.err.println(); System.err.flush();
                            String err = String.format("[%s] node:'%s' (next: %c)", toString(), startNode.pathLabel(), nextChar);
                            throw new IllegalArgumentException(err);
                        } else {
                            keepMatching = false;
                        }
                    } else {
                        keepMatching = false;
                    }
                }
            }
        }

        // Number of query characters matched along lastEdge.
        public int lastMatchLength() { return matchedTo - matchingFrom; }

        // Total number of query characters matched.
        public int matchLength() { return matchedTo - start; }

        public boolean inEdgeMiddle() {
            return lastEdge != null && lastMatchLength() < lastEdge.length();
        }

        public boolean completedMatch() { return lastEdge != null && matchedTo == end; }
    }

    /*
     * An edge of the tree, labeled with the substring [start, end) of 'string'.
     */
    private class TreeEdge {
        // This is a pointer to the overall string of which this edge is a substring.
        public TreeString string;
        // The nodes between which this edge exists in the tree.
        public TreeNode headNode, tailNode;
        // These are tricky -- 'end' can be null, and when it is, it takes
        // the value of extState.lastE. This is part of the Ukkonen extension
        // algorithm, and as a result if you want to find the coordinates of
        // a TreeEdge, you should always call start() and end(), the methods.
        private Integer start, end; // coordinates: [start, end)

        // Creates an edge below h with a brand-new tail node.
        public TreeEdge(TreeString str, Integer st, Integer ed, TreeNode h) {
            assert st >= 0;
            assert ed == null ? st < extState.lastE : st < ed;
            string = str;
            start = st;
            end = ed;
            headNode = h;
            tailNode = new TreeNode(this);
        }

        // Creates an edge between two existing nodes. Note that this does not
        // touch t.parentEdge; the caller is responsible for re-parenting t.
        public TreeEdge(TreeString str, Integer st, Integer ed, TreeNode h, TreeNode t) {
            assert st >= 0;
            assert ed == null ? st < extState.lastE : st < ed;
            string = str;
            start = st;
            end = ed;
            headNode = h;
            tailNode = t;
        }

        // Debugging check: fails if this edge still has a symbolic end, otherwise
        // continues checking below the tail node.
        public boolean check() {
            if(end == null) {
                logger.log(Level.SEVERE, String.format("Edge %s still has a [null] for it's end-value.", edgeLabel()));
                return false;
            }
            return tailNode.check();
        }

        public String edgeLabel() {
            return String.format("%s+%c", headNode.pathLabel(), getChar(0));
        }

        // Prints this edge's label, prefixed by '*' if the head node has a suffix
        // link and '-' otherwise, followed by the subtree below the tail node.
        public void print(int indent, boolean printIndent, PrintStream ps) {
            char nodeChar = headNode.suffixLink != null ? '*' : '-';
            for(int i = 0; printIndent && i < indent; i++) { ps.print(" "); }
            //String coords = String.format("(%d,%d)", start, end);
            String coords = "";
            ps.print(String.format("%c%s%s", nodeChar, coords, getSubstring()));
            tailNode.print(indent+length()+coords.length()+1, ps);
        }

        /*
         * Splits this edge 'offset' characters from its start. This edge keeps
         * the upper part and gets a new internal tail node; the returned edge
         * carries the lower part (and this edge's former end, symbolic or not)
         * down to the former tail node.
         *
         * Throws IllegalArgumentException unless 0 < offset < length().
         */
        public TreeEdge split(int offset) {
            if(offset <= 0 || offset >= length()) {
                throw new IllegalArgumentException(String.format(
                        "Illegal edge split offset %d (length %d)",
                        offset, length()));
            }
            TreeNode newInternal = new TreeNode(this);
            TreeEdge newLowerEdge = new TreeEdge(string, start+offset, end, newInternal, tailNode);
            // The former tail node now hangs off the lower edge. Without this its
            // parentEdge would keep pointing at the (shortened) upper edge, so
            // pathLabel() and any walk up through parentEdge would skip the lower
            // edge and the new internal node.
            tailNode.parentEdge = newLowerEdge;
            tailNode = newInternal;
            newInternal.addEdge(newLowerEdge);
            end = start+offset;
            return newLowerEdge;
        }

        // Returns the character 'offset' positions into this edge's label.
        public char getChar(int offset) {
            return string.getChar(start+offset);
        }

        // Counts how many leading characters of this edge's label equal
        // array[startMatch, endMatch), stopping at the first mismatch.
        public int countMatches(char[] array, int startMatch, int endMatch) {
            int ei = start;
            int mi = startMatch;
            int eend = end();
            for(; ei < eend && mi < endMatch; ei++, mi++) {
                assert mi >= 0;
                assert mi < array.length;
                if(array[mi] != string.getChar(ei)) {
                    return ei-start;
                }
            }
            return ei-start;
        }

        // As above, but against another TreeString, using TreeString.matches() so
        // that terminals of different strings never match each other.
        public int countMatches(TreeString str, int startMatch, int endMatch) {
            int ei = start;
            int mi = startMatch;
            int eend = end();
            for(; ei < eend && mi < endMatch; ei++, mi++) {
                if(!string.matches(ei, str, mi)) {
                    return ei-start;
                }
            }
            return ei-start;
        }

        public void setEnd(int e) {
            assert e > start;
            end = e;
        }

        public boolean isEndSymbolic() { return end == null; }

        // True if this edge begins at its string's terminal position. The terminal
        // is at index length()-1; this is the same test TreeNode.addEdge() uses to
        // file an edge under terminalEdges.
        public boolean isTerminal() {
            return start() == string.length()-1;
        }

        public int end() { return end != null ? end : extState.lastE; }
        public int start() { return start; }
        public int length() { return end() - start(); }
        public String getSubstring() { return string.substring(start(), end()); }

        public boolean isLeafEdge() {
            return tailNode.isLeaf();
        }
    }

    /*
     * A log handler that writes "<LEVEL>: <message>" lines to a PrintStream, and
     * optionally closes that stream when the handler is closed.
     */
    private class LoggingHandler extends Handler {
        private PrintStream logstream;
        private boolean closeStream;

        public LoggingHandler(PrintStream ps) {
            logstream = ps;
            closeStream = false;
        }

        public LoggingHandler(PrintStream ps, boolean cs) {
            logstream = ps;
            closeStream = cs;
        }

        public void setCloseStream(boolean cs) { closeStream = cs; }

        @Override
        public void close() throws SecurityException {
            if(closeStream) {
                logstream.close();
            }
        }

        @Override
        public void flush() {
            logstream.flush();
        }

        @Override
        public void publish(LogRecord rec) {
            String msg = String.format("%s: %s", rec.getLevel().toString(), rec.getMessage());
            logstream.println(msg);
        }
    }

    /*
     * The Level.intValue() method apparently doesn't do what we want? So
     * I'll write my own. This lets us filter by "greater than" relations
     * on Level's, with the natural (to me) ordering.
     *
     * Levels other than FINEST, FINE, INFO, WARNING and SEVERE rank below all of
     * those (-1), so e.g. the FINER records produced by Logger.entering() and
     * Logger.exiting() only pass if minLogLevel is itself one of the unranked levels.
     */
    private class LoggingFilter implements Filter {
        public LoggingFilter() {
        }

        @Override
        public boolean isLoggable(LogRecord rec) {
            return isLogging && logValue(rec.getLevel()) >= logValue(minLogLevel);
        }

        private int logValue(Level l) {
            if(l.equals(Level.FINEST)) {
                return 0;
            } else if(l.equals(Level.FINE)) {
                return 1;
            } else if(l.equals(Level.INFO)) {
                return 2;
            } else if(l.equals(Level.WARNING)) {
                return 3;
            } else if(l.equals(Level.SEVERE)) {
                return 4;
            }
            return -1;
        }
    }
}