package org.limewire.collection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.NoSuchElementException;
/**
* An information reTRIEval tree, a.k.a., a prefix tree. A Trie is similar to
* a dictionary, except that keys must be strings. Furthermore, Trie provides
* an efficient means ({@link #getPrefixedBy(String)}) to find all values given
* just a PREFIX of a key.
* <p>
* All retrieval operations run in O(nm) time, where n is the size of the
* key/prefix and m is the size of the alphabet. Some implementations may
* reduce this to O(n log m) or even O(n) time. Insertion operations are
* assumed to be infrequent and may be slower. The space required is roughly
* linear with respect to the sum of the sizes of all keys in the tree, though
* this may be reduced if many keys have common prefixes.
* <p>
* The Trie can be set to ignore case, which is the same as making all
* keys and prefixes lower case. Therefore, ignoring case means the original
* keys cannot be extracted from the Trie.
* <p>
* Restrictions (not necessarily limitations)
* <ul>
* <li>Keys and values may not be null.
* <li>The interface to this is not complete.
* </ul>
*
* See <a href="http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Tree/Trie.html">Tries</a>
* for a discussion of Tries.
* <p>
* This class is not thread-safe.
*<p>
* @modified David Soh (yunharla00@hotmail.com)
* added getIterator() for enhanced AutoCompleteTextField use.
*
*/
public class StringTrie<V> {
/**
* Our representation consists of a tree of nodes whose edges are labelled
* by strings. The first characters of all labels of all edges of a node
* must be distinct. Typically the edges are sorted, but this is
* determined by TrieNode.<p>
*
* An abstract TrieNode is a mapping from String keys to values,
* { <K1, V1>, ..., <KN, VN> }, where all Ki and Kj are distinct for all
* i != j. For any node N, define KEY(N) to be the concatenation of all
* labels on the edges from the root to that node. Then the abstraction
* function is:<p>
*
* <xmp>
* { <KEY(N), N.getValue()> | N is a child of root
* and N.getValue() != null }
* </xmp>
*
* An earlier version used character labels on edges. This made
* implementation simpler but used more memory because one node would be
* allocated to each character in long strings if that string had no
* common prefixes with other elements of the Trie.<p>
*
* <dl>
* <dt>INVARIANT:</dt>
* <dd>For any node N, for any edges Ei and Ej from N,<br>
* i != j <==>
* Ei.getLabel().getCharAt(0) != Ej.getLabel().getCharAt(0)</dd>
* <dd>Also, all invariants for TrieNode and TrieEdge must hold.</dd>
* </dl>
*/
private TrieNode<V> root;
/**
* Indicates whether the case of search keys is ignored.
* If true, keys will be canonicalized to lowercase.
*/
private final boolean ignoreCase;
/**
 * Constructs a new, empty tree.
 *
 * @param ignoreCase if <tt>true</tt>, keys and prefixes are folded to a
 *  single case by {@link #canonicalCase(String)} before being stored or
 *  looked up
 */
public StringTrie(boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    // Build the root here instead of calling the public, overridable
    // clear(): a subclass override must never run against an object whose
    // construction has not finished yet.
    this.root = new TrieNode<V>();
}
/**
 * Empties this trie by replacing the root with a fresh node that has
 * neither a value nor children.
 *
 * @modifies this
 */
public void clear() {
    root = new TrieNode<V>();
}
/**
 * Returns the canonical form of the given key or prefix.<p>
 *
 * When this trie does not ignore case, the argument is returned as is.
 * Otherwise the string is upper-cased and then lower-cased, both using
 * {@link Locale#US}.<p>
 *
 * This method is public so that callers of
 * {@link #getPrefixedBy(String, int, int)}, which does not canonicalize
 * its argument, can apply the same conversion themselves.
 *
 * @param s the string to canonicalize; must not be null
 * @return the canonical form of <tt>s</tt>
 */
public String canonicalCase(final String s) {
    if (ignoreCase) {
        final String upper = s.toUpperCase(Locale.US);
        return upper.toLowerCase(Locale.US);
    }
    return s;
}
/**
 * Matches the pattern <tt>b</tt> against the text
 * <tt>a[startOffset...stopOffset - 1]</tt>.
 *
 * @return the first <tt>j</tt> so that:<br>
 *  <tt>0 &lt;= j &lt; b.length()</tt> AND<br>
 *  <tt>a[startOffset + j] != b[j]</tt> [a and b differ]<br>
 *  OR <tt>stopOffset == startOffset + j</tt> [a is exhausted];<br>
 *  Returns -1 if no such <tt>j</tt> exists, i.e., all of <tt>b</tt>
 *  matched.<br>
 *  Examples:
 *  <ol>
 *  <li>a = "abcde", startOffset = 0, stopOffset = 5, b = "abc"
 *      ==&gt; returns -1
 *  <li>a = "abcde", startOffset = 1, stopOffset = 5, b = "bXd"
 *      ==&gt; returns 1
 *  <li>a = "abcde", startOffset = 1, stopOffset = 3, b = "bcd"
 *      ==&gt; returns 2
 *  </ol>
 *
 * @requires 0 &lt;= startOffset &lt;= stopOffset &lt;= a.length()
 */
private int match(String a, int startOffset, int stopOffset,
        String b) {
    final int patternLength = b.length();
    for (int j = 0; j < patternLength; j++) {
        // Position in a that runs parallel to position j in b.
        final int i = startOffset + j;
        if (i >= stopOffset || a.charAt(i) != b.charAt(j)) {
            return j;
        }
    }
    return -1;
}
/**
 * Maps the given key (which may be empty) to the given value. The key is
 * canonicalized with {@link #canonicalCase(String)} first.
 *
 * @param key the key to insert
 * @param value the value to associate with <tt>key</tt>
 * @return the old value associated with key, or <tt>null</tt> if none
 * @requires value != null
 * @modifies this
 */
public V add(String key, V value) {
    // Canonicalize once, up front.
    key = canonicalCase(key);
    final int keyLength = key.length();

    // Walk down the longest path already present for key[0..pos - 1].
    TrieNode<V> current = root;
    int pos = 0;
    while (pos < keyLength) {
        final TrieEdge<V> edge = current.get(key.charAt(pos));
        if (edge == null) {
            // 1) Additive insert: no edge starts with key[pos], so the
            //    remainder of the key becomes one new edge.
            current.put(key.substring(pos), new TrieNode<V>(value));
            return null;
        }

        final String label = edge.getLabel();
        final int split = match(key, pos, keyLength, label);
        assert split != 0 : "Label didn't start with prefix[0].";

        if (split < 0) {
            // The whole label matched: follow the edge and keep going.
            assert split == -1 : "Bad return value from match: " + pos;
            current = edge.getChild();
            pos += label.length();
            continue;
        }

        // 2) The key diverges from (or ends inside) the label at index
        //    "split". Break the edge in two around a new intermediate node:
        //
        //      current                  current
        //    head+tail |      ==>     head |
        //       child               intermediate
        //                          tail /    \ rest
        //                           child    newNode
        //
        //    If rest is empty there is no newNode; the value is stored
        //    on the intermediate node instead ("splice").
        final TrieNode<V> child = edge.getChild();
        final TrieNode<V> intermediate = new TrieNode<V>();
        final String head = label.substring(0, split);
        final String tail = label.substring(split);
        final String rest = key.substring(pos + split);

        current.remove(label.charAt(0));
        current.put(head, intermediate);
        intermediate.put(tail, child);
        if (rest.length() > 0) {
            intermediate.put(rest, new TrieNode<V>(value));
        } else {
            intermediate.setValue(value);
        }
        return null;
    }

    // 3) Relabel insert: a node for the key already exists, though it may
    //    not have carried a value.
    final V previous = current.getValue();
    current.setValue(value);
    return previous;
}
/**
 * Returns the node whose path from the root spells exactly
 * <tt>prefix</tt>, or null if there is none. (internal)
 *
 * @param prefix the path to follow; it is used as given, so callers
 *  canonicalize it beforehand
 */
private TrieNode<V> fetch(String prefix) {
    final int end = prefix.length();
    TrieNode<V> current = root;
    int pos = 0;
    while (pos < end) {
        // Only one edge can start with prefix[pos].
        final TrieEdge<V> edge = current.get(prefix.charAt(pos));
        if (edge == null) {
            return null;
        }
        // The complete label has to match; stopping part-way through an
        // edge does not identify a node.
        final String label = edge.getLabel();
        final int mismatch = match(prefix, pos, end, label);
        assert mismatch != 0 : "Label didn't start with prefix[0].";
        if (mismatch != -1) {
            return null;
        }
        current = edge.getChild();
        pos += label.length();
    }
    return current;
}
/**
 * Looks up the value stored under the given key. The key is canonicalized
 * with {@link #canonicalCase(String)} before the lookup.
 *
 * @param key the key to look up
 * @return the value mapped to <tt>key</tt>, or <tt>null</tt> if there is
 *  no node for the key or the node carries no value
 */
public V get(String key) {
    final TrieNode<V> node = fetch(canonicalCase(key));
    return node == null ? null : node.getValue();
}
/**
 * Ensures no value is associated with the given key. The key is
 * canonicalized with {@link #canonicalCase(String)} first. Only the value
 * is cleared; the node itself stays in the tree.
 *
 * @param key the key whose value should be dropped
 * @return <tt>true</tt> if a value was actually removed
 * @modifies this
 */
public boolean remove(String key) {
    final TrieNode<V> node = fetch(canonicalCase(key));
    if (node == null) {
        return false;
    }
    //TODO: prune unneeded nodes to save space
    final boolean hadValue = node.getValue() != null;
    node.setValue(null);
    return hadValue;
}
/**
 * Returns an iterator over the values whose keys start with the given
 * prefix, in any order. That is, the iterator yields exactly the values v
 * for which some key k satisfies k.startsWith(prefix) and get(k) == v.
 * The prefix is canonicalized with {@link #canonicalCase(String)} first.
 * The remove() operation on the iterator is unimplemented.
 *
 * @param prefix the key prefix to search for
 * @requires this not modified while iterator in use.
 */
public Iterator<V> getPrefixedBy(String prefix) {
    final String canonical = canonicalCase(prefix);
    // Take the length from the canonical form, not from the argument.
    return getPrefixedBy(canonical, 0, canonical.length());
}
/**
 * Same as getPrefixedBy(prefix.substring(startOffset, stopOffset)), but
 * without creating the substring and without canonicalization.<p>
 *
 * Important: the prefix is used exactly as given. Callers that need
 * case-insensitive matching canonicalize the whole buffer beforehand with
 * the public method <tt>canonicalCase(String)</tt> of this.
 *
 * @param prefix the buffer holding the prefix
 * @param startOffset index of the first prefix character (inclusive)
 * @param stopOffset index just past the last prefix character
 * @requires 0 &lt;= startOffset &lt;= stopOffset &lt;= prefix.length
 * @see #canonicalCase(String)
 */
public Iterator<V> getPrefixedBy(String prefix,
        int startOffset, int stopOffset) {
    // Locate the first node whose KEY is prefixed by "prefix". This is
    // like fetch(prefix), except that a prefix ending in the middle of an
    // edge label still selects that edge's child.
    TrieNode<V> current = root;
    int pos = startOffset;
    while (pos < stopOffset) {
        final TrieEdge<V> edge = current.get(prefix.charAt(pos));
        if (edge == null) {
            return EmptyIterator.emptyIterator();
        }
        final String label = edge.getLabel();
        current = edge.getChild();

        final int mismatch = match(prefix, pos, stopOffset, label);
        assert mismatch != 0 : "Label didn't start with prefix[0].";
        if (pos + mismatch == stopOffset) {
            // a) the prefix ran out inside the label: the child matches.
            break;
        }
        if (mismatch >= 0) {
            // b) prefix and label disagree at some character.
            return EmptyIterator.emptyIterator();
        }
        // c) the whole label matched; continue below the child.
        assert mismatch == -1 : "Bad return value from match: " + pos;
        pos += label.length();
    }

    // Yield the values of the located node and everything below it.
    if (current == null) {
        return EmptyIterator.emptyIterator();
    }
    return new ValueIterator(current);
}
/**
 * Returns an iterator over every value stored in this trie, starting the
 * traversal at the root.
 */
public Iterator<V> getIterator() {
    final TrieNode<V> start = root;
    return new ValueIterator(start);
}
/**
 * Counts the values stored in the trie by walking the iterator returned
 * by {@link #getIterator()} to its end.
 *
 * @return the number of values stored in the trie.
 */
public int size() {
    int count = 0;
    // The iterator must be advanced on every pass; otherwise hasNext()
    // keeps returning true and the loop never ends on a non-empty trie.
    for (Iterator<V> iter = getIterator(); iter.hasNext(); iter.next()) {
        count++;
    }
    return count;
}
/**
 * Iterates over the (non-null) values of a node and of the nodes below
 * it, by unwrapping the nodes produced by a NodeIterator that skips
 * valueless nodes. (internal)
 */
private class ValueIterator extends UnmodifiableIterator<V> {
    /** Supplies the nodes whose values are handed out. */
    private final NodeIterator delegate;

    ValueIterator(TrieNode<V> start) {
        this.delegate = new NodeIterator(start, false);
    }

    public boolean hasNext() {
        return delegate.hasNext();
    }

    public V next() {
        final TrieNode<V> node = delegate.next();
        return node.getValue();
    }
}
/**
 * Ensures that this consumes the minimum amount of memory by trimming
 * the child list of each node the node iterator yields. If
 * <tt>valueCompactor</tt> is not null, each non-null node value is also
 * replaced by <tt>valueCompactor.apply(value)</tt>. Any exceptions thrown
 * by a call to valueCompactor are thrown by this.<p>
 *
 * This method should typically be called after add(..)'ing a number of
 * nodes. Insertions can be done after the call to trim, but they might
 * be slower.
 *
 * @param valueCompactor optional function applied to every stored value;
 *  may be null, in which case values are left untouched
 */
public void trim(Function<V, ? extends V> valueCompactor)
        throws IllegalArgumentException, ClassCastException {
    // Visit nodes with and without values: child lists are trimmed
    // regardless of whether a compactor was supplied.
    for (Iterator<TrieNode<V>> iter = new NodeIterator(root, true); iter.hasNext(); ) {
        TrieNode<V> node = iter.next();
        node.trim();
        if (valueCompactor == null) {
            continue;
        }
        // Apply compactor to value (if any).
        V value = node.getValue();
        if (value != null) {
            node.setValue(valueCompactor.apply(value));
        }
    }
}
/**
 * Depth-first iterator over a start node and the nodes below it. When
 * <tt>withNulls</tt> is false only nodes carrying a non-null value are
 * yielded; when it is true every node is yielded. (internal)
 */
private class NodeIterator extends UnmodifiableIterator<TrieNode<V>> {
    /**
     * Stack for DFS, holding sibling iterators that still have elements.
     * Push and pop from back.
     */
    private final ArrayList<Iterator<TrieNode<V>>> stack = new ArrayList<Iterator<TrieNode<V>>>();
    /** The next node to return, or null once the traversal is finished. */
    private TrieNode<V> nextNode;
    /** Whether nodes without a value are yielded too. */
    private final boolean withNulls;

    /**
     * Creates a new iterator over start and its descendants.
     *
     * @param start the node at which the traversal begins
     * @param withNulls if false, nodes without a value are skipped
     */
    private NodeIterator(TrieNode<V> start, boolean withNulls) {
        this.withNulls = withNulls;
        if (withNulls || start.getValue() != null) {
            nextNode = start;
        } else {
            nextNode = null;
            // start itself does not qualify; look below it.
            advance(start);
        }
    }

    // inherits javadoc comment
    public boolean hasNext() {
        // advance() leaves nextNode null only after the stack has been
        // emptied, so this single test is sufficient and always agrees
        // with next().
        return nextNode != null;
    }

    // inherits javadoc comment
    public TrieNode<V> next() {
        if (nextNode == null) {
            throw new NoSuchElementException();
        }
        TrieNode<V> node = nextNode;
        nextNode = null;
        advance(node);
        return node;
    }

    /**
     * Scans the tree (top-down) below the already visited node until a
     * node that qualifies for next() is found, and stores it in nextNode.
     * Sibling iterators that still have elements are kept on the stack.
     * Returns with nextNode unchanged when nothing is left to visit.
     */
    private void advance(TrieNode<V> node) {
        Iterator<TrieNode<V>> children = node.childrenForward();
        while (true) { // scan siblings and their children
            int size;
            if (children.hasNext()) {
                node = children.next();
                if (children.hasNext()) // save siblings
                    stack.add(children);
                // A valueless node is skipped (and searched below) only
                // when nulls are NOT wanted. With withNulls set, every
                // node qualifies; its children are scanned by the
                // advance(node) call of the following next().
                if (!withNulls && node.getValue() == null)
                    children = node.childrenForward(); // loop from there
                else { // node qualifies for next()
                    nextNode = node;
                    return; // next node exists
                }
            } else if ((size = stack.size()) == 0)
                return; // no next node
            else // no more siblings, return to parent
                children = stack.remove(size - 1);
        }
    }
}
/**
 * Returns a string representation of the tree structure of this, i.e.,
 * the concrete state: a <tt>&lt;root&gt;</tt> line followed by one line
 * per edge, as produced by the recursive helper.
 */
@Override
public String toString() {
    final StringBuilder out = new StringBuilder();
    out.append("<root>");
    toStringHelper(root, out, 1);
    return out.toString();
}
/**
 * Appends a description of the sub tree rooted at <code>start</code> to
 * <code>buf</code>: first the node's value (if any) and a line break,
 * then one line per outgoing edge, each followed recursively by the
 * description of that edge's child. (internal)
 *
 * @param start the node to describe
 * @param buf the buffer to append to
 * @param indent the number of indent units placed before each edge label
 *  of <code>start</code>; children are printed with one unit more
 */
private void toStringHelper(TrieNode<V> start, StringBuilder buf, int indent) {
    // Print value of node.
    V value = start.getValue();
    if (value != null) {
        buf.append(" -> ");
        buf.append(value.toString());
    }
    buf.append("\n");
    // For each child...
    for (Iterator<String> iter = start.labelsForward(); iter.hasNext(); ) {
        // Indent child appropriately.
        for (int i = 0; i < indent; i++)
            buf.append(" ");
        // Print edge.
        String label = iter.next();
        buf.append(label);
        // Recurse to print value. The edge is looked up by the first
        // character of its label.
        TrieNode<V> child = start.get(label.charAt(0)).getChild();
        toStringHelper(child, buf, indent + 1);
    }
}
}
/**
 * A node of the Trie. A node owns a list of outgoing edges, each a
 * [String label, TrieNode child] pair, and an optional value. The first
 * characters of the labels of a node's edges must be distinct.<p>
 *
 * Design note: this is a "dumb" class. It is <i>only</i> responsible for
 * managing its value and its children. None of its operations are
 * recursive; that is the trie's job. Nor does it deal with case.
 */
final class TrieNode<E> {
    /** The value of this node, or null if it has none. */
    private E value = null;

    /**
     * The outgoing edges, kept in one array-backed list.<p>
     *
     * INVARIANT: edges are sorted by distinct first characters of their
     * labels, i.e., for all i &lt; j,<br>
     * children[i].getLabelStart() &lt; children[j].getLabelStart()
     */
    private final ArrayList<TrieEdge<E>> children = new ArrayList<TrieEdge<E>>(0);

    /**
     * Creates a node with no children and no value.
     */
    public TrieNode() { }

    /**
     * Creates a node with no children and the given value.
     */
    public TrieNode(E value) {
        this.value = value;
    }

    /**
     * Returns the value stored in this node, or null if none.
     */
    public E getValue() {
        return value;
    }

    /**
     * Stores the given value in this node, replacing any previous one.
     */
    public void setValue(E value) {
        this.value = value;
    }

    /**
     * Returns the edge at the given position of the child list.
     *
     * @requires 0 &lt;= index &lt; children.size()
     */
    private TrieEdge<E> edgeAt(int index) {
        return children.get(index);
    }

    /**
     * (internal) Binary search over the sorted child list.<p>
     *
     * If an edge whose label starts with <tt>c</tt> exists, its index is
     * returned. Otherwise the result is -1 when <tt>exact</tt> is true,
     * and the index of the last edge whose label start is smaller than
     * <tt>c</tt> when <tt>exact</tt> is false (which is -1 if there is no
     * such edge). In the non-exact case, result + 1 is the position at
     * which an edge starting with <tt>c</tt> would have to be inserted.<p>
     *
     * Examples for a non-exact search in {[0] =&gt; 'a...', [1] =&gt; 'c...'}:
     * '_' yields -1, 'a' yields 0, 'b' yields 0, 'c' yields 1 and
     * 'd' yields 1.
     */
    private int search(char c, boolean exact) {
        int lo = 0;
        int hi = children.size() - 1;
        while (lo <= hi) {
            final int mid = (lo + hi) / 2;
            final char probe = edgeAt(mid).getLabelStart();
            if (probe == c) {
                return mid; // exact match
            }
            if (probe < c) {
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        // No exact match. On loop exit hi points at the closest lower
        // entry (or is -1).
        return exact ? -1 : hi;
    }

    /**
     * Returns the edge (at most one) whose label starts with the given
     * character, or null if no such edge.
     */
    public TrieEdge<E> get(char labelStart) {
        final int index = search(labelStart, true);
        if (index < 0) {
            return null;
        }
        final TrieEdge<E> edge = edgeAt(index);
        assert(edge.getLabelStart() == labelStart);
        return edge;
    }

    /**
     * Adds an edge with the given label leading to the given child,
     * keeping the child list sorted by label start.
     *
     * @requires label not empty.
     * @requires no edge of this already has a label starting with
     *  label.charAt(0).
     * @modifies this
     */
    public void put(String label, TrieNode<E> child) {
        final char first = label.charAt(0);
        // A non-exact search yields the closest lower entry, or -1 when
        // there is none; the new edge goes right behind it. Finding an
        // entry with the same start would violate the precondition.
        final int closest = search(first, false);
        if (closest >= 0) {
            assert edgeAt(closest).getLabelStart() != first :
                "Precondition of TrieNode.put violated.";
        }
        children.add(closest + 1, new TrieEdge<E>(label, child));
    }

    /**
     * Removes the edge (at most one) whose label starts with the given
     * character. Returns true if an edge was actually removed.
     */
    public boolean remove(char labelStart) {
        final int index = search(labelStart, true);
        if (index < 0) {
            return false;
        }
        assert(edgeAt(index).getLabelStart() == labelStart);
        children.remove(index);
        return true;
    }

    /**
     * Shrinks the child list's backing storage to its current size. This
     * should be called after numerous calls to put().
     *
     * @modifies this.
     */
    public void trim() {
        children.trimToSize();
    }

    /**
     * Returns the children of this in forward order,
     * as an iterator of TrieNode.
     */
    public Iterator<TrieNode<E>> childrenForward() {
        return new ChildrenForwardIterator();
    }

    /**
     * Walks the child list front to back, yielding each edge's child.
     */
    private class ChildrenForwardIterator extends UnmodifiableIterator<TrieNode<E>> {
        int cursor = 0;

        public boolean hasNext() {
            return cursor < children.size();
        }

        public TrieNode<E> next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return edgeAt(cursor++).getChild();
        }
    }

    /**
     * Returns the labels of the children of this in forward order,
     * as an iterator of Strings.
     */
    public Iterator<String> labelsForward() {
        return new LabelForwardIterator();
    }

    /**
     * Walks the child list front to back, yielding each edge's label.
     */
    private class LabelForwardIterator extends UnmodifiableIterator<String> {
        int cursor = 0;

        public boolean hasNext() {
            return cursor < children.size();
        }

        public String next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return edgeAt(cursor++).getLabel();
        }
    }

    /**
     * Returns the string form of this node's value, or "NULL" if the node
     * has no value.
     */
    @Override
    public String toString() {
        final Object val = getValue();
        return val != null ? val.toString() : "NULL";
    }

    // Unit test: see TrieNodeTest.
}
/**
 * A labeled edge: a String label paired with the TrieNode it leads to.
 */
final class TrieEdge<E> {
    /** The text on this edge. */
    private final String label;
    /** The node this edge leads to. */
    private final TrieNode<E> child;

    /**
     * Creates an edge with the given label and endpoint.
     *
     * @requires label.length() &gt; 0
     * @requires child != null
     */
    TrieEdge(String label, TrieNode<E> child) {
        this.child = child;
        this.label = label;
    }

    /**
     * Returns the label of this edge.
     */
    public String getLabel() {
        return label;
    }

    /**
     * Returns the first character of the label, i.e., getLabel().charAt(0).
     */
    public char getLabelStart() {
        // Computed on demand; it could be cached if this ever matters.
        return getLabel().charAt(0);
    }

    /**
     * Returns the node this edge leads to.
     */
    public TrieNode<E> getChild() {
        return child;
    }
}