package com.limegroup.gnutella.util; import java.util.ArrayList; import java.util.Iterator; import java.util.Locale; import java.util.NoSuchElementException; import com.limegroup.gnutella.Assert; /** * An information reTRIEval tree, a.k.a., a prefix tree. A Trie is similar to * a dictionary, except that keys must be strings. Furthermore, Trie provides * an efficient means (getPrefixedBy()) to find all values given just a PREFIX * of a key.<p> * * All retrieval operations run in O(nm) time, where n is the size of the * key/prefix and m is the size of the alphabet. Some implementations may * reduce this to O(n log m) or even O(n) time. Insertion operations are * assumed to be infrequent and may be slower. The space required is roughly * linear with respect to the sum of the sizes of all keys in the tree, though * this may be reduced if many keys have common prefixes.<p> * * The Trie can be set to ignore case. Doing so is the same as making all * keys and prefixes lower case. That means the original keys cannot be * extracted from the Trie.<p> * * Restrictions (not necessarily limitations!) * <ul> * <li><b>This class is not synchronized.</b> Do that externally if you desire. * <li>Keys and values may not be null. * <li>The interface to this is not complete. * </ul> * * See http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Tree/Trie.html for a * discussion of Tries. * * @modified David Soh (yunharla00@hotmail.com) * added getIterator() for enhanced AutoCompleteTextField use. * */ public class Trie { /** * Our representation consists of a tree of nodes whose edges are labelled * by strings. The first characters of all labels of all edges of a node * must be distinct. Typically the edges are sorted, but this is * determined by TrieNode.<p> * * An abstract TrieNode is a mapping from String keys to values, * { <K1, V1>, ..., <KN, VN> }, where all Ki and Kj are distinct for all * i != j. For any node N, define KEY(N) to be the concatenation of all * labels on the edges from the root to that node. Then the abstraction * function is:<p> * * <blockquote> * { <KEY(N), N.getValue() | N is a child of root * and N.getValue() != null} * </blockquote> * * An earlier version used character labels on edges. This made * implementation simpler but used more memory because one node would be * allocated to each character in long strings if that string had no * common prefixes with other elements of the Trie.<p> * * <dl> * <dt>INVARIANT:</td> * <dd>For any node N, for any edges Ei and Ej from N,<br> * i != j <==> * Ei.getLabel().getCharAt(0) != Ej.getLabel().getCharAt(0)</dd> * <dd>Also, all invariants for TrieNode and TrieEdge must hold.</dd> * </dl> */ private TrieNode root; /** * Indicates whever search keys are case-sensitive or not. * If true, keys will be canonicalized to lowercase. */ private boolean ignoreCase; /** * The constant EmptyIterator to return when nothing matches. */ private final static Iterator EMPTY_ITERATOR = new EmptyIterator(); /** * Constructs a new, empty tree. */ public Trie(boolean ignoreCase) { this.ignoreCase = ignoreCase; clear(); } /** * Makes this empty. * @modifies this */ public void clear() { this.root = new TrieNode(); } /** * Returns the canonical version of the given string.<p> * * In the basic version, strings are added and searched without * modification. So this simply returns its parameter s.<p> * * Other overrides may also perform a conversion to the NFC form * (interoperable across platforms) or to the NFKC form after removal of * accents and diacritics from the NFKD form (ideal for searches using * strings in natural language).<p> * * Made public instead of protected, because the public Prefix operations * below may need to use a coherent conversion of search prefixes. */ public String canonicalCase(final String s) { if (!ignoreCase) return s; return s.toUpperCase(Locale.US).toLowerCase(Locale.US); } /** * Matches the pattern <tt>b</tt> against the text * <tt>a[startOffset...stopOffset - 1]</tt>. * * @return the first <tt>j</tt> so that:<br> * <tt>0 <= i < b.length()</tt> AND<br> * <tt>a[startOffset + j] != b[j]</tt> [a and b differ]<br> * OR <tt>stopOffset == startOffset + j</tt> [a is undefined];<br> * Returns -1 if no such <tt>j</tt> exists, i.e., there is a match.<br> * Examples: * <ol> * <li>a = "abcde", startOffset = 0, stopOffset = 5, b = "abc"<br> * abcde ==> returns -1<br> * abc * <li>a = "abcde", startOffset = 1, stopOffset = 5, b = "bXd"<br> * abcde ==> returns 1 * bXd * <li>a = "abcde", startOffset = 1, stopOffset = 3, b = "bcd"<br> * abc ==> returns 2<br> * bcd * </ol> * * @requires 0 <= startOffset <= stopOffset <= a.length() */ private final int match(String a, int startOffset, int stopOffset, String b) { //j is an index into b //i is a parallel index into a int i = startOffset; for (int j = 0; j < b.length(); j++) { if (i >= stopOffset) return j; if (a.charAt(i) != b.charAt(j)) return j; i++; } return -1; } /** * Maps the given key (which may be empty) to the given value. * * @return the old value associated with key, or <tt>null</tt> if none * @requires value != null * @modifies this */ public Object add(String key, Object value) { // early conversion of key, for best performance key = canonicalCase(key); // Find the largest prefix of key, key[0..i - 1], already in this. TrieNode node = root; int i = 0; while (i < key.length()) { // Find the edge whose label starts with key[i]. TrieEdge edge = node.get(key.charAt(i)); if (edge == null) { // 1) Additive insert. TrieNode newNode = new TrieNode(value); node.put(key.substring(i), newNode); return null; } // Now check that rest of label matches String label = edge.getLabel(); int j = match(key, i, key.length(), label); Assert.that(j != 0, "Label didn't start with prefix[0]."); if (j >= 0) { // 2) Prefix overlaps perfectly with just part of edge label // Do split insert as follows... // // node node ab = label // ab | ==> a | a = label[0...j - 1] (inclusive) // child intermediate b = label[j...] (inclusive) // b / \ c c = key[i + j...] (inclusive) // child newNode // // ...unless c = "", in which case you just do a "splice // insert" by ommiting newNew and setting intermediate's value. TrieNode child = edge.getChild(); TrieNode intermediate = new TrieNode(); String a = label.substring(0, j); //Assert.that(canonicalCase(a).equals(a), "Bad edge a"); String b = label.substring(j); //Assert.that(canonicalCase(b).equals(b), "Bad edge a"); String c = key.substring(i + j); if (c.length() > 0) { // Split. TrieNode newNode = new TrieNode(value); node.remove(label.charAt(0)); node.put(a, intermediate); intermediate.put(b, child); intermediate.put(c, newNode); } else { // Splice. node.remove(label.charAt(0)); node.put(a, intermediate); intermediate.put(b, child); intermediate.setValue(value); } return null; } // Prefix overlaps perfectly with all of edge label. // Keep searching. Assert.that(j == -1, "Bad return value from match: " + i); node = edge.getChild(); i += label.length(); } // 3) Relabel insert. Prefix already in this, though not necessarily // associated with a value. Object ret = node.getValue(); node.setValue(value); return ret; } /** * Returns the node associated with prefix, or null if none. (internal) */ private TrieNode fetch(String prefix) { // This private method uses prefixes already in canonical form. TrieNode node = root; for (int i = 0; i < prefix.length(); ) { // Find the edge whose label starts with prefix[i]. TrieEdge edge = node.get(prefix.charAt(i)); if (edge == null) return null; // Now check that rest of label matches. String label = edge.getLabel(); int j = match(prefix, i, prefix.length(), label); Assert.that(j != 0, "Label didn't start with prefix[0]."); if (j != -1) return null; i += label.length(); node = edge.getChild(); } return node; } /** * Returns the value associated with the given key, or null if none. * * @return the <tt>Object</tt> value or <tt>null</tt> */ public Object get(String key) { // early conversion of search key key = canonicalCase(key); // search the node associated with key, if it exists TrieNode node = fetch(key); if (node == null) return null; // key exists, return the value return node.getValue(); } /** * Ensures no values are associated with the given key. * * @return <tt>true</tt> if any values were actually removed * @modifies this */ public boolean remove(String key) { // early conversion of search key key = canonicalCase(key); // search the node associated with key, if it exists TrieNode node = fetch(key); if (node == null) return false; // key exists and can be removed. //TODO: prune unneeded nodes to save space boolean ret = node.getValue() != null; node.setValue(null); return ret; } /** * Returns an iterator (of Object) of the values mapped by keys in this * that start with the given prefix, in any order. That is, the returned * iterator contains exactly the values v for which there exists a key k * so that k.startsWith(prefix) and get(k) == v. The remove() operation * on the iterator is unimplemented. * * @requires this not modified while iterator in use */ public Iterator getPrefixedBy(String prefix) { // Early conversion of search key prefix = canonicalCase(prefix); // Note that canonicalization MAY have changed the prefix length! return getPrefixedBy(prefix, 0, prefix.length()); } /** * Same as getPrefixedBy(prefix.substring(startOffset, stopOffset). * This is useful as an optimization in certain applications to avoid * allocations.<p> * * Important: canonicalization of prefix substring is NOT performed here! * But it can be performed early on the whole buffer using the public * method <tt>canonicalCase(String)</tt> of this. * * @requires 0 <= startOffset <= stopOffset <= prefix.length * @see #canonicalCase(String) */ public Iterator getPrefixedBy(String prefix, int startOffset, int stopOffset) { // Find the first node for which "prefix" prefixes KEY(node). (See the // implementation overview for a definition of KEY(node).) This code is // similar to fetch(prefix), except that if prefix extends into the // middle of an edge label, that edge's child is considered a match. TrieNode node = root; for (int i = startOffset; i < stopOffset; ) { // Find the edge whose label starts with prefix[i]. TrieEdge edge = node.get(prefix.charAt(i)); if (edge == null) { return EMPTY_ITERATOR; } // Now check that rest of label matches node = edge.getChild(); String label = edge.getLabel(); int j = match(prefix, i, stopOffset, label); Assert.that(j != 0, "Label didn't start with prefix[0]."); if (i + j == stopOffset) { // a) prefix overlaps perfectly with just part of edge label break; } else if (j >= 0) { // b) prefix and label differ at some point node = null; break; } else { // c) prefix overlaps perfectly with all of edge label. Assert.that(j == -1, "Bad return value from match: " + i); } i += label.length(); } // Yield all children of node, including node itself. if (node == null) return EMPTY_ITERATOR; else return new ValueIterator(node); } /** * Returns all values (entire Trie) */ public Iterator getIterator() { return new ValueIterator(root); } /** * Returns all the (non-null) values associated with a given * node and its children. (internal) */ private class ValueIterator extends NodeIterator { ValueIterator(TrieNode start) { super(start, false); } // inherits javadoc comment public Object next() { return ((TrieNode)super.next()).getValue(); } } /** * Yields nothing. (internal) */ private static class EmptyIterator extends UnmodifiableIterator { // inherits javadoc comment public boolean hasNext() { return false; } // inherits javadoc comment public Object next() { throw new NoSuchElementException(); } } /** * Ensures that this consumes the minimum amount of memory. If * valueCompactor is not null, also sets each node's value to * valueCompactor.apply(node). Any exceptions thrown by a call to * valueCompactor are thrown by this.<p> * * This method should typically be called after add(..)'ing a number of * nodes. Insertions can be done after the call to compact, but they might * be slower. Because this method only affects the performance of this, * there is no <tt>modifies</tt> clause listed. */ public void trim(Function valueCompactor) throws IllegalArgumentException, ClassCastException { if (valueCompactor != null) { // For each node in this... for (Iterator iter = new NodeIterator(root, true); iter.hasNext(); ) { TrieNode node = (TrieNode)iter.next(); node.trim(); // Apply compactor to value (if any). Object value = node.getValue(); if (value != null) node.setValue(valueCompactor.apply(value)); } } } public class NodeIterator extends UnmodifiableIterator { /** * Stack for DFS. Push and pop from back. The last element * of stack is the next node who's value will be returned.<p> * * INVARIANT: Top of stack contains the next node with not null * value to pop. All other elements in stack are iterators. */ private ArrayList /* of Iterator of TrieNode */ stack = new ArrayList(); private boolean withNulls; /** * Creates a new iterator that yields all the nodes of start and its * children that have values (ignoring internal nodes). */ public NodeIterator(TrieNode start, boolean withNulls) { this.withNulls = withNulls; if (withNulls || start.getValue() != null) // node has a value, push it for next stack.add(start); else // scan node children to find the next node advance(start); } // inherits javadoc comment public boolean hasNext() { return !stack.isEmpty(); } // inherits javadoc comment public Object next() { int size; if ((size = stack.size()) == 0) throw new NoSuchElementException(); TrieNode node = (TrieNode)stack.remove(size - 1); advance(node); return node; } /** * Scan the tree (top-down) starting at the already visited node * until finding an appropriate node with not null value for next(). * Keep unvisited nodes in a stack of siblings iterators. Return * either an empty stack, or a stack whose top will be the next node * returned by next(). */ private void advance(TrieNode node) { Iterator children = node.childrenForward(); while (true) { // scan siblings and their children int size; if (children.hasNext()) { node = (TrieNode)children.next(); if (children.hasNext()) // save siblings stack.add(children); // check current node and scan its sibling if necessary if (withNulls || node.getValue() == null) children = node.childrenForward(); // loop from there else { // node qualifies for next() stack.add(node); return; // next node exists } } else if ((size = stack.size()) == 0) return; // no next node else // no more siblings, return to parent children = (Iterator)stack.remove(size - 1); } } } /** * Returns a string representation of the tree state of this, i.e., the * concrete state. (The version of toString commented out below returns * a representation of the abstract state of this. */ public String toString() { StringBuffer buf = new StringBuffer(); buf.append("<root>"); toStringHelper(root, buf, 1); return buf.toString(); } /** * Prints a description of the substree starting with start to buf. * The printing starts with the given indent level. (internal) */ private void toStringHelper(TrieNode start, StringBuffer buf, int indent) { // Print value of node. if (start.getValue() != null) { buf.append(" -> "); buf.append(start.getValue().toString()); } buf.append("\n"); //For each child... for (Iterator iter = start.labelsForward(); iter.hasNext(); ) { // Indent child appropriately. for (int i = 0; i < indent; i++) buf.append(" "); // Print edge. String label = (String)iter.next(); buf.append(label); // Recurse to print value. TrieNode child = start.get(label.charAt(0)).getChild(); toStringHelper(child, buf, indent + 1); } } } /** * A node of the Trie. Each Trie has a list of children, labelled by strings. * Each of these [String label, TrieNode child] pairs is considered an "edge". * The first character of each label must be distinct. When managing * children, different implementations may trade space for time. Each node * also stores an arbitrary Object value.<p> * * Design note: this is a "dumb" class. It is <i>only</i> responsible for * managing its value and its children. None of its operations are recursive; * that is Trie's job. Nor does it deal with case. */ final class TrieNode { /** * The value of this node. */ private Object value = null; /** * The list of children. Children are stored as a sorted Vector because * it is a more compact than a tree or linked lists. Insertions and * deletions are more expensive, but they are rare compared to * searching.<p> * * INVARIANT: children are sorted by distinct first characters of edges, * i.e., for all i < j,<br> * children[i].edge.charAt(0) < children[j].edge.charAt(0) */ private ArrayList /* of TrieEdge */ children = new ArrayList(0); /** * Creates a trie with no children and no value. */ public TrieNode() { } /** * Creates a trie with no children and the given value. */ public TrieNode(Object value) { this.value = value; } /** * Gets the value associated with this node, or null if none. */ public Object getValue() { return value; } /** * Sets the value associated with this node. */ public void setValue(Object value) { this.value = value; } /** * Get the nth child edge of this node. * * @requires 0 <= i < children.size() */ private final TrieEdge get(int i) { return (TrieEdge)children.get(i); } /** * (internal) If exact, returns the unique i so that: * children[i].getLabelStart() == c<br> * If !exact, returns the largest i so that: * children[i].getLabelStart() <= c<br> * In either case, returns -1 if no such i exists.<p> * * This method uses binary search and runs in O(log N) time, where * N = children.size().<br> * The standard Java binary search methods could not be used because they * only return exact matches. Also, they require allocating a dummy Trie. * * Example1: Search non exact c == '_' in {[0] => 'a...', [1] => 'c...'}; * start loop with low = 0, high = 1; * middle = 0, cmiddle == 'a', c < cmiddle, high = 0 (low == 0); * middle = 0, cmiddle == 'a', c < cmiddle, high = -1 (low == 0); * end loop; return high == -1 (no match, insert at 0). * Example2: Search non exact c == 'a' in {[0] => 'a', [1] => 'c'} * start loop with low = 0, high = 1; * middle = 0, cmiddle == 'a', c == cmiddle, * abort loop by returning middle == 0 (exact match). * Example3: Search non exact c == 'b' in {[0] => 'a...', [1] => 'c...'}; * start loop with low = 0, high = 1; * middle = 0, cmiddle == 'a', cmiddle < c, low = 1 (high == 1); * middle = 1, cmiddle == 'c', c < cmiddle, high = 0 (low == 1); * end loop; return high == 0 (no match, insert at 1). * Example4: Search non exact c == 'c' in {[0] => 'a...', [1] => 'c...'}; * start loop with low = 0, high = 1; * middle = 0, cmiddle == 'a', cmiddle < c, low = 1 (high == 1); * middle = 1, cmiddle == 'c', c == cmiddle, * abort loop by returning middle == 1 (exact match). * Example5: Search non exact c == 'd' in {[0] => 'a...', [1] => 'c...'}; * start loop with low = 0, high = 1; * middle = 0, cmiddle == 'a', cmiddle < c, low = 1 (high == 1); * middle = 1, cmiddle == 'c', cmiddle < c, low = 2 (high == 1); * end loop; return high == 1 (no match, insert at 2). */ private final int search(char c, boolean exact) { // This code is stolen from IntSet.search. int low = 0; int high = children.size() - 1; while (low <= high) { int middle = (low + high) / 2; char cmiddle = get(middle).getLabelStart(); if (cmiddle < c) low = middle + 1; else if (c < cmiddle) high = middle - 1; else // c == cmiddle return middle; // Return exact match. } if (exact) return -1; // Return no match. return high; // Return closest *lower or equal* match. (This works!) } /** * Returns the edge (at most one) whose label starts with the given * character, or null if no such edge. */ public TrieEdge get(char labelStart) { int i = search(labelStart, true); if (i < 0) return null; TrieEdge ret = get(i); Assert.that(ret.getLabelStart() == labelStart); return ret; } /** * Inserts an edge with the given label to the given child to this. * Keeps all edges binary sorted by their label start. * * @requires label not empty. * @requires for all edges E in this, label.getLabel[0] != E not already * mapped to a node. * @modifies this */ public void put(String label, TrieNode child) { char labelStart; int i; // If there's a match it is the closest lower or equal one, and // precondition requires it to be lower, so we add the edge *after* // it. If there's no match, there are two cases: the Trie is empty, // or the closest match returned is the last edge in the list. if ((i = search(labelStart = label.charAt(0), // find closest match false)) >= 0) { Assert.that(get(i).getLabelStart() != labelStart, "Precondition of TrieNode.put violated."); } children.add(i + 1, new TrieEdge(label, child)); } /** * Removes the edge (at most one) whose label starts with the given * character. Returns true if any edges where actually removed. */ public boolean remove(char labelStart) { int i; if ((i = search(labelStart, true)) < 0) return false; Assert.that(get(i).getLabelStart() == labelStart); children.remove(i); return true; } /** * Ensures that this's children take a minimal amount of storage. This * should be called after numerous calls to add(). * * @modifies this */ public void trim() { children.trimToSize(); } /** * Returns the children of this in forward order, * as an iterator of TrieNode. */ public Iterator childrenForward() { return new ChildrenForwardIterator(); } /** * Maps (lambda(edge) edge.getChild) on children.iterator(). */ private class ChildrenForwardIterator extends UnmodifiableIterator { int i = 0; public boolean hasNext() { return i < children.size(); } public Object next() { if (i < children.size()) return get(i++).getChild(); throw new NoSuchElementException(); } } /** * Returns the children of this in forward order, * as an iterator of TrieNode. */ /* public Iterator childrenBackward() { return new ChildrenBackwardIterator(); } */ /** * Maps (lambda(edge) edge.getChild) on children.iteratorBackward(). */ /* private class ChildrenBackwardIterator extends UnmodifiableIterator { int i = children.size() - 1; public boolean hasNext() { return i >= 0; } public Object next() { if (i >= 0) return get(i--).getChild(); throw new NoSuchElementException(); } } */ /** * Returns the labels of the children of this in forward order, * as an iterator of Strings. */ public Iterator labelsForward() { return new LabelForwardIterator(); } /** * Maps (lambda(edge) edge.getLabel) on children.iterator() */ private class LabelForwardIterator extends UnmodifiableIterator { int i = 0; public boolean hasNext() { return i < children.size(); } public Object next() { if (i < children.size()) return get(i++).getLabel(); throw new NoSuchElementException(); } } /** * Returns the labels of the children of this in backward order, * as an iterator of Strings. */ /* public Iterator labelsBackward() { return new LabelBackwardIterator(); } */ /** * Maps (lambda(edge) edge.getLabel) on children.iteratorBackward() */ /* private class LabelBackwardIterator extends UnmodifiableIterator { int i = children.size() - 1; public boolean hasNext() { return i >= 0; } public Object next() { if (i >= 0) return get(i--).getLabel(); throw new NoSuchElementException(); } } */ // inherits javadoc comment. public String toString() { Object val = getValue(); if (val != null) return val.toString(); return "NULL"; } /** * Unit test. * @see TrieNodeTest */ } /** * A labelled edge, i.e., a String label and a TrieNode endpoint. */ final class TrieEdge { private String label; private TrieNode child; /** * @requires label.size() > 0 * @requires child != null */ TrieEdge(String label, TrieNode child) { this.label = label; this.child = child; } public String getLabel() { return label; } /** * Returns the first character of the label, i.e., getLabel().charAt(0). */ public char getLabelStart() { // You could store this char as an optimization if needed. return label.charAt(0); } public TrieNode getChild() { return child; } }