/* * PATTrie.java * * Copyright (C) 2009 Leo Osvald <leo.osvald@gmail.com> * * This file is part of SGLJ. * * SGLJ is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * SGLJ is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library. If not, see <http://www.gnu.org/licenses/>. */ package org.sglj.util; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.Set; import java.util.TreeMap; /** * <p>Implementation of PAT trie.<br> * A description of this data structure can be found here:<br> * <pre>http://en.wikipedia.org/wiki/Suffix_tree</pre> * <br> * This implementation can be used as a multimap; it can associate multiple * different values with a key which is actually a node which correspond * to some string.</p> * <p>The implementation is quite efficient and all operations take * time proportional to the key length, which is some small constant.<br> * Insertion and removal is done in linear time which is proportional to * the size of the elements removed/inserted. * Prefix query takes linear time proportional to the size of the result set * matching that query.</p> * * @author Leo Osvald * * @param <E> type of the elements which are stored * * @version 0.71 */ public class PATTrie<E> { protected Node root; protected static final boolean REMOVE = false; protected static final boolean ADD = true; public PATTrie(final Collection<E> added, final Collection<E> removed) { root = new Node(); } public PATTrie() { this(null, null); } private int getDifferenceIndex(String a, String b) { int diffInd = 0; while(diffInd < a.length() && diffInd < b.length() && a.charAt(diffInd) == b.charAt(diffInd)) ++diffInd; return diffInd; } private boolean insert(final String key, final E value, int ind, Node node) { if(ind >= key.length()) { if(node.addData(value)) { ++node.prefixCount; return true; } return false; } String s = key.substring(ind); int splitInd = 1; Node subtree = node.getChild(key.charAt(ind)); if(subtree == null) { subtree = new Node(node, s); splitInd = s.length(); } else { //inace, pogledaj gdje treba rasdvojiti splitInd = getDifferenceIndex(s, subtree.edge); if(splitInd < subtree.edge.length()) { subtree = subtree.splitEdge(splitInd); } } // System.out.println("->" + subtree.edge.substring(0, splitInd) // + "\t" + subtree.prefixCount); boolean ret = insert(key, value, ind+splitInd, subtree); if(ret) { ++node.prefixCount; } return ret; } /** * Associates specified value with the specified key, forming a new * entry if it was not already associated. * @param key key * @param value value * @return <code>true</code> if entry was inserted, * <code>false</code> otherwise. */ public boolean put(final String key, final E value) { return insert(key, value, 0, root); } private Node getNode(final String key, int ind, final Node node) { if(ind >= key.length()) return node; Node subtree = node.getChild(key.charAt(ind)); if(subtree == null) return null; String s = key.substring(ind); if(!s.startsWith(subtree.edge)) return null; return getNode(key, ind+subtree.edge.length(), subtree); } /** * Returns collection of values associated with the specified key.<br> * If there are no associated values, an immutable empty collection * will be returned.<br> * @param key key * @return collection of values mapped by this key */ public Collection<E> getValues(final String key) { Node node = getNode(key, 0, root); if(node.isEmpty()) return Collections.emptyList(); return new ArrayList<E>(node.data); } /** * Checks whether the specified value is associated with * the specified key. * @param key key * @param value value * @return <code>true</code> if it is, <code>false</code> otherwise. */ public boolean contains(final String key, final E value) { return getNode(key, 0, root).containsData(value); } private Node getNodeMatchingPrefix(final String prefix, int ind, Node node) { if(ind >= prefix.length()) return node; Node subtree = node.getChild(prefix.charAt(ind)); if(subtree == null) return null; String s = prefix.substring(ind); int diffInd = getDifferenceIndex(subtree.edge, s); //if one is not the prefix of another one, there are no results if(diffInd < s.length() && diffInd < subtree.edge.length()) return null; // System.out.println("->" + subtree.edge + "\t" + subtree.prefixCount); return getNodeMatchingPrefix(prefix, ind+diffInd, subtree); } private boolean findPrefix(final String prefix, int ind, Node node, final Collection<E> added) { if(ind >= prefix.length()) { // System.out.println("[FIND] found subtree: "); return updateCollection(added, node); } Node subtree = node.getChild(prefix.charAt(ind)); if(subtree == null) return false; String s = prefix.substring(ind); int diffInd = getDifferenceIndex(subtree.edge, s); //if one is not the prefix of another one, there are no results if(diffInd < s.length() && diffInd < subtree.edge.length()) return false; return findPrefix(prefix, ind+diffInd, subtree, added); } /** * Retrieves all values mapped by the keys whose prefix is the one * specified. * @param prefix key prefix * @param result collection where retrieved values should be added * @return collection to which retrieved values should be added */ public boolean findPrefix(final String prefix, Collection<E> result) { return findPrefix(prefix, 0, root, result); } private boolean remove(final String key, final E value, int ind, Node node) { if(ind >= key.length()) { if(node.removeData(value)) { if(node.isEmpty()) { if(node.hasSingleChild()) { node.getOnlyChild().mergeWithParent(); } else if(node.isLeaf()) { // System.out.println("Removing subtree: " + node.edge); node.destroy(); } } else --node.prefixCount; return true; } return false; } Node subtree = node.getChild(key.charAt(ind)); if(subtree == null) return false; String s = key.substring(ind); int diffInd = getDifferenceIndex(subtree.edge, s); //if one is not the prefix of another one, there are no results if(diffInd < s.length() && diffInd < subtree.edge.length()) return false; // System.out.println("->" + subtree.edge + "\t" + subtree.prefixCount); boolean ret = remove(key, value, ind+diffInd, subtree); if(ret) { // System.out.println("UP: " + node.edge); --node.prefixCount; if(node != root && node.isEmpty() && node.hasSingleChild()) node.getOnlyChild().mergeWithParent(); } return ret; } /** * Removes the specified value associated with the specified key * (if this entry exists). * @param key key * @param value value * @return <code>true</code> if value was removed, <code>false</code> * otherwise. */ public boolean remove(final String key, final E value) { return remove(key, value, 0, root); } private void clear(final Node node) { if(node.next != null) { Collection<Node> children = node.next.values(); for(Node subtree : children) clear(subtree); } node.purgeNext(); node.purgeData(); } /** * Removes all key-value pairs from this collection.<br> * In addition, resets search state regarding search by prefix * (so that further calls to * {@link #continueFindPrefix(String, String, Collection, Collection)} * works properly). */ public void clear() { clear(root); nodeCount = 0; root = new Node(); } private boolean updateCollection(final Collection<E> addedOrRemoved, final Node node) { if(node == null) return false; boolean ret = false; if(node.data != null) { if(addedOrRemoved != null) ret |= addedOrRemoved.addAll(node.data); } if(node.next != null) for(Node subtree : node.next.values()) ret |= updateCollection(addedOrRemoved, subtree); return ret; } private boolean traverse(final Node lower, final Node higher, final boolean addOrRemove, final Collection<E> addedOrRemoved) { boolean ret = false; for(Node curr = lower; curr != higher && curr != root; curr = curr.parent) { //add/remove all from other subtrees for(Node sibling : curr.parent.next.values()) { if(sibling != curr) { // int oldCnt = (result != null ? result.size() : 0); ret |= updateCollection(addedOrRemoved, sibling); // int nowCnt = (result != null ? result.size() : 0); // System.out.println((addOrRemove == ADD ? "[ASCEND] added " // : "[DESCEND] removed ") // + (nowCnt-oldCnt) + "\t subtree: " + sibling.edge); } } if(curr.parent.data != null) { if(addedOrRemoved != null) ret |= addedOrRemoved.addAll(curr.parent.data); } // System.out.println((addOrRemove == ADD ? "-----UP-----" // : "----DOWN----")); } return ret; } private boolean ascend(final Node from, final Node to, final Collection<E> added) { return traverse(from, to, ADD, added); } private boolean descend(final Node from, final Node to, final Collection<E> removed) { return traverse(to, from, REMOVE, removed); } /** * * <p>Continues searching by prefix.<br> * Collections <code>added</code> and <code>removed</code> are cleared * and one of the following occurs: * <ul> * <li>a) if <code>currPrefix</code> matches more keys than * <code>lastPrefix</code>, values mapped by these matched keys * are added to <code>added</code> collection</li> * <li>b) if <code>currPrefix</code> matches less keys than * <code>lastPrefix</code>, values mapped by these matched keys * which <code>currPrefix</code> does not match * (but <code>lastPrefix</code> did match) will be added to * <code>removed</code> collection * <li>c) if the set of keys that match <code>currPrefix</code> * and the set of keys that match <code>lastPrefix</code> are disjunctive * (that is, no key is matched by both prefixes), * values mapped by newly matched keys are added to * <code>added</code> collection. * </ul></p> * <p>The time complexity of this operation is linear to * the number of values added to <code>added</code> and removed * from <code>removed</code> collection.</p> * @param currPrefix current prefix that is searched by * @param lastPrefix last prefix that was searched by * @param added collection * @param removed collection * @return <code>true</code>if results are non-disjunctive (case a or b), * <code>false</code> if this is not the case (which means * that continuation of the search has not succeeded) (case c). */ public boolean continueFindPrefix(String currPrefix, final String lastPrefix, Collection<E> added, Collection<E> removed) { if(added != null) added.clear(); if(removed != null) removed.clear(); //rubni slucajevi if(currPrefix == null) currPrefix = ""; Node currNode = getNodeMatchingPrefix(currPrefix, 0, root); if(currNode == null) return false; Node lastNode = getNodeMatchingPrefix(lastPrefix, 0, root); if(lastNode == null) { // System.out.println("[lastNode == null]"); findPrefix(currPrefix, 0, root, added); return false; } //trivial cases (currNode != null && lastNode != null) // System.out.println("lastNode data: " + (lastNode == root ? "(root)" : // (lastNode == null ? "(lastNode == null)" : lastNode.data))); // System.out.println("currNode data: " + (currNode == root ? "(root)" : // (currNode == null ? "(currNode == null)" : currNode.data))); boolean isDescendant = currPrefix.startsWith(lastPrefix); boolean isAncestor = lastPrefix.startsWith(currPrefix); //if current and last node represent the same subtree, no changes if(lastNode == currNode) return true; boolean ret; //if they are disjunctive, return a new set if(!isDescendant && !isAncestor) { findPrefix(currPrefix, 0, root, added); ret = false; } //if this node is a descendant, descend and remove results //from other subtrees on the route else if(isDescendant) { System.out.println("DESCENDANT: " + currPrefix.substring(lastPrefix.length())); descend(lastNode, currNode, removed); ret = true; } //if this is an ancestor, ascend and add results from the subtrees //on the route else { System.out.println("ASCENDANT"); ascend(lastNode, currNode, added); ret = true; } return ret; } /** * Returns the number of values contained by this collection. * This value is greater than or equal to the one * returned by {@link #nodeCount()} method, as several * values can be mapped by the same key. * @return total number of values */ public int size() { return root.prefixCount; } /** * Checks whether this collection is empty, that is, whether * it contains at least one key-value entry. * @return <code>true</code> if it is empty, <code>false</code> otherwise. */ public boolean isEmpty() { return root.prefixCount == 0; } /** * Returns the number of nodes in the trie.<br> * This method is equivalent to the {@link #keyCount()} method. * @return number of nodes in the trie * @see Node */ public int nodeCount() { return nodeCount; } /** * Returns the total number of keys. * This method is equivalent to the {@link #nodeCount()} method. * @return number of keys in this collection */ public int keyCount() { return nodeCount(); } protected int nodeCount = 0; /** * Node of the trie. * * @author Leo Osvald * */ private class Node { private Set<E> data; private TreeMap<Character, Node> next; private Node parent; private String edge; private int prefixCount; private static final int INIT_DATA_CAPACITY = 2; public Node() { } /** * Creates node which is doubly-linked with its parent node. * @param parent parent node * @param edgeFromParent edge from parent node */ public Node(Node parent, String edgeFromParent) { ++nodeCount; parent.link(edgeFromParent, this); prefixCount = 0; } /** * Creates double link to its future child. * @param edge edge which represent the link * @param child child node which should be created and linked */ public void link(String edge, Node child) { if(next == null) next = new TreeMap<Character, Node>(); next.put(edge.charAt(0), child); child.parent = this; child.edge = edge; } /** * Destroys double link to its parent and links to its children * (but not from its children). */ public void destroy() { // TODO maybe links from children to this node should be destroyed?? // System.out.println("Destroy: " + edge); --nodeCount; if(this.parent != null) { this.parent.next.remove(edge.charAt(0)); if(this.parent.next.isEmpty()) { this.parent.purgeNext(); } this.parent = null; } edge = null; // //let GC deallocate memory purgeNext(); purgeData(); } /** * Splits edge which connects this node to its parent, creating * a new node in between, which is then returned. * @param splitInd index at which the edge should be split * @return node new node that was created */ public Node splitEdge(int splitInd) { //napravi link s parentom->splitNode parent.next.remove(edge.charAt(0)); Node splitNode = new Node(parent, edge.substring(0, splitInd)); //napravi link splitNode -> this splitNode.prefixCount = prefixCount; splitNode.link(edge.substring(splitInd), this); // System.out.println("Splitnode(" + splitNode.edge // + ")\tchild(" + edge + ")"); return splitNode; } /** * Merges this node with its parent. */ public void mergeWithParent() { // System.out.println("Merge(" + parent.edge + "+" + edge + ")"); addData(parent.data); //prespoji edgeve (izbrisi parenta i spoji ovog na grandparenta) Node grandParent = parent.parent; String edgeToParent = parent.edge; parent.destroy(); parent = null; //TODO UNTESTED grandParent.link(edgeToParent + edge, this); } /** * Checks whether the node contains values. * @return <code>true</code> if it contains at least one value, * <code>false</code> otherwise. */ public boolean isEmpty() { return data == null || data.isEmpty(); } /** * Checks whether the node is a leaf node. * @return <code>true</code>if it is leaf but not the root, * <code>false</code> otherwise. */ public boolean isLeaf() { return this != root && (next == null || next.isEmpty()); } /** * Checks whether the node has only one child. * @return <code>true</code> if this is the case, * <code>false</code> otherwise. */ public boolean hasSingleChild() { return next != null && next.size() == 1; } /** * Returns the only child of the node. * @return node child node, or * <code>null</code> if the node has no children. */ public Node getOnlyChild() { return next.firstEntry().getValue(); } public Node getChild(Character c) { //TODO Character -> String return next == null ? null : next.get(c); } /** * Adds value to the node, if it is not already * contained in the node. * @param data value that should be added * @return <code>true</code> if the value is added, * <code>false</code> otherwise. */ public boolean addData(E data) { if(data == null) return false; createDataIfNeeded(); return this.data.add(data); } /** * Adds all values from the specified collection. * @param data values to add * @return <code>true</code> if at least one value was added, * <code>false</code> if none was added (in other words, if all * were already contained). */ public boolean addData(Collection<? extends E> data) { if(data == null) return false; createDataIfNeeded(); return this.data.addAll(data); } /** * Removes the specified value (if it exists). * @param data the value that should be removed * @return <code>true</code> if the value was removed, * <code>false</code> otherwise. */ public boolean removeData(Object data) { if(this.data == null) return false; boolean ret = this.data.remove(data); if(this.data.isEmpty()) this.data = null; return ret; } /** * Removes all values that are contained in the specified collection. * @param data values to be removed * @return <code>true</code> if at least one value was removed, * <code>false</code> otherwise. */ @SuppressWarnings("unused") public boolean removeAllData(Collection<?> data) { if(this.data == null) return false; boolean ret = this.data.removeAll(data); if(this.data.isEmpty()) this.data = null; return ret; } /** * Checks whether the specified value is contained. * @param data value * @return <code>true</code> if it is, * <code>false</code> otherwise. */ public boolean containsData(Object data) { if(isEmpty()) return false; return this.data.contains(data); } /** * Initialization of collection which holds values. */ public void createDataIfNeeded() { if(this.data == null) this.data = new HashSet<E>(INIT_DATA_CAPACITY); } /** * Destruction of the collection which holds values - saves memory. */ public void purgeData() { if(this.data != null) { this.data.clear(); this.data = null; } } /** * Removes all links to children (but not from children). */ public void purgeNext() { if(this.next != null) { this.next.clear(); this.next = null; } } } }