package edu.northwestern.at.utils; import java.io.PrintWriter; import java.util.*; /** * <p> * <code>TernaryTrie</code> is an implementation of a ternary tree. * Methods are provided for inserting strings and searching for strings. * The algorithms in this class are all recursive, and have not been * optimized for any particular purpose. * Data which is inserted is not sorted before insertion, however data * can be inserted beginning with the median of the supplied data. * </p> * * @author <a href="mailto:dfisher@vt.edu">Daniel Fisher</a> * @version $Revision: 2134 $ * $Date: 2005-03-28 14:48:04 -0500 (Mon, 28 Mar 2005) $ */ public class TernaryTrie implements TaggedStrings { /** root node of the ternary tree */ protected TernaryTrieNode root; /** Count of nodes in tree. */ protected int nodeCount = 0; /** Maximum key length. */ protected int maxKeyLength = 0; /** Construct empty ternary trie. */ public TernaryTrie() { } /** Construct a ternary trie from keys and values in a map. * * @param stringsMap The map whose keys and values are to be * added to the trie. * * @param addValuesAsKeys True to add each value as a key * as well. */ public TernaryTrie ( Map<String, String> stringsMap , boolean addValuesAsKeys ) { for ( String s : stringsMap.keySet() ) { String s2 = stringsMap.get( s ); put( s , s2 ); if ( addValuesAsKeys ) { put( s2 , s2 ); } } } /** Construct a ternary trie from values in a set. * * @param stringsSet The set whose values are to be added * to the trie. */ public TernaryTrie( Set<String> stringsSet ) { for ( String s : stringsSet ) { put( s , s ); } } /** Construct a ternary trie from values in a list. * * @param stringsList The list whose values are to be added * to the trie. */ public TernaryTrie( List<String> stringsList ) { for ( String s : stringsList ) { put( s , s ); } } /** Construct a ternary trie from values in a tagged strings list. * * @param stringsList The tagged strings list whose values * are to be added to the trie. * * @param addValuesAsKeys True to add each value as a key * as well. */ public TernaryTrie ( TaggedStrings stringsList , boolean addValuesAsKeys ) { Iterator<String> iterator = stringsList.getAllStrings().iterator(); while ( iterator.hasNext() ) { String s = iterator.next(); String s2 = stringsList.getTag( s ); put( s , s2 ); if ( addValuesAsKeys ) { put( s2 , s2 ); } } } /** Add word and associated value to trie. * * @param word String to insert. * @param value Object value. * * @return Previous value, if any, for word. */ public Object put( String word , Object value ) { Object result = null; if ( word != null ) { TernaryTrieNode wordNode = findNode( root , word , 0 ); if ( wordNode != null ) { result = wordNode.getValue(); wordNode.setValue( value ); } else { root = insertNode( root , word , 0 , value ); } } return result; } /** Get associated value for word. * * @param word String whose associated value we want. * * @return Associated value. */ public Object get( String word ) { Object result = null; if ( word != null ) { TernaryTrieNode wordNode = findNode( root , word , 0 ); if ( wordNode != null ) { result = wordNode.getValue(); } } return result; } /** Check if trie contains a specified key. * * @param word The key to look up. * * @return true if trie contains specified key. */ public boolean containsKey( String word ) { return searchNode( root , word , 0 ); } /** * <p> * This will return an array of strings which partially match the supplied * word. * word should be of the format '.e.e.e' * Where the '.' character represents any valid character. * Possible results from this query include: Helene, delete, or severe * Note that no substring matching occurs, results only include strings of * the same length. * If the supplied word does not contain the '.' character, then a regular * search is preformed. * </p> * * @param word <code>String</code> to search for * @return <code>String[]</code> - of matching words */ public List<String> partialSearch( String word ) { List<String> list = ListFactory.createNewList(); return partialSearchNode( root , list , "" , word , 0 ); } public List<String> prefixSearch( String prefix ) { List<String> matches = null; if ( prefix != null ) { String prefixPattern = prefix + StringUtils.dupl( "." , maxKeyLength - prefix.length() ); List<String> list = ListFactory.createNewList(); matches = prefixSearchNode ( root , list , "" , prefix , prefixPattern , 0 ); } return matches; } protected List<String> prefixSearchNode ( TernaryTrieNode node , List<String> matches , String match , String prefix , String prefixPattern , int index ) { if ( ( node != null ) && ( index < prefixPattern.length() ) ) { char c = prefixPattern.charAt( index ); char split = node.getSplitChar(); if ( ( c == '.' ) || ( c < split ) ) { matches = prefixSearchNode ( node.getLokid() , matches , match , prefix , prefixPattern , index ); } if ( ( c == '.' ) || ( c == split ) ) { String partialWord = match + split; if ( node.isEndOfWord() && ( partialWord.startsWith( prefix ) ) ) { matches.add( partialWord ); } matches = prefixSearchNode ( node.getEqkid() , matches , partialWord , prefix , prefixPattern , index + 1 ); } if ( ( c == '.' ) || ( c > split ) ) { matches = prefixSearchNode ( node.getHikid() , matches , match , prefix , prefixPattern , index ); } } return matches; } /** * <p> * This will return an array of strings which are near to the supplied * word by the supplied distance. * For the query nearSearch("fisher", 2): * Possible results include: cipher, either, fishery, kosher, sister. * If the supplied distance is not > 0, then a regular * search is preformed. * </p> * * @param word <code>String</code> to search for * @param distance <code>int</code> for valid match * @return <code>String[]</code> - of matching words */ public List<String> nearSearch( String word , int distance ) { List<String> list = ListFactory.createNewList(); return nearSearchNode ( root , distance , list , "" , word , 0 ); } /** * <p> * This will return an array of all the words in this * <code>TernaryTrie</code>. * This is a very expensive operation, every node in the tree is traversed. * </p> * * @return <code>String[]</code> - of words */ public List<String> getWords() { List<String> list = ListFactory.createNewList(); return traverseNode( root, "", list ); } /** * <p> * This will recursively insert a word into the <code>TernaryTrie</code> * one node at a time beginning at the supplied node. * </p> * * @param node <code>TernaryTrieNode</code> to put character in * @param word <code>String</code> to be inserted * @param index <code>int</code> of character in word * @return <code>TernaryTrieNode</code> - to insert */ protected TernaryTrieNode insertNode ( TernaryTrieNode node, final String word, final int index, final Object value ) { if (index < word.length()) { final char c = word.charAt(index); if (node == null) { node = new TernaryTrieNode(c); } final char split = node.getSplitChar(); if (c < split) { node.setLokid(insertNode(node.getLokid(), word, index, value)); } else if (c == split) { if (index == word.length() - 1) { node.setEndOfWord(true); node.setValue(value); nodeCount++; maxKeyLength = Math.max( maxKeyLength , word.length() ); } node.setEqkid(insertNode(node.getEqkid(), word, index + 1, value)); } else { node.setHikid(insertNode(node.getHikid(), word, index, value)); } } return node; } /** * <p> * This will recursively search for a word in the <code>TernaryTrie</code> * one node at a time beginning at the supplied node. * </p> * * @param node <code>TernaryTrieNode</code> to search in * @param word <code>String</code> to search for * @param index <code>int</code> of character in word * @return <code>boolean</code> - whether or not word was found */ protected boolean searchNode(final TernaryTrieNode node, final String word, final int index) { boolean success = false; if (node != null && index < word.length()) { final char c = word.charAt(index); final char split = node.getSplitChar(); if (c < split) { return searchNode(node.getLokid(), word, index); } else if (c > split) { return searchNode(node.getHikid(), word, index); } else { if (index == word.length() - 1) { if (node.isEndOfWord()) { success = true; } } else { return searchNode(node.getEqkid(), word, index + 1); } } } return success; } protected TernaryTrieNode findNode ( final TernaryTrieNode node , final String word , final int index ) { if (node != null && index < word.length()) { final char c = word.charAt(index); final char split = node.getSplitChar(); if (c < split) { return findNode(node.getLokid(), word, index); } else if (c > split) { return findNode(node.getHikid(), word, index); } else { if (index == word.length() - 1) { if (node.isEndOfWord()) { return node; } } else { return findNode(node.getEqkid(), word, index + 1); } } } return null; } /** * <p> * This will recursively search for a partial word in the * <code>TernaryTrie</code> * one node at a time beginning at the supplied node. * </p> * * @param node <code>TernaryTrieNode</code> to search in * @param matches <code>ArrayList</code> of partial matches * @param match <code>String</code> the current word being examined * @param word <code>String</code> to search for * @param index <code>int</code> of character in word * @return <code>ArrayList</code> - of matches */ protected List<String> partialSearchNode ( final TernaryTrieNode node, List<String> matches, final String match, final String word, final int index ) { if (node != null && index < word.length()) { final char c = word.charAt(index); final char split = node.getSplitChar(); if (c == '.' || c < split) { matches = partialSearchNode(node.getLokid(), matches, match, word, index); } if (c == '.' || c == split) { if (index == word.length() - 1) { if (node.isEndOfWord()) { matches.add(match + split); } } else { matches = partialSearchNode(node.getEqkid(), matches, match + split, word, index + 1); } } if (c == '.' || c > split) { matches = partialSearchNode(node.getHikid(), matches, match, word, index); } } return matches; } /** * <p> * This will recursively search for a near match word in the * <code>TernaryTrie</code> * one node at a time beginning at the supplied node. * </p> * * @param node <code>TernaryTrieNode</code> to search in * @param distance <code>int</code> of a valid match, must be > 0 * @param matches <code>ArrayList</code> of near matches * @param match <code>String</code> the current word being examined * @param word <code>String</code> to search for * @param index <code>int</code> of character in word * @return <code>ArrayList</code> - of matches */ protected List<String> nearSearchNode ( TernaryTrieNode node, int distance, List<String> matches, String match, String word, int index ) { if ( ( node != null ) && ( distance >= 0 ) ) { final char c; if ( index < word.length() ) { c = word.charAt( index ); } else { c = (char)(-1); // c = (char)0; } char split = node.getSplitChar(); if ( ( distance > 0 ) || ( c < split ) ) { matches = nearSearchNode ( node.getLokid() , distance , matches , match , word, index ); } String newMatch = match + split; if ( c == split ) { if ( node.isEndOfWord() && ( distance >= 0 ) && ( ( newMatch.length() + distance ) >= word.length() ) ) { matches.add( newMatch ); //System.out.println( "newSearchNode: eq1: adding " + newMatch ); } matches = nearSearchNode ( node.getEqkid() , distance , matches , newMatch , word , index + 1 ); } else { if ( node.isEndOfWord() && ( ( distance - 1 ) >= 0 ) && ( newMatch.length() + ( distance - 1 ) >= word.length() ) ) { matches.add( newMatch ); //System.out.println( "newSearchNode: eq2: adding " + newMatch ); } matches = nearSearchNode ( node.getEqkid() , distance - 1 , matches , newMatch , word , index + 1 ); } if ( ( distance > 0 ) || ( c > split ) ) { matches = nearSearchNode ( node.getHikid() , distance , matches , match , word , index ); } } return matches; } /** * <p> * This will recursively traverse every node in the * <code>TernaryTrie</code> * one node at a time beginning at the supplied node. * The result is a string representing every word, which is delimited by * the LINE_SEPARATOR character. * </p> * * @param node <code>TernaryTrieNode</code> to begin traversing * @param s <code>String</code> of words found at the supplied node * @param words <code>ArrayList</code> which will be returned * (recursive function) * @return <code>String</code> - containing all words from the supplied node */ protected List<String> traverseNode ( TernaryTrieNode node, String s, List<String> words ) { if ( node != null ) { words = traverseNode( node.getLokid() , s , words ); String c = String.valueOf( node.getSplitChar() ); if ( node.getEqkid() != null ) { if ( node.endOfWord ) { words.add( s + c ); } words = traverseNode( node.getEqkid() , s + c , words ); } else { words.add( s + c ); } words = traverseNode( node.getHikid() , s , words ); } return words; } /** Return size of trie (# of terminal nodes) * * @return Number of terminal nodes (nodes with data values). */ public int size() { return nodeCount; } /** See if specified string exists. * * @param string The string. * * @return True if specified string exists. */ public boolean containsString( String string ) { return containsKey( string ); } /** Get the tag value associated with a string. * * @param string The string. * * @return The tag value associated with the string. * May be null. */ public String getTag( String string ) { String result = null; Object value = get( string ); if ( value != null ) { result = value.toString(); } return result; } /** Set the tag value associated with a string. * * @param string The string. * @param tag The tag. */ public void putTag( String string , String tag ) { put( string , tag ); } /** Get number of strings. * * @return Number of strings. */ public int getStringCount() { return nodeCount; } /** Get set of all unique string values. * * @return Set of all unique string values. */ public Set<String> getAllStrings() { Set<String> result = SetFactory.createNewSet(); List<String> list = ListFactory.createNewList(); result.addAll( traverseNode( root , "" , list ) ); return result; } /** Get set of all unique tag values as strings. * * @return Set of all unique string tag values. */ public Set<String> getAllTags() { Set<String> result = SetFactory.createNewSet(); List<String> list = ListFactory.createNewList(); List<String> stringsList = traverseNode( root , "" , list ); for ( String key : stringsList ) { result.add( get( key ).toString() ); } return result; } }