package org.arabidopsis.ahocorasick;

import java.util.Iterator;

/**
 * <p>An implementation of the Aho-Corasick string searching automaton. This
 * implementation of the
 * <a href="http://portal.acm.org/citation.cfm?id=360855&amp;dl=ACM&amp;coll=GUIDE"
 * target="_blank">Aho-Corasick</a> algorithm is optimized to work with chars.</p>
 *
 * <p>Example usage:</p>
 * <pre>{@code
 * AhoCorasick<String> tree = new AhoCorasick<String>();
 * tree.add("hello", "hello");
 * tree.add("world", "world");
 * tree.prepare();
 *
 * Iterator<SearchResult<String>> searcher = tree.search("hello world".toCharArray());
 * while (searcher.hasNext()) {
 *     SearchResult<String> result = searcher.next();
 *     System.out.println(result.getOutputs());
 *     System.out.println("Found at index: " + result.getLastIndex());
 * }
 * }</pre>
 *
 * <h2>Recent changes</h2>
 * <ul>
 *   <li>Per user request from Carsten Kruege, I've changed the signature of
 *       State.getOutputs() and SearchResults.getOutputs() to Sets rather than
 *       Lists.</li>
 * </ul>
 *
 * @param <T> the type of the outputs attached to keywords
 */
public class AhoCorasick<T> {

    /**
     * Number of low character codes for which the root gets an explicit
     * self-loop in {@link #prepareRoot()}. Characters outside this range are
     * handled lazily by {@link #step(State, char)}.
     */
    private static final int ROOT_SELF_LOOP_RANGE = 256;

    private final State<T> root;
    private boolean prepared;

    /** Creates an empty automaton that has no keywords and is not yet prepared. */
    public AhoCorasick() {
        this.root = new State<T>(0);
        this.prepared = false;
    }

    /**
     * Adds a new keyword with the given output. During search, if the keyword
     * is matched, output will be one of the yielded elements in
     * SearchResults.getOutputs().
     *
     * @param keyword the character sequence to look for
     * @param output  the value to report when {@code keyword} is matched; the
     *                parameter is declared as {@code Object} for compatibility
     *                with existing callers, but it is expected to be a
     *                {@code T}
     * @throws IllegalStateException if {@link #prepare()} has already been called
     */
    @SuppressWarnings("unchecked") // output is stored as T; see the @param note above
    public void add(String keyword, Object output) {
        if (this.prepared) {
            throw new IllegalStateException("can't add keywords after prepare() is called");
        }
        State<T> lastState = this.root.extendAll(keyword.toCharArray());
        lastState.addOutput((T) output);
    }

    /**
     * Prepares the automaton for searching. This must be called before any
     * searching(). Calling it again once the automaton is prepared is a no-op:
     * the fail transitions are built from the keyword trie, and rebuilding them
     * after the root self-loops have been installed would treat the root as one
     * of its own children.
     */
    public void prepare() {
        if (this.prepared) {
            return;
        }
        this.prepareFailTransitions();
        this.prepared = true;
    }

    /**
     * Starts a new search, and returns an Iterator of SearchResults.
     *
     * @param chars the text to scan
     * @return an iterator over the matches found in {@code chars}
     * @throws IllegalStateException if {@link #prepare()} has not been called
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // Searcher is used exactly as before
    public Iterator<SearchResult<T>> search(char[] chars) {
        return new Searcher(this, this.startSearch(chars));
    }

    /**
     * DANGER DANGER: dense algorithm code ahead. Very order dependent.
     * Initializes the fail transitions of all states except for the root.
     *
     * <p>The states are visited breadth-first, so every state's fail target
     * (which is always shallower) already carries its final outputs when they
     * are merged into the deeper state.</p>
     */
    private void prepareFailTransitions() {
        Queue<State<T>> q = new Queue<State<T>>();

        // Seed the queue with the root's real children. This uses the root's
        // own key set instead of a fixed 0..255 scan so that keywords starting
        // with any char are covered. It must run before prepareRoot(), which
        // adds self-loops to the root.
        char[] rootKeys = this.root.keys();
        for (int i = 0; i < rootKeys.length; i++) {
            State<T> child = this.root.get(rootKeys[i]);
            if (child == null || child == this.root) {
                continue;
            }
            child.setFail(this.root);
            q.add(child);
        }

        this.prepareRoot();

        while (!q.isEmpty()) {
            State<T> state = q.pop();
            char[] keys = state.keys();
            for (int i = 0; i < keys.length; i++) {
                char a = keys[i];
                State<T> s = state.get(a);
                q.add(s);
                // Longest proper suffix of s's path that is also a path in the
                // trie: follow the parent's fail chain until a state has an
                // 'a' edge (the root accepts every char).
                State<T> target = this.step(state.getFail(), a);
                s.setFail(target);
                s.getOutputs().addAll(target.getOutputs());
            }
        }
    }

    /**
     * Sets the out transitions of the root to itself for the first
     * {@value #ROOT_SELF_LOOP_RANGE} character codes, if no transition yet
     * exists at this point. Characters above that range get the same
     * treatment on demand in {@link #step(State, char)}.
     */
    private void prepareRoot() {
        for (int i = 0; i < ROOT_SELF_LOOP_RANGE; i++) {
            char c = (char) i;
            if (this.root.get(c) == null) {
                this.root.put(c, this.root);
            }
        }
    }

    /**
     * Computes the state reached from {@code state} on character {@code c},
     * following fail transitions for as long as the current state has no
     * outgoing edge for {@code c}. A missing edge at the root is treated as a
     * self-loop, so the walk never goes past the root.
     *
     * @param state the state to start from; must already have its fail
     *              transition set unless it is the root
     * @param c     the input character
     * @return the next state
     */
    private State<T> step(State<T> state, char c) {
        State<T> next = state.get(c);
        while (next == null) {
            if (state == this.root) {
                return this.root;
            }
            state = state.getFail();
            next = state.get(c);
        }
        return next;
    }

    /**
     * Returns the root of the tree.
     */
    State getRoot() {
        return this.root;
    }

    /**
     * Begins a new search using the raw interface.
     *
     * @param chars the text to scan
     * @return the first match, or {@code null} if there is none
     * @throws IllegalStateException if {@link #prepare()} has not been called
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // raw SearchResult kept for continueSearch's signature
    SearchResult<T> startSearch(char[] chars) {
        if (!this.prepared) {
            throw new IllegalStateException("can't start search until prepare()");
        }
        return continueSearch(new SearchResult(this.root, chars, 0));
    }

    /**
     * Continues the search, given the initial state described by the
     * lastResult.
     *
     * @param lastResult the previous result; scanning resumes at its
     *                   {@code lastIndex} from its {@code lastMatchedState}
     * @return the next result whose state has at least one output, with its
     *         index set to one past the character that completed the match,
     *         or {@code null} when the end of the text is reached first
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // signature intentionally left raw for existing callers
    SearchResult continueSearch(SearchResult lastResult) {
        char[] chars = lastResult.chars;
        State<T> state = lastResult.lastMatchedState;
        for (int i = lastResult.lastIndex; i < chars.length; i++) {
            state = this.step(state, chars[i]);
            if (!state.getOutputs().isEmpty()) {
                return new SearchResult(state, chars, i + 1);
            }
        }
        return null;
    }
}