/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.arabidopsis.ahocorasick;

import java.util.Iterator;

/*
 * jon:
 *
 * The main modification from the original is to make this fully
 * type-parameterized.
 */

/**
 * <p>
 * An implementation of the Aho-Corasick string searching automaton. This
 * implementation of the <a
 * href="http://portal.acm.org/citation.cfm?id=360855&amp;dl=ACM&amp;coll=GUIDE"
 * target="_blank">Aho-Corasick</a> algorithm is optimized to work with bytes.
 * </p>
 *
 * <p>
 * Example usage:
 * </p>
 *
 * <pre>{@code
 * AhoCorasick<String> tree = new AhoCorasick<String>();
 * tree.add("hello".getBytes(), "hello");
 * tree.add("world".getBytes(), "world");
 * tree.prepare();
 *
 * Iterator<SearchResult<String>> searcher = tree.search("hello world".getBytes());
 * while (searcher.hasNext()) {
 *   SearchResult<String> result = searcher.next();
 *   System.out.println(result.getOutputs());
 *   System.out.println("Found at index: " + result.getLastIndex());
 * }
 * }</pre>
 *
 * <p>
 * Lifecycle: keywords are registered with {@link #add(byte[], Object)}, then
 * {@link #prepare()} is called once, after which {@link #search(byte[])} may
 * be used. Adding keywords after {@code prepare()} or searching before it
 * throws {@link IllegalStateException}.
 * </p>
 *
 * <h2>Recent changes</h2>
 * <ul>
 *
 * <li>Per user request from Carsten Kruege, I've changed the signature of
 * State.getOutputs() and SearchResults.getOutputs() to Sets rather than Lists.</li>
 *
 * </ul>
 *
 * jon: tweaked to be type generic
 *
 * @param <T> the type of the output values associated with keywords
 */
public class AhoCorasick<T> {

  /** Number of distinct byte values; every state can have at most this many out-edges. */
  private static final int BYTE_VALUES = 256;

  /** Root state of the keyword trie; never reassigned. */
  private final State<T> root;

  /** Set to true by {@link #prepare()}; gates {@link #add} and searching. */
  private boolean prepared;

  /** Creates an empty automaton with no keywords, not yet prepared. */
  public AhoCorasick() {
    this.root = new State<T>(0);
    this.prepared = false;
  }

  /**
   * Adds a new keyword with the given output. During search, if the keyword is
   * matched, output will be one of the yielded elements in
   * SearchResults.getOutputs().
   *
   * @param keyword the bytes of the keyword to recognize
   * @param output the value to report when the keyword is matched
   * @throws IllegalStateException if {@link #prepare()} has already been called
   */
  public void add(byte[] keyword, T output) {
    if (this.prepared) {
      throw new IllegalStateException(
          "can't add keywords after prepare() is called");
    }
    State<T> lastState = this.root.extendAll(keyword);
    lastState.addOutput(output);
  }

  /**
   * Prepares the automaton for searching. This must be called before any
   * searching(). Calling it again once the automaton is prepared does nothing.
   */
  public void prepare() {
    // Building the fail transitions a second time is not safe: prepareRoot()
    // has already installed root self-loops, which the seeding loop would then
    // mistake for real children of the root and overwrite their fail links.
    if (this.prepared) {
      return;
    }
    this.prepareFailTransitions();
    this.prepared = true;
  }

  /**
   * Starts a new search, and returns an Iterator of SearchResults.
   *
   * @param bytes the input to scan for keywords
   * @return an iterator positioned at the first match found in {@code bytes}
   * @throws IllegalStateException if {@link #prepare()} has not been called
   */
  public Iterator<SearchResult<T>> search(byte[] bytes) {
    return new Searcher<T>(this, this.startSearch(bytes));
  }

  /**
   * DANGER DANGER: dense algorithm code ahead. Very order dependent.
   * Initializes the fail transitions of all states except for the root.
   */
  private void prepareFailTransitions() {
    Queue<T> q = new Queue<T>();

    // Seed the queue with the root's real children. This has to happen
    // before prepareRoot(), while a null transition still means "no child".
    for (int i = 0; i < BYTE_VALUES; i++) {
      State<T> child = this.root.get((byte) i);
      if (child != null) {
        child.setFail(this.root);
        q.add(child);
      }
    }

    // From here on the root has a transition for every byte, so the
    // fail-chain walks below always terminate at the root at the latest.
    this.prepareRoot();

    while (!q.isEmpty()) {
      State<T> state = q.pop();
      byte[] keys = state.keys();
      for (int i = 0; i < keys.length; i++) {
        byte a = keys[i];
        State<T> child = state.get(a);
        q.add(child);

        // Follow the parent's fail chain to the first state that has a
        // transition on the same byte.
        State<T> fallback = state.getFail();
        while (fallback.get(a) == null) {
          fallback = fallback.getFail();
        }

        State<T> failTarget = fallback.get(a);
        child.setFail(failTarget);
        // Reaching child also means every keyword ending at its fail target
        // has been matched, so inherit those outputs.
        child.getOutputs().addAll(failTarget.getOutputs());
      }
    }
  }

  /**
   * Sets all the out transitions of the root to itself, if no transition yet
   * exists at this point.
   */
  private void prepareRoot() {
    for (int i = 0; i < BYTE_VALUES; i++) {
      byte b = (byte) i;
      if (this.root.get(b) == null) {
        this.root.put(b, this.root);
      }
    }
  }

  /**
   * Returns the root of the tree. Package protected, since the user probably
   * shouldn't touch this.
   *
   * @return the root state
   */
  State<T> getRoot() {
    return this.root;
  }

  /**
   * Begins a new search using the raw interface. Package protected.
   *
   * @param bytes the input to scan
   * @return the first match in {@code bytes}, or {@code null} if there is none
   * @throws IllegalStateException if {@link #prepare()} has not been called
   */
  SearchResult<T> startSearch(byte[] bytes) {
    if (!this.prepared) {
      throw new IllegalStateException("can't start search until prepare()");
    }
    return continueSearch(new SearchResult<T>(this.root, bytes, 0));
  }

  /**
   * Continues the search, given the initial state described by the lastResult.
   * Package protected.
   *
   * @param lastResult the previous result; scanning resumes from its state and
   *        index
   * @return the next result whose state has at least one output, carrying the
   *         index just past the byte that completed the match, or {@code null}
   *         once the input is exhausted
   */
  SearchResult<T> continueSearch(SearchResult<T> lastResult) {
    byte[] bytes = lastResult.bytes;
    State<T> state = lastResult.lastMatchedState;
    for (int i = lastResult.lastIndex; i < bytes.length; i++) {
      byte b = bytes[i];
      // Fall back along fail links until some state accepts this byte.
      while (state.get(b) == null) {
        state = state.getFail();
      }
      state = state.get(b);
      if (!state.getOutputs().isEmpty()) {
        return new SearchResult<T>(state, bytes, i + 1);
      }
    }
    return null;
  }
}