/*
* Copyright 2013, 2014 Deutsche Nationalbibliothek
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.culturegraph.mf.commons.tries;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Queue;
/**
* Implementation of the Aho-Corasick algorithm
*
* @author Markus Michael Geipel
*
* @param <T>
* type of value stored
*/
public final class SetMatcher<T> {
private final ACNode<T> root = new ACNode<T>(null, 0);
private boolean isPrepared;
public void put(final String key, final T value) {
if (isPrepared) {
throw new IllegalStateException("keys cannot be added during matching.");
}
final int length = key.length();
ACNode<T> node = root;
ACNode<T> next;
for (int i = 0; i < length - 1; ++i) {
next = node.getNext(key.charAt(i));
if (next == null) {
next = node.addNext(key.charAt(i));
}
node = next;
}
next = node.getNext(key.charAt(length - 1));
if (next == null) {
next = node.addNext(key.charAt(length - 1), value);
} else if (next.getValue() == null) {
next.setValue(value);
} else {
throw new IllegalStateException("Key '" + key + "' already in trie");
}
}
public List<Match<T>> match(final String text) {
if (!isPrepared) {
prepare();
isPrepared = true;
}
final List<Match<T>> matches = new ArrayList<Match<T>>();
ACNode<T> node = root;
final int length = text.length();
int index = 0;
while (index < length) {
final ACNode<T> next = node.getNext(text.charAt(index));
if (next != null) {
node = next;
} else if (node != root) {
node = node.getFailure();
continue;
}
++index;
collectMatches(node, index, matches);
}
return matches;
}
private void collectMatches(final ACNode<T> node, final int index, final List<Match<T>> matches) {
//direct hit or hit in chain of failure links?
ACNode<T> tempNode = node;
do{
if (tempNode.getValue() != null) {
matches.add(new Match<T>(tempNode.getValue(), index - tempNode.getDepth(), tempNode.getDepth()));
}
tempNode = tempNode.getFailure();
}while (tempNode != root);
}
private void prepare() {
final Queue<ACNode<T>> queue = new LinkedList<ACNode<T>>();
// prepare root
root.setFailure(root);
for (ACNode<T> child : root.getNext()) {
child.setFailure(root);
queue.add(child);
}
// prepare rest
while (!queue.isEmpty()) {
final ACNode<T> parent = queue.poll();
final ACNode<T> parentFailure = parent.getFailure();
for (Entry<Character, ACNode<T>> link : parent.getLinks()) {
final char key = link.getKey().charValue();
final ACNode<T> child = link.getValue();
ACNode<T> node = parentFailure;
while (node.getNext(key) == null && node != root) {
node = node.getFailure();
}
if (node.getNext(key) == null) {
child.setFailure(root);
} else {
child.setFailure(node.getNext(key));
}
queue.add(child);
}
}
}
/**
* prints dot description of the automaton to out for visualization in GraphViz. Used for debugging and education.
*
* @param out
*/
public void printAutomaton(final PrintStream out) {
out.println("digraph ahocorasick {");
printDebug(out, root);
out.println("}");
}
private void printDebug(final PrintStream out, final ACNode<T> node) {
if (node.getValue() == null) {
out.println(node.hashCode() + " [shape=point label=\"\"]");
} else {
out.println(node.hashCode() + " [shape=circle style=filled label=\"\"]");
}
if (node.getFailure() != root) {
out.println(node.hashCode() + " -> " + node.getFailure().hashCode() + "[color=gray]");
}
for (Entry<Character, ACNode<T>> link : node.getLinks()) {
out.println(node.hashCode() + " -> " + link.getValue().hashCode() + " [label=\"" + link.getKey() + "\"]");
printDebug(out, link.getValue());
}
}
/**
*
* @param <T>
*/
public static final class Match<T> {
private final T value;
private final int start;
private final int length;
public Match(final T value, final int start, final int length) {
super();
this.value = value;
this.start = start;
this.length = length;
}
public T getValue() {
return value;
}
public int getStart() {
return start;
}
public int getLength() {
return length;
}
@Override
public String toString() {
return value + " " + start + "+" + length;
}
}
}