/* Created on Dec 21, 2012 by Florian Leitner.
* Copyright 2012. All rights reserved. */
package com.tuplejump.stargate.lucene.query.fsm;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
/**
* A <i>generic</i>, <b>NFA-based pattern matching</b> implementation using <i>weighted
* backtracking</i> to provide capture groups.
* <p>
* This class provides methods to compile a non-deterministic state machine. In addition to a
* pattern parser/compiler, the {@link Transition} interface has to be implemented, defining how
* elements on the sequence should be matched and which weight the resulting transition should
* have.
* <p>
* The entire API for this generic NFA is designed as close as possible to Java's own
* {@link java.util.regex.Pattern} API. It is incomplete, because while the class is usable, it
* provides no static <code>compile(String regex)</code> method or any other predetermined
* mechanism of assembling the FSM. Therefore, some procedure of compiling the NFA needs to be
* implemented, such as a regular expression language based on a context free grammar. For this
* reason, the {@link #toString()} method that would convert the Pattern back to a String of the
* regular language only produces a directed, acyclic graph of the state-transitions (i.e., cyclic
* dependencies are not represented) that is useful to debug the state machine.
* <p>
* <b>Compiling a Pattern</b>
* <p>
* Implementations of this generic NFA package could, for example, inherit this class and implement
* a method such as {@code static Pattern<E> compile(String)} that parses and compiles
* the NFA from some regular language. The states should be compiled using the default constructors
* or the static methods {@link Pattern#match(Transition) match} (a single transition), and then
* joined with {@link Pattern#chain(Pattern, Pattern) chain} ("AND", "", i.e., chain a sequence of
* transitions) or {@link Pattern#branch(Pattern, Pattern) branch} ("OR", "|", branch out into
* several possible transitions) operations. To declare a particular sub-pattern as a capture
* group, apply the static {@link Pattern#capture(Pattern) capture} method on it. Any (sub-)
* pattern's behavior can be augmented by making it {@link #optional() optional} ( <code>?</code> )
* and/or by allowing it to {@link Pattern#repeat() repeat} ( <code>+</code> ; a pattern that is
* made both optional and repeated effectively acts as a full Kleene closure ( <code>*</code> )).
* Unless there are reasons not to, the last step of compiling a pattern should be to call
* {@link Pattern#minimize()} on itself, thereby removing states with epsilon transitions and no
* other pattern semantics (essentially, removing artifacts created during the compilation).
* <p>
* A few convenience methods present in {@link java.util.regex.Pattern Java's Pattern API} are not
* implemented, particularly the <code>split</code> methods.
*
* @author Florian Leitner
*/
public class Pattern<E> {
  /** State at which matching starts; {@link #minimize()} may advance it past trivial states. */
  private Node<E> entry;
  /** State at which matching ends; made final when the pattern is constructed. */
  private final Node<E> exit;

  /**
   * Create a pattern that matches a single transition.
   *
   * @param t the transition that has to match
   * @return a NFA
   */
  public static final <T> Pattern<T> match(Transition<T> t) {
    Node<T> entry = new Node<T>();
    Node<T> exit = new Node<T>();
    entry.addTransition(t, exit);
    return new Pattern<T>(entry, exit);
  }

  /**
   * Join two successive patterns into one ("AND", "").
   * <p>
   * The exit state of <code>first</code> is made non-final and linked to the entry state of
   * <code>second</code> with an epsilon transition; both argument patterns are modified.
   *
   * @param first pattern to match before second
   * @param second pattern to match after first
   * @return a joined NFA
   */
  public static final <T> Pattern<T> chain(Pattern<T> first, Pattern<T> second) {
    first.exit.makeNonFinal();
    first.exit.addEpsilonTransition(second.entry);
    return new Pattern<T>(first.entry, second.exit);
  }

  /**
   * Fork out into one of two patterns ("OR", "|").
   * <p>
   * A fresh entry state fans out to both alternatives via epsilon transitions and both
   * alternatives' exit states (made non-final) join again in a fresh exit state.
   *
   * @param left optional pattern to match
   * @param right optional pattern to match
   * @return a forked NFA
   */
  public static final <T> Pattern<T> branch(Pattern<T> left, Pattern<T> right) {
    Node<T> entry = new Node<T>();
    Node<T> exit = new Node<T>();
    left.exit.makeNonFinal();
    right.exit.makeNonFinal();
    entry.addEpsilonTransition(left.entry);
    entry.addEpsilonTransition(right.entry);
    left.exit.addEpsilonTransition(exit);
    right.exit.addEpsilonTransition(exit);
    return new Pattern<T>(entry, exit);
  }

  /**
   * Make this pattern capturing, i.e., ensure the sequence offsets matched by it will be recorded
   * as a capture group by the {@link Matcher}.
   *
   * @param pattern to capture
   * @return a NFA; either the given pattern itself (with its entry and exit states flagged) or a
   *         new pattern wrapping it in dedicated capture states
   */
  public static final <T> Pattern<T> capture(Pattern<T> pattern) {
    // note that a state with both the capture start and end flag set will be treated as
    // first ending a group, then starting a new one; therefore, if the pattern's entry and
    // exit states are the same (instance), additional states need to be introduced, otherwise
    // the matcher would try to first end a (not yet started) group and then start a group
    // (that might never end) because the start and end flags would be set on the same state
    boolean canFlagInPlace = !pattern.entry.equals(pattern.exit) && !pattern.entry.captureStart
        && !pattern.exit.captureEnd;
    if (canFlagInPlace) {
      // entry and exit are distinct and not yet used by another group; simple case
      pattern.entry.captureStart = true;
      pattern.exit.captureEnd = true;
      return pattern;
    }
    // wrap the pattern in dedicated capture states
    Node<T> entry = new Node<T>();
    Node<T> exit = new Node<T>();
    entry.captureStart = true;
    exit.captureEnd = true;
    entry.addEpsilonTransition(pattern.entry);
    pattern.exit.addEpsilonTransition(exit);
    pattern.exit.makeNonFinal();
    return new Pattern<T>(entry, exit);
  }

  /**
   * Construct the simplest possible NFA: a two-state automaton joined by an epsilon transition.
   * <p>
   * This pattern will match anything, from the empty sequence ("lambda"), to the infinite one. It
   * provides a perfect "seed" for assembling more complex patterns.
   */
  public Pattern() {
    entry = new Node<E>();
    exit = new Node<E>();
    entry.addEpsilonTransition(exit);
    exit.makeFinal();
  }

  /**
   * Construct an NFA from the given entry and exit states.
   * <p>
   * It is the responsibility of the user to ensure these two states are actually connected. This
   * particular constructor should therefore only be used internally.
   *
   * @param entry state
   * @param exit state
   */
  Pattern(Node<E> entry, Node<E> exit) {
    this.entry = entry;
    this.exit = exit;
    exit.makeFinal(); // ensure at least exit is a final state
  }

  /**
   * A tree-like (multi-line) DAG representation of the NFA's states and transitions for debugging
   * purposes only.
   */
  @Override
  public final String toString() {
    return String.format("Pattern:\n%s", entry.toString());
  }

  /**
   * Augment this pattern to match zero or one iterations of itself, i.e., the pattern may
   * optionally be skipped.
   * <p>
   * It is allowed to use both {@link #optional()} and {@link #repeat()} on the same pattern.
   *
   * @return itself/this pattern
   */
  public final Pattern<E> optional() {
    entry.addEpsilonTransition(exit);
    return this;
  }

  /**
   * Augment this pattern to match one or more repetitions of itself, i.e., the pattern may
   * optionally be matched several times.
   * <p>
   * It is allowed to use both {@link #optional()} and {@link #repeat()} on the same pattern.
   *
   * @return itself/this pattern
   */
  public final Pattern<E> repeat() {
    exit.addEpsilonTransition(entry);
    return this;
  }

  /**
   * Remove states that only have epsilon transitions and instead connect their source and target
   * states directly. Capture states will never be pruned even if they only have outgoing epsilon
   * transitions.
   * <p>
   * This method should be used after the entire pattern has been compiled to reduce the total
   * number of transitions and states in the FSM. Note that it may replace this pattern's entry
   * state and that it rewrites the transition sets of the states in place.
   *
   * @return itself/this pattern
   * @throws RuntimeException if a reachable state has an epsilon transition to itself
   */
  public final Pattern<E> minimize() {
    // remove superfluous entry nodes (single epsilon transitions without any other semantics)
    skipTrivialEntryStates();
    // a map of states with only epsilon transitions and their associated target states
    Map<Node<E>, Set<Node<E>>> invalidStates = new HashMap<Node<E>, Set<Node<E>>>();
    Set<Node<E>> validNodes = new HashSet<Node<E>>(); // states that should not be pruned
    classifyReachableStates(invalidStates, validNodes);
    // find invalid states that map to other invalid states and replace those targets by
    // continuously expanding them until there are only valid targets left
    boolean pruning = true;
    while (pruning) {
      // assume pruning is done at the start of each round; any expansion triggers another round
      pruning = false;
      for (Set<Node<E>> targets : invalidStates.values()) {
        if (replaceAndExpand(invalidStates, targets)) {
          pruning = true;
        }
      }
    }
    // after pruning the pointers, expand all invalid states pointed at by valid ones with their
    // appropriate valid target states
    for (Node<E> valid : validNodes) {
      replaceAndExpand(invalidStates, valid.epsilonTransitions);
      for (Set<Node<E>> targetNodes : valid.transitions.values()) {
        replaceAndExpand(invalidStates, targetNodes);
      }
    }
    return this;
  }

  /**
   * Advance the entry state past states whose only content is a single epsilon transition (not
   * final, not capturing, no regular transitions), stopping at a state that points to itself.
   */
  private void skipTrivialEntryStates() {
    while (entry.transitions.size() == 0 && entry.epsilonTransitions.size() == 1
        && !entry.isFinal() && !entry.isCapturing()) {
      Node<E> next = entry.epsilonTransitions.iterator().next();
      if (next.equals(entry)) {
        break;
      }
      entry = next;
    }
  }

  /**
   * Walk all states reachable from the entry state breadth-first and sort them into prunable
   * ("invalid") and non-prunable ("valid") states.
   * <p>
   * A state is invalid if it is non-final, non-capturing, has no regular transitions and is not
   * the entry state. Every state is queued at most once; without that guarantee a state with
   * several not yet processed predecessors is processed once per predecessor, which multiplies
   * along chains of branches.
   *
   * @param invalidStates receives each invalid state mapped to its (live) epsilon target set
   * @param validNodes receives every state that must be kept
   * @throws RuntimeException if a state has an epsilon transition to itself
   */
  private void classifyReachableStates(Map<Node<E>, Set<Node<E>>> invalidStates,
      Set<Node<E>> validNodes) {
    Queue<Node<E>> queue = new LinkedList<Node<E>>(); // states still to be checked
    Set<Node<E>> seen = new HashSet<Node<E>>(); // states that have been queued already
    seen.add(entry);
    queue.add(entry);
    while (!queue.isEmpty()) {
      Node<E> node = queue.remove();
      boolean prunable = !node.isFinal() && !node.isCapturing() && node.transitions.size() == 0
          && !node.equals(entry);
      if (prunable) {
        // for those invalid states, record their (epsilon transition) targets
        invalidStates.put(node, node.epsilonTransitions);
      } else {
        validNodes.add(node);
      }
      for (Node<E> next : node.epsilonTransitions) {
        if (next.equals(node)) { // safeguard to avoid infinite loops
          throw new RuntimeException("circular reference detected: " + node.toString());
        }
        if (seen.add(next)) {
          queue.add(next);
        }
      }
      for (Set<Node<E>> targets : node.transitions.values()) {
        for (Node<E> next : targets) {
          if (seen.add(next)) {
            queue.add(next);
          }
        }
      }
    }
  }

  /**
   * Expand any invalid states in the given set of states.
   * <p>
   * Each invalid state found in <code>nodes</code> is removed and the states it maps to in
   * <code>expansions</code> are added in its place; <code>nodes</code> is modified directly.
   *
   * @param expansions a mapping of invalid states to their target expansions
   * @param nodes a set of states possibly containing invalid states to be expanded
   * @return <code>true</code> if any expansion was made
   */
  private static final <T> boolean replaceAndExpand(Map<Node<T>, Set<Node<T>>> expansions,
      Set<Node<T>> nodes) {
    Set<Node<T>> expansion = null; // be lazy - only instantiate this set if necessary
    Iterator<Node<T>> iter = nodes.iterator();
    while (iter.hasNext()) {
      Node<T> state = iter.next();
      if (expansions.containsKey(state)) {
        // the state is invalid: replace and expand with that state's expansions
        iter.remove();
        if (expansion == null) {
          expansion = new HashSet<Node<T>>();
        }
        expansion.addAll(expansions.get(state));
      }
    }
    if (expansion == null) {
      return false;
    }
    // add the collected replacements only after iteration, so the iterator stays valid
    nodes.addAll(expansion);
    return true;
  }

  /**
   * Creates a matcher that will match the input sequence against this pattern.
   *
   * @param input sequence to be matched
   * @return a new matcher for this pattern
   */
  public final Matcher<E> matcher(List<E> input) {
    return new Matcher<E>(entry, exit, input);
  }
  // XXX: possible future additions to make this class more equal to Java's Pattern API:
  // public final List<E>[] split(List<E> input)
  // public final List<E>[] split(List<E> input, int limit)
}