/* Created on Dec 26, 2012 by Florian Leitner.
* Copyright 2012. All rights reserved. */
package com.tuplejump.stargate.lucene.query.fsm;
import java.util.*;
/**
* An engine that performs match operation on a sequence of generic elements <code>E</code> by
* interpreting a {@link Pattern} (analogous to Java's {@link java.util.regex.Matcher}).
*
* A matcher is created from a pattern by invoking the pattern's {@link Pattern#matcher(List)
* matcher} method. Once created, a matcher can be used to perform different kinds of match
* operations:
* <ol>
* <li>The {@link Matcher#matches matches} method attempts to match the entire input sequence
* against the pattern.</li>
* <li>The {@link Matcher#find() find} method scans the input sequence, looking for the next
* subsequence that matches the pattern.</li>
* <li>The {@link Matcher#lookingAt lookingAt} method attempts to match the input sequence,
* starting at the beginning, against the pattern.</li>
* </ol>
* Each of these methods returns a Boolean value indicating success or failure. More information
* about a successful match can be obtained by querying the state of the matcher.
*
* The explicit state of a matcher includes the start and end indices of the most recent successful
* match. It also includes the start and end indices of the input subsequence captured by each
* capturing group in the pattern as well as a total count of such subsequences. As a convenience,
* methods are also provided for returning these captured subsequences.
*
* A few convenience methods present in Java's {@link java.util.regex.Matcher} are not implemented,
* particularly <code>appendReplacement</code>, <code>appendTail</code>, and
* <code>replaceAll</code>.
*
* Greedy vs. non-greedy behavior of the quantifiers can be modified by changing the
* {@link #greedy} flag (default: non-greedy matching).
*
* This class is <i>not</i> <b>thread-safe</b>.
*
* @author Florian Leitner
*/
public final class Matcher<E> {
  /** Shared, immutable placeholder used whenever no capture group offsets are available. */
  private static final int[][] NO_GROUPS = new int[0][];

  /** Entry state of the pattern's state machine. */
  final Node<E> entry;
  /** Exit state of the pattern's state machine. */
  final Node<E> exit;
  private List<E> seq;
  private int len; // length of the previous match (-1 if the previous match attempt failed)
  private int idx; // offset of the previous match (-1 if no previous match attempt was made)
  // capture group offsets (int[][2] arrays); never null, so groupCount() is always safe to call
  private int[][] captureGroups = NO_GROUPS;
  private BFSQueue<E> queue;
  /**
   * A flag indicating whether quantifiers should behave greedily or not (the default).
   */
  public boolean greedy = false;

  /**
   * Creates a new Matcher object.
   *
   * @param entry pattern state
   * @param exit pattern state
   * @param sequence to match
   */
  Matcher(Node<E> entry, Node<E> exit, List<E> sequence) {
    this.entry = entry;
    this.exit = exit;
    reset(sequence);
  }

  /**
   * Returns the pattern that is interpreted by this matcher.
   *
   * @return a new pattern object wrapping this matcher's entry and exit states
   */
  public Pattern<E> pattern() {
    return new Pattern<E>(entry, exit);
  }

  /**
   * Attempts to find the next subsequence of the input sequence that matches the pattern.
   *
   * This method starts at the beginning of the input sequence or, if a previous invocation of the
   * method was successful and the matcher has not since been {@link #reset}, at the first
   * element not matched by the previous match. If the previous match was empty, the search
   * resumes one element further on, so that repeated invocations always make progress (the same
   * convention as {@link java.util.regex.Matcher#find()}).
   *
   * Once a match attempt has failed, this method keeps returning <code>false</code> until the
   * matcher is reset or another match operation succeeds.
   *
   * If the match succeeds, more information can be obtained via the {@link #start}, {@link #end},
   * and {@link #group} methods.
   *
   * @return <code>true</code> if another matching subsequence was found
   */
  public boolean find() {
    // a failed previous attempt means there is nothing left to find
    if (len == -1) return false;
    // after a reset idx is -1 and len is 1, so the first search starts at offset 0
    int from = idx + len;
    // an empty previous match must not be reported again: step past it
    if (len == 0 && idx != -1) from++;
    return search(from);
  }

  /**
   * Resets this matcher and then attempts to find the next subsequence of the input sequence that
   * matches the pattern, starting at the specified index.
   *
   * If the match succeeds, more information can be obtained via the {@link #start}, {@link #end},
   * and {@link #group} methods.
   *
   * @param start offset in the input sequence at which to begin searching
   * @return <code>true</code> if a matching subsequence was found at or after <code>start</code>
   * @throws IndexOutOfBoundsException if start is less than zero or greater than the length of the
   * input sequence
   */
  public boolean find(int start) {
    // validate before touching any state so a bad argument leaves the matcher unchanged
    if (start < 0 || start > seq.size())
      throw new IndexOutOfBoundsException("start index " + start
          + " is outside the sequence (length " + seq.size() + ")");
    return search(start);
  }

  /**
   * Attempts to match the input sequence, starting at the beginning, against the pattern.
   *
   * Like the {@link #matches} method, this method always starts at the beginning of the input
   * sequence; unlike that method, it does not require that the entire input sequence be matched.
   *
   * If the match succeeds, more information can be obtained via the {@link #start}, {@link #end},
   * and {@link #group} methods.
   *
   * @return <code>true</code> if a prefix of the input sequence matches the pattern
   */
  public boolean lookingAt() {
    idx = 0;
    len = match(false);
    return len != -1;
  }

  /**
   * Return <code>true</code> if the whole (entire) sequence matches.
   *
   * Independent of the {@link #greedy} flag, only paths through the pattern that consume the
   * complete input sequence are accepted; a shorter matching prefix does not end the search.
   *
   * If the match succeeds, more information can be obtained via the {@link #start}, {@link #end},
   * and {@link #group} methods.
   *
   * @return <code>true</code> if the pattern matches the input sequence from its first to its
   * last element
   */
  public boolean matches() {
    idx = 0;
    len = match(true);
    if (len == seq.size()) return true;
    len = -1; // set the flag indicating that this previous match failed
    return false;
  }

  /**
   * Return the subsequence matched by the previous match.
   *
   * For a matcher <code>m</code> with input sequence <code>s</code>, the expressions
   * <code>m.group()</code> and <code>s.subList(m.start(),
   * m.end())</code> are equivalent.
   *
   * Don't forget that the result could be an empty list for particular patterns.
   *
   * @return The subsequence matched by the previous match
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   */
  public List<E> group() {
    if (noMatch()) throw new IllegalStateException("no previous match");
    return seq.subList(idx, idx + len);
  }

  /**
   * Returns the input subsequence captured by the given group during the previous match operation.
   *
   * Capturing groups are indexed from left to right, starting at one. Group zero denotes the
   * entire pattern, so the expression <code>m.{@link #group(int) group(0)}</code> is equivalent to
   * <code>m.{@link #group()}</code>.
   *
   * @param group index of a capturing group in this matcher's pattern
   * @return the subsequence captured by the given group
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   * @throws IndexOutOfBoundsException if there is no capturing group in the pattern with the given
   * index
   */
  public List<E> group(int group) {
    if (group == 0) return group();
    if (noMatch()) throw new IllegalStateException("no previous match");
    int[] offsets = captureGroups[group - 1];
    return seq.subList(offsets[0], offsets[1]);
  }

  /**
   * Returns the number of <b>capturing</b> groups recorded by the most recent match attempt.
   *
   * The count is zero before the first match attempt, after a {@link #reset}, and after a failed
   * attempt.
   *
   * @return the number of <b>capturing</b> groups recorded by the most recent match attempt
   */
  public int groupCount() {
    return captureGroups.length;
  }

  /**
   * Returns an array of group offsets (start, end), including the entire match group.
   *
   * The first two integers are the entire match' offsets, each following pair are for each group.
   * I.e., the length of the resulting array will always be even.
   *
   * @return all group offset pairs (start, end)
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   */
  public int[] groups() {
    if (noMatch()) throw new IllegalStateException("no previous match");
    int[] groups = new int[2 + captureGroups.length * 2];
    // slot 0 holds the entire match, slots 1..n the capture groups
    for (int g = 0; g <= captureGroups.length; g++) {
      groups[g * 2] = start(g);
      groups[g * 2 + 1] = end(g);
    }
    return groups;
  }

  /**
   * Returns the start index of last match.
   *
   * @return the start index of last match.
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   */
  public int start() {
    if (noMatch()) throw new IllegalStateException("no previous match");
    return idx;
  }

  /**
   * Returns the end index of last match.
   *
   * @return the end index of last match.
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   */
  public int end() {
    if (noMatch()) throw new IllegalStateException("no previous match");
    return idx + len;
  }

  /**
   * Returns the start index of the subsequence captured by the given group during the previous
   * match operation.
   *
   * Capturing groups are indexed from left to right, starting at one. Group zero denotes the
   * entire pattern, so the expression <code>m.{@link #start(int) start(0)}</code> is equivalent to
   * <code>m.{@link #start()}</code>.
   *
   * @param group index of a capturing group in this matcher's pattern
   * @return the start index of the subsequence captured by the given group
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   * @throws IndexOutOfBoundsException if there is no capturing group in the pattern with the given
   * index
   */
  public int start(int group) {
    if (group == 0) return start();
    if (noMatch()) throw new IllegalStateException("no previous match");
    return captureGroups[group - 1][0];
  }

  /**
   * Returns the end index of the subsequence captured by the given group during the previous match
   * operation.
   *
   * Capturing groups are indexed from left to right, starting at one. Group zero denotes the
   * entire pattern, so the expression <code>m.{@link #end(int) end(0)}</code> is equivalent to
   * <code>m.{@link #end()}</code>.
   *
   * @param group index of a capturing group in this matcher's pattern
   * @return the end index of the subsequence captured by the given group
   * @throws IllegalStateException if no match has yet been attempted, or if the previous match
   * operation failed
   * @throws IndexOutOfBoundsException if there is no capturing group in the pattern with the given
   * index
   */
  public int end(int group) {
    if (group == 0) return end();
    if (noMatch()) throw new IllegalStateException("no previous match");
    return captureGroups[group - 1][1];
  }

  /**
   * Resets this matcher, returning itself.
   *
   * @return this matcher
   */
  public Matcher<E> reset() {
    // idx + len == 0, so the next find() starts searching at the first element
    idx = -1;
    len = 1;
    captureGroups = NO_GROUPS; // drop offsets of any earlier match
    return this;
  }

  /**
   * Resets this matcher with a new sequence, returning itself.
   *
   * The matcher works on its own copy of the given list.
   *
   * @param input the new sequence to match
   * @return this matcher
   */
  public Matcher<E> reset(List<E> input) {
    seq = new ArrayList<E>(input);
    return reset();
  }

  /**
   * Check if there is no usable previous match, i.e., either no attempt was made yet or the last
   * attempt failed.
   */
  private boolean noMatch() {
    return (len == -1 || idx == -1);
  }

  /**
   * Scan forward from the given offset until the pattern matches or the input is exhausted.
   *
   * On success {@link #idx} and {@link #len} describe the match; on failure {@link #len} is set
   * to <code>-1</code>. The position just past the last element is tried as well, because a
   * pattern may match the empty subsequence there.
   *
   * @param from first offset to try
   * @return <code>true</code> if a match was found
   */
  private boolean search(int from) {
    int max = seq.size();
    for (idx = from; idx <= max; idx++) {
      len = match(false);
      if (len != -1) return true;
    }
    len = -1;
    return false;
  }

  /**
   * Breadth-first search of a match for the pattern in the input sequence at the current
   * {@link #idx index}.
   *
   * @param wholeSequence if <code>true</code>, a final state only counts as a match once the
   * entire input sequence has been consumed
   * @return the match length or <code>-1</code> if no match was made
   */
  private int match(boolean wholeSequence) {
    final int size = seq.size();
    if (idx > size) throw new IndexOutOfBoundsException("offset exceeds sequence length");
    captureGroups = NO_GROUPS; // reset capture groups
    // (capture groups will be built from the backtrace of the queue)
    // a "match anything" pattern; in whole-sequence mode only if nothing is left to consume
    if (entry.isFinal() && (!wholeSequence || idx == size)) return 0;
    queue = new BFSQueue<E>(idx, entry); // start a new tracer queue
    QueueItem<Node<E>> best = null; // accepting item of the best match seen so far
    int length = -1; // length of the best match seen so far
    // search for an accept state on the queue while there are items in it
    while (!queue.isEmpty()) {
      QueueItem<Node<E>> item = queue.remove();
      int offset = item.index(); // position of the state machine in the sequence
      Node<E> node = item.get(); // the currently processed state
      if (node.isFinal() && (!wholeSequence || offset == size)) {
        int matched = offset - idx;
        // never let a shorter accepting path replace a longer one found earlier
        if (matched >= length) {
          length = matched;
          best = item;
        }
        if (!greedy) break; // only keep looking in greedy mode
      } else if (offset < size) {
        E element = seq.get(offset); // the item in the sequence at the relevant index
        for (Transition<E> t : node.transitions.keySet()) {
          if (t.matches(element)) {
            t.onMatch(element);
            // add the result states of matching transitions (if they have not been added yet)
            queue.addTransistions(offset + 1, item, node.transitions.get(t), t.weight());
          }
        }
      }
      // epsilon transitions consume no element, so the offset stays the same
      if (node.epsilonTransitions.size() > 0)
        queue.addTransistions(offset, item, node.epsilonTransitions, 0.0);
    }
    // backtrack captured groups
    if (best != null) setCaptureGroups(best);
    return length;
  }

  /**
   * Derive the capture group offsets from the path that led to the given accepting queue item.
   *
   * The path is obtained from the queue's backtracking. For each state flagged as a capture start
   * the first offset on the path is used, for each state flagged as a capture end the last one.
   * Groups are numbered in the order in which their start markers occur along the path.
   *
   * @param item final queue item from where to begin the backtracking
   */
  private void setCaptureGroups(QueueItem<Node<E>> item) {
    List<QueueItem<Node<E>>> path = queue.backtrack(item);
    // collect one marker {offset, path position, 1 = start / 0 = end} per capture state
    Map<Node<E>, int[]> starts = new HashMap<Node<E>, int[]>();
    Map<Node<E>, int[]> ends = new HashMap<Node<E>, int[]>();
    int step = 0;
    for (QueueItem<Node<E>> qi : path) {
      Node<E> state = qi.get();
      // collect the minimum (i.e., first) recorded offset for a captureStart state
      if (state.captureStart && !starts.containsKey(state))
        starts.put(state, new int[] {qi.index(), step, 1});
      // collect the maximum (i.e., last) recorded offset for a captureEnd state
      if (state.captureEnd) ends.put(state, new int[] {qi.index(), step, 0});
      step++;
    }
    int numGroups = starts.size();
    if (numGroups == 0) return;
    // NOTE(review): the sizing below assumes the path holds exactly one end marker per start
    // marker - confirm that the pattern construction guarantees this
    int[][] positions = new int[numGroups * 2][];
    int n = 0;
    for (int[] s : starts.values())
      positions[n++] = s;
    for (int[] e : ends.values())
      positions[n++] = e;
    // sort all start and end offsets by their positions, then by the order they were matched,
    // and last order start AFTER end positions, leaving minimal space for any ambiguity
    Arrays.sort(positions, new Comparator<int[]>() {
      @Override
      public int compare(int[] a, int[] b) {
        if (a[0] != b[0]) return a[0] - b[0];
        if (a[1] != b[1]) return a[1] - b[1];
        return a[2] - b[2];
      }
    });
    // populate the captureGroups offset array using the ordered positions; the deque holds the
    // indices of groups that were opened but not yet closed, innermost on top
    Deque<Integer> open = new ArrayDeque<Integer>();
    captureGroups = new int[numGroups][];
    int next = 0;
    for (int[] p : positions) {
      if (p[2] == 1) {
        open.push(next);
        captureGroups[next++] = new int[] {p[0], -1};
      } else {
        captureGroups[open.pop()][1] = p[0];
      }
    }
  }
}