package org.xbib.elasticsearch.common.fsa;
/**
 * Matching and perfect-hash lookups performed by walking the arcs of a single
 * {@link FSA}.
 */
public final class FSATraversal {
    /**
     * Target automaton.
     */
    private final FSA fsa;

    /**
     * Creates a traversal helper bound to the given automaton.
     *
     * @param fsa the automaton that every operation of this instance walks
     */
    public FSATraversal(FSA fsa) {
        this.fsa = fsa;
    }

    /**
     * Calculate perfect hash for a given input sequence of bytes. The perfect hash requires
     * that {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential
     * order of input sequences used at automaton construction time.
     *
     * <p>While following the input, every arc that is scanned before the matching arc of a node
     * adds to the hash: one if that arc is final, plus the right-language count of its end node
     * unless the arc is terminal. A matching final arc that is passed while input still remains
     * adds one as well.
     *
     * @param sequence Array holding the bytes to look up.
     * @param start    Start index in the sequence array.
     * @param length   Length of the byte sequence, must be at least 1.
     * @param node     Node at which the walk starts.
     * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting
     * the number of that sequence in the input used to build the automaton). Returns a negative
     * integer if the input sequence was not part of the input from which the automaton was created.
     * The type of mismatch is a constant defined in {@link MatchResult}
     * ({@link MatchResult#NO_MATCH}, {@link MatchResult#AUTOMATON_HAS_PREFIX} or
     * {@link MatchResult#SEQUENCE_IS_A_PREFIX}).
     * @throws IllegalArgumentException if the automaton's flags do not contain
     *                                  {@link FSAFlags#NUMBERS}, or if {@code length} is zero
     *                                  or negative.
     */
    public int perfectHash(byte[] sequence, int start, int length, int node) {
        if (!fsa.getFlags().contains(FSAFlags.NUMBERS)) {
            throw new IllegalArgumentException("FSA not built with NUMBERS option.");
        }
        // Negative lengths are rejected as well as zero: 'end' below would lie before 'start',
        // the walk could never reach it, and it would either return a meaningless code or
        // read past the end of the array.
        if (length <= 0) {
            throw new IllegalArgumentException("must be a non-empty sequence");
        }

        int hash = 0;
        // Index of the last byte of the input (inclusive).
        final int end = start + length - 1;
        int seqIndex = start;
        byte label = sequence[seqIndex];

        // Seek through the current node's labels, looking for 'label', update hash.
        for (int arc = fsa.getFirstArc(node); arc != 0; ) {
            if (fsa.getArcLabel(arc) != label) {
                // Arc skipped over: account for it in the hash, then try the node's next arc.
                if (fsa.isArcFinal(arc)) {
                    hash++;
                }
                if (!fsa.isArcTerminal(arc)) {
                    hash += fsa.getRightLanguageCount(fsa.getEndNode(arc));
                }
                arc = fsa.getNextArc(arc);
                continue;
            }

            if (fsa.isArcFinal(arc)) {
                if (seqIndex == end) {
                    // Last input byte consumed on a final arc: the hash is complete.
                    return hash;
                }
                // Input continues past a final arc: that arc contributes one to the hash.
                hash++;
            }
            if (fsa.isArcTerminal(arc)) {
                /* The automaton contains a prefix of the input sequence. */
                return MatchResult.AUTOMATON_HAS_PREFIX;
            }
            // The sequence is a prefix of one of the sequences stored in the automaton.
            if (seqIndex == end) {
                return MatchResult.SEQUENCE_IS_A_PREFIX;
            }

            // Make a transition along the arc, go the target node's first arc.
            arc = fsa.getFirstArc(fsa.getEndNode(arc));
            label = sequence[++seqIndex];
        }

        // Labels of this node ended without a match on the sequence.
        // Perfect hash does not exist.
        return MatchResult.NO_MATCH;
    }

    /**
     * Same as {@link #match(byte[], int, int, int)}, but allows passing
     * a reusable {@link MatchResult} object so that no intermediate garbage is
     * produced.
     *
     * <p>The passed object is reset with one of the following, in the order
     * (kind, index, node):
     * <ul>
     * <li>{@link MatchResult#NO_MATCH}, {@code start}, {@code n} when {@code n} is 0;</li>
     * <li>{@link MatchResult#NO_MATCH}, the index of the first byte that has no arc, and the
     * node at which the lookup failed;</li>
     * <li>{@link MatchResult#EXACT_MATCH}, the index of the last input byte, and the node
     * whose arc consumed that byte, when that arc is final;</li>
     * <li>{@link MatchResult#AUTOMATON_HAS_PREFIX}, the index just past the byte consumed by
     * a terminal arc, and 0;</li>
     * <li>{@link MatchResult#SEQUENCE_IS_A_PREFIX}, 0, and the node reached after all input
     * bytes were consumed.</li>
     * </ul>
     *
     * @param result   Object to reset and return.
     * @param sequence An array of labels to follow in the FSA.
     * @param start    Starting index in <code>sequence</code>.
     * @param length   How many symbols to consider from <code>sequence</code>.
     * @param n        Start node identifier in the FSA.
     * @return The same object as <code>result</code>, but with reset internal
     * type and other fields.
     */
    public MatchResult match(MatchResult result, byte[] sequence, int start, int length, int n) {
        int node = n;
        if (node == 0) {
            result.reset(MatchResult.NO_MATCH, start, node);
            return result;
        }

        final int end = start + length;
        for (int i = start; i < end; i++) {
            final int arc = fsa.getArc(node, sequence[i]);
            if (arc == 0) {
                // No arc leaves the current node with this label.
                result.reset(MatchResult.NO_MATCH, i, node);
                return result;
            }
            if (fsa.isArcFinal(arc) && i + 1 == end) {
                /* The automaton has an exact match of the input sequence. */
                result.reset(MatchResult.EXACT_MATCH, i, node);
                return result;
            }
            if (fsa.isArcTerminal(arc)) {
                /* The automaton contains a prefix of the input sequence. */
                result.reset(MatchResult.AUTOMATON_HAS_PREFIX, i + 1, 0);
                return result;
            }
            // Make a transition along the arc.
            node = fsa.getEndNode(arc);
        }

        /* The sequence is a prefix of at least one sequence in the automaton. */
        result.reset(MatchResult.SEQUENCE_IS_A_PREFIX, 0, node);
        return result;
    }

    /**
     * Finds a matching path in the dictionary for a given sequence of labels
     * from <code>sequence</code> and starting at node <code>node</code>.
     *
     * @param sequence An array of labels to follow in the FSA.
     * @param start    Starting index in <code>sequence</code>.
     * @param length   How many symbols to consider from <code>sequence</code>?
     * @param node     Start node identifier in the FSA.
     * @return A newly allocated match result.
     */
    public MatchResult match(byte[] sequence, int start, int length, int node) {
        return match(new MatchResult(), sequence, start, length, node);
    }

    /**
     * Matches the entire <code>sequence</code> array, starting at node <code>node</code>.
     *
     * @param sequence An array of labels to follow in the FSA; all of it is considered.
     * @param node     Start node identifier in the FSA.
     * @return A newly allocated match result.
     * @see #match(byte[], int, int, int)
     */
    public MatchResult match(byte[] sequence, int node) {
        return match(sequence, 0, sequence.length, node);
    }

    /**
     * Matches the entire <code>sequence</code> array, starting at the automaton's root node.
     *
     * @param sequence An array of labels to follow in the FSA; all of it is considered.
     * @return A newly allocated match result.
     * @see #match(byte[], int)
     */
    public MatchResult match(byte[] sequence) {
        return match(sequence, fsa.getRootNode());
    }
}