package org.xbib.elasticsearch.common.fsa;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
/**
* This is an abstract class for handling finite state automata. These
* automata are arc-based, a design described in Jan Daciuk's <i>Incremental
* Construction of Finite-State Automata and Transducers, and Their Use in the
* Natural Language Processing</i> (PhD thesis, Technical University of Gdansk).
* Concrete subclasses (implementations) provide varying tradeoffs and features:
* traversal speed vs. memory size, for example.
*
* @see FSABuilder
*/
public abstract class FSA implements Iterable<ByteBuffer> {

    /**
     * Returns the identifier of the root node of this automaton.
     *
     * @return the root node identifier, or 0 if the start node is also the
     * end node (the automaton is empty)
     */
    public abstract int getRootNode();

    /**
     * Returns the first arc leaving a node.
     *
     * @param node node
     * @return the identifier of the first arc leaving <code>node</code>, or 0
     * if the node has no outgoing arcs
     */
    public abstract int getFirstArc(int node);

    /**
     * Returns the arc that follows the given one among the arcs leaving the
     * same node.
     *
     * @param arc arc
     * @return the identifier of the next arc after <code>arc</code>, or 0 if
     * no more arcs are available for the node
     */
    public abstract int getNextArc(int arc);

    /**
     * Looks up the arc leaving a node under a given label.
     *
     * @param node  node
     * @param label label
     * @return the identifier of the arc leaving <code>node</code> that is
     * labeled with <code>label</code>, or 0 if the node has no such
     * outgoing arc
     */
    public abstract int getArc(int node, byte label);

    /**
     * Returns the label of an arc.
     *
     * @param arc arc
     * @return the label associated with the given <code>arc</code>
     */
    public abstract byte getArcLabel(int arc);

    /**
     * Tells whether an arc is final.
     *
     * @param arc arc
     * @return <code>true</code> if the destination node at the end of this
     * <code>arc</code> corresponds to an input sequence created when
     * building this automaton
     */
    public abstract boolean isArcFinal(int arc);

    /**
     * Tells whether an arc is terminal, that is, whether it has no
     * terminating node. For such arcs {@link #getEndNode(int)} will throw an
     * exception. A terminal arc is always final (see
     * {@link #isArcFinal(int)}).
     *
     * @param arc arc
     * @return <code>true</code> if this <code>arc</code> does not have a
     * terminating node
     */
    public abstract boolean isArcTerminal(int arc);

    /**
     * Returns the end node pointed to by a given <code>arc</code>. Terminal
     * arcs (those that point to a terminal state) have no end node
     * representation and throw a runtime exception.
     *
     * @param arc arc
     * @return the identifier of the node the arc points to
     */
    public abstract int getEndNode(int arc);

    /**
     * Returns the set of flags for this FSA instance.
     *
     * @return set of flags
     */
    public abstract Set<FSAFlags> getFlags();

    /**
     * Writes this automaton to the given stream. The output format is left
     * to the concrete subclass.
     *
     * @param outputStream the stream to write to
     * @throws IOException if writing to the stream fails
     */
    public abstract void write(DataOutputStream outputStream) throws IOException;

    /**
     * Returns the size of the right language of a state: the number of
     * sequences reachable from it. Only available if the automaton was
     * compiled with {@link FSAFlags#NUMBERS}; this base implementation always
     * throws.
     *
     * @param node node
     * @return the number of sequences reachable from the given state
     * @throws UnsupportedOperationException If the automaton was not compiled with
     *                                       {@link FSAFlags#NUMBERS}. The value can then be
     *                                       computed by manually counting the elements of
     *                                       {@link #getSequences(int)}.
     */
    public int getRightLanguageCount(int node) {
        final String message = "Automaton not compiled with " + FSAFlags.NUMBERS;
        throw new UnsupportedOperationException(message);
    }

    /**
     * Returns an iterable over all binary sequences starting at the given FSA
     * state (node) and ending in final nodes. This corresponds to the set of
     * suffixes of a given prefix from all sequences stored in the automaton.
     * <p>
     * The iterator yields a {@link java.nio.ByteBuffer} whose contents change
     * on each call to {@link java.util.Iterator#next()}. To keep the contents
     * between calls to {@link java.util.Iterator#next()}, one must copy the
     * buffer to some other location.
     * <p>
     * Important: it is guaranteed that the returned byte buffer is backed by
     * a byte array and that the content of the byte buffer starts at the
     * array's index 0.
     *
     * @param node node; 0 yields an empty iterable
     * @return iterable of byte buffers
     * @see Iterable
     */
    public Iterable<ByteBuffer> getSequences(final int node) {
        if (node != 0) {
            // A fresh iterator is created for every iterator() call on the result.
            return () -> new FSAFinalStatesIterator(this, node);
        }
        return Collections.emptyList();
    }

    /**
     * Returns all sequences stored in the automaton, starting from the root
     * node. Equivalent to calling {@link #iterator} directly ({@link FSA} is
     * also {@link Iterable}).
     *
     * @return iterable of byte buffers
     */
    public final Iterable<ByteBuffer> getSequences() {
        final int root = getRootNode();
        return getSequences(root);
    }

    /**
     * Returns an iterator over all binary sequences starting from the initial
     * FSA state (node) and ending in final nodes.
     * <p>
     * The iterator yields a {@link java.nio.ByteBuffer} whose contents change
     * on each call to {@link java.util.Iterator#next()}. To keep the contents
     * between calls to {@link java.util.Iterator#next()}, one must copy the
     * buffer to some other location.
     * <p>
     * Important: it is guaranteed that the returned byte buffer is backed by
     * a byte array and that the content of the byte buffer starts at the
     * array's index 0.
     *
     * @return iterator
     */
    @Override
    public final Iterator<ByteBuffer> iterator() {
        final Iterable<ByteBuffer> allSequences = getSequences();
        return allSequences.iterator();
    }

    /**
     * Visits all states. The order of visiting is undefined. This method may
     * be faster than traversing the automaton in post or preorder since it
     * can scan states linearly; this base implementation simply delegates to
     * {@link #visitInPostOrder(StateVisitor)}. Returning false from
     * {@link StateVisitor#accept(int)} immediately terminates the traversal.
     *
     * @param <T> type
     * @param v   the visitor
     * @return the visitor passed in
     */
    public <T extends StateVisitor> T visitAllStates(T v) {
        return visitInPostOrder(v);
    }

    /**
     * Same as {@link #visitInPostOrder(StateVisitor, int)}, starting from the
     * root node of the automaton.
     *
     * @param <T> type
     * @param v   the visitor
     * @return the visitor passed in
     */
    public <T extends StateVisitor> T visitInPostOrder(T v) {
        final int root = getRootNode();
        return visitInPostOrder(v, root);
    }

    /**
     * Visits all states reachable from <code>node</code> in postorder.
     * Returning false from {@link StateVisitor#accept(int)} immediately
     * terminates the traversal.
     *
     * @param <T>  type
     * @param v    the visitor
     * @param node node to start from
     * @return the visitor passed in
     */
    public <T extends StateVisitor> T visitInPostOrder(T v, int node) {
        final BitSet seen = new BitSet();
        visitInPostOrder(v, node, seen);
        return v;
    }

    /**
     * Recursive postorder step: descends through every non-terminal arc of
     * <code>node</code> first and hands the node to the visitor afterwards.
     *
     * @return <code>false</code> as soon as the visitor asked to stop,
     * <code>true</code> otherwise (including for an already visited node)
     */
    private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) {
        if (!visited.get(node)) {
            // Mark before descending so a node reachable via several arcs is handled once.
            visited.set(node);
            int arc = getFirstArc(node);
            while (arc != 0) {
                // Terminal arcs have no end node to descend into.
                if (!isArcTerminal(arc)) {
                    final boolean keepGoing = visitInPostOrder(v, getEndNode(arc), visited);
                    if (!keepGoing) {
                        return false;
                    }
                }
                arc = getNextArc(arc);
            }
            return v.accept(node);
        }
        return true;
    }

    /**
     * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from the
     * root node of the automaton.
     *
     * @param <T> type
     * @param v   the visitor
     * @return the visitor passed in
     */
    public <T extends StateVisitor> T visitInPreOrder(T v) {
        final int root = getRootNode();
        return visitInPreOrder(v, root);
    }

    /**
     * Visits all states reachable from <code>node</code> in preorder.
     * Returning false from {@link StateVisitor#accept(int)} skips traversal
     * of all sub-states of a given state.
     *
     * @param <T>  type
     * @param v    the visitor
     * @param node node to start from
     * @return the visitor passed in
     */
    public <T extends StateVisitor> T visitInPreOrder(T v, int node) {
        final BitSet seen = new BitSet();
        visitInPreOrder(v, node, seen);
        return v;
    }

    /**
     * Recursive preorder step: hands <code>node</code> to the visitor first
     * and, if the visitor accepts it, descends through every non-terminal
     * arc.
     */
    private void visitInPreOrder(StateVisitor v, int node, BitSet visited) {
        if (visited.get(node)) {
            return;
        }
        // The node is marked before the visitor sees it, so it is never offered twice,
        // even when the visitor rejects it.
        visited.set(node);
        if (!v.accept(node)) {
            return;
        }
        int arc = getFirstArc(node);
        while (arc != 0) {
            // Terminal arcs have no end node to descend into.
            if (!isArcTerminal(arc)) {
                visitInPreOrder(v, getEndNode(arc), visited);
            }
            arc = getNextArc(arc);
        }
    }
}