package org.xbib.elasticsearch.common.fsa;
import org.elasticsearch.common.io.Streams;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.TreeMap;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.ADDRESS_OFFSET;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.ARC_SIZE;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.BIT_ARC_FINAL;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.BIT_ARC_LAST;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.FLAGS_OFFSET;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.LABEL_OFFSET;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.TARGET_ADDRESS_SIZE;
import static org.xbib.elasticsearch.common.fsa.ConstantArcSizeFSA.TERMINAL_STATE;
/**
* Fast, memory-conservative finite state automaton builder, returning a
* byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction
* speed and memory consumption).
*/
public final class FSABuilder {
/**
* Comparator comparing full byte arrays consistently with
* {@link #compare(byte[], int, int, byte[], int, int)}, i.e. bytes are
* ordered as unsigned values and a proper prefix sorts before a longer sequence.
*/
public static final Comparator<byte[]> LEXICAL_ORDERING = (o1, o2) -> compare(o1, 0, o1.length, o2, 0, o2.length);
/**
* Number of bytes in a megabyte.
*/
private static final int MB = 1024 * 1024;
/**
* Default number of bytes by which the serialized FSA buffer grows on each
* reallocation; used by the no-argument constructor.
*/
private static final int BUFFER_GROWTH_SIZE = 5 * MB;
/**
* Maximum number of labels (outgoing arcs) from a single state. Every
* mutable state on the active path reserves room for this many arcs.
*/
private static final int MAX_LABELS = 256;
/**
* Number of bytes added to {@link #serialized} on each reallocation. The
* constructor never lets it drop below <code>ARC_SIZE * MAX_LABELS</code>.
*/
private final int bufferGrowthSize;
/**
* Holds serialized and mutable states. Each state is a sequential list of
* arcs, the last arc is marked with {@link ConstantArcSizeFSA#BIT_ARC_LAST}.
* Set to <code>null</code> once the automaton has been completed or loaded.
*/
private byte[] serialized = new byte[0];
/**
* Number of bytes already taken in {@link #serialized}. It starts at 0 and
* the constructor's first allocation is the epsilon state, so offset 0 never
* holds an interned state; this keeps 0 usable as the empty-slot marker of
* {@link #hashSet}.
*/
private int size;
/**
* States on the "active path" (still mutable). Values are addresses of each
* state's first arc.
*/
private int[] activePath = new int[0];
/**
* Current length of the active path (the length of the most recently added sequence).
*/
private int activePathLen;
/**
* The next offset at which an arc will be added to the given state on
* {@link #activePath}.
*/
private int[] nextArcOffset = new int[0];
/**
* Root state address: the first active-path state until {@link #complete()}
* replaces it with the address of the frozen root.
* NOTE(review): nothing in this class ever makes it negative; a finished
* builder is recognized by {@link #serialized} being <code>null</code>.
*/
private int root;
/**
* An epsilon state. The first and only arc of this state points either
* to the root or to the terminal state, indicating an empty automaton.
*/
private int epsilon;
/**
* Hash set of state addresses in {@link #serialized}, hashed by
* {@link #hash(int, int)}. Zero reserved for an unoccupied slot. The length
* is always a power of two (it starts at 2 and is only ever doubled).
*/
private int[] hashSet = new int[2];
/**
* Number of entries currently stored in {@link #hashSet}.
*/
private int hashSize = 0;
/**
* Copy of the previous sequence added to the automaton in
* {@link #add(byte[], int, int)}. The buffer is reused between calls, so
* only its first {@link #previousLength} bytes are meaningful.
*/
private byte[] previous;
/**
* Information about the automaton and its compilation; <code>null</code>
* until {@link #complete()} has run.
*/
private Map<InfoEntry, Object> info;
/**
* Number of valid bytes in {@link #previous}.
*/
private int previousLength;
/**
* Number of serialization buffer reallocations.
*/
private int serializationBufferReallocations;
/**
* Creates an empty builder using the default serialization buffer growth
* ({@link #BUFFER_GROWTH_SIZE} bytes per reallocation).
*/
public FSABuilder() {
this(BUFFER_GROWTH_SIZE);
}
/**
* @param bufferGrowthSize buffer growth size
*/
public FSABuilder(int bufferGrowthSize) {
this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);
// Allocate epsilon state.
epsilon = allocateState(1);
serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;
// Allocate root, with an initial empty set of output arcs.
expandActivePath(1);
root = activePath[0];
}
/**
 * Builds a minimal, deterministic automaton from an array of byte sequences.
 * The sequences are added in array order, so the array must already be sorted
 * according to {@link #LEXICAL_ORDERING}.
 *
 * @param input sorted input sequences
 * @return the completed automaton
 */
public static FSA build(byte[][] input) {
    final FSABuilder fsaBuilder = new FSABuilder();
    for (int index = 0; index < input.length; index++) {
        final byte[] sequence = input[index];
        fsaBuilder.add(sequence, 0, sequence.length);
    }
    return fsaBuilder.complete();
}
/**
 * Builds a minimal, deterministic automaton from an iterable of byte
 * sequences. The sequences are added in iteration order, so the iterable must
 * yield them sorted according to {@link #LEXICAL_ORDERING}.
 *
 * @param input sorted input sequences
 * @return the completed automaton
 */
public static FSA build(Iterable<byte[]> input) {
    final FSABuilder fsaBuilder = new FSABuilder();
    final java.util.Iterator<byte[]> sequences = input.iterator();
    while (sequences.hasNext()) {
        final byte[] sequence = sequences.next();
        fsaBuilder.add(sequence, 0, sequence.length);
    }
    return fsaBuilder.complete();
}
/**
 * Lexicographic comparison of two byte ranges. Bytes are compared as unsigned
 * values (0-255), consistent with the "C" sort; when one range is a prefix of
 * the other, the shorter range sorts first.
 *
 * @param s1 array holding the first sequence
 * @param start1 offset of the first sequence in {@code s1}
 * @param lens1 length of the first sequence
 * @param s2 array holding the second sequence
 * @param start2 offset of the second sequence in {@code s2}
 * @param lens2 length of the second sequence
 * @return the difference of the first pair of differing bytes (as unsigned
 *         values), or the difference of the lengths if no byte differs within
 *         the shorter length; negative, zero or positive accordingly
 */
public static int compare(byte[] s1, int start1, int lens1,
                          byte[] s2, int start2, int lens2) {
    final int shared = Math.min(lens1, lens2);
    for (int offset = 0; offset < shared; offset++) {
        final int left = s1[start1 + offset] & 0xff;
        final int right = s2[start2 + offset] & 0xff;
        if (left != right) {
            return left - right;
        }
    }
    return lens1 - lens2;
}
/**
 * Adds a single sequence of bytes to the automaton. The sequence must not be
 * lexicographically smaller than the previously added one (see
 * {@link #compare(byte[], int, int, byte[], int, int)}); adding the same
 * sequence twice in a row is accepted.
 *
 * @param sequence array holding the sequence
 * @param start offset of the sequence's first byte in {@code sequence}
 * @param len number of bytes in the sequence
 * @throws IllegalArgumentException if the automaton has already been
 *         completed or loaded, or if the sequence is out of order
 */
public void add(byte[] sequence, int start, int len) {
    if (serialized == null) {
        // The buffer is released by complete()/load(); the builder cannot be extended afterwards.
        throw new IllegalArgumentException("Automaton already built.");
    }
    if (previous != null && len != 0
            && compare(previous, 0, previousLength, sequence, start, len) > 0) {
        // Report only the bytes that were actually compared: "previous" is a
        // reused buffer that may hold stale bytes beyond previousLength, and
        // the new sequence is just the [start, start + len) slice.
        throw new IllegalArgumentException("Input must be sorted: "
                + Arrays.toString(Arrays.copyOf(previous, previousLength)) + " > "
                + Arrays.toString(Arrays.copyOfRange(sequence, start, start + len)));
    }
    setPrevious(sequence, start, len);
    // Determine common prefix length.
    final int commonPrefix = commonPrefix(sequence, start, len);
    // Make room for extra states on active path, if needed.
    expandActivePath(len);
    // Freeze all the states after the common prefix, deepest first, and
    // redirect the parent's last arc to the frozen (interned) copy.
    for (int i = activePathLen - 1; i > commonPrefix; i--) {
        final int frozenState = freezeState(i);
        setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
        // The mutable slot is empty again and can be reused.
        nextArcOffset[i] = activePath[i];
    }
    // Create arcs to new suffix states.
    for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
        final int p = nextArcOffset[i - 1];
        // Only the arc consuming the sequence's last byte is final.
        serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
        serialized[p + LABEL_OFFSET] = sequence[j++];
        setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);
        nextArcOffset[i - 1] = p + ARC_SIZE;
    }
    // Save last sequence's length so that we don't need to calculate it again.
    this.activePathLen = len;
}
/**
 * Completes the automaton: freezes every state still on the active path,
 * links the epsilon state to the root (or to the terminal state when nothing
 * was added), records the compilation statistics returned by
 * {@link #getInfo()} and releases the builder's internal buffers.
 * After this call the builder cannot be extended any more.
 *
 * @return the completed automaton
 * @throws IllegalArgumentException if the automaton has already been
 *         completed or loaded
 */
public FSA complete() {
    // Adding an empty sequence freezes all states below the root.
    add(new byte[0], 0, 0);
    if (nextArcOffset[0] - activePath[0] == 0) {
        // An empty FSA.
        setArcTarget(epsilon, TERMINAL_STATE);
    } else {
        // An automaton with at least a single arc from root.
        root = freezeState(0);
        setArcTarget(epsilon, root);
    }
    info = new TreeMap<>();
    info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
    info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
    info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
    info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
    info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
    info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
    // Sum in long arithmetic: for large automata the byte count of both
    // buffers together does not fit in an int.
    final long estimatedBytes = this.serialized.length + (long) this.hashSet.length * Integer.BYTES;
    info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB, estimatedBytes / (double) MB);
    // Hand out a copy trimmed to the bytes actually used.
    final FSA fsa = new ConstantArcSizeFSA(Arrays.copyOf(this.serialized, this.size), epsilon);
    this.serialized = null;
    this.hashSet = null;
    return fsa;
}
/**
 * Loads a previously serialized automaton from a stream. The expected layout
 * is an {@code int} size, an {@code int} epsilon state address and then
 * {@code size} bytes of serialized states. The stream is always closed, even
 * when reading fails.
 * <p>
 * On success this builder is marked as finished and cannot be extended any
 * more; on failure the builder is left untouched.
 *
 * @param inputStream stream to read the automaton from; closed by this method
 * @return the loaded automaton
 * @throws IOException if reading fails or the stored size is negative
 */
public FSA load(DataInputStream inputStream) throws IOException {
    // The header reads are inside the try so the stream is closed even if they fail.
    try (DataInputStream in = inputStream) {
        final int loadedSize = in.readInt();
        final int loadedEpsilon = in.readInt();
        if (loadedSize < 0) {
            throw new IOException("Invalid serialized automaton size: " + loadedSize);
        }
        // Read into a local buffer so a failed read does not clobber the builder.
        final byte[] data = new byte[loadedSize];
        Streams.readFully(in, data);
        final FSA fsa = new ConstantArcSizeFSA(data, loadedEpsilon);
        this.size = loadedSize;
        this.epsilon = loadedEpsilon;
        // A null buffer marks the builder as finished.
        this.serialized = null;
        return fsa;
    }
}
/**
* Returns statistics gathered while compiling the automaton. The map is
* created by {@link #complete()}, so this method returns <code>null</code>
* when called before it.
*
* @return various statistics concerning the FSA and its compilation.
*/
public Map<InfoEntry, Object> getInfo() {
return info;
}
/**
 * Tells whether the arc at the given offset is the last arc of its state,
 * i.e. whether its flags carry {@code BIT_ARC_LAST}.
 */
private boolean isArcLast(int arc) {
    final int flags = serialized[arc + FLAGS_OFFSET];
    return (flags & BIT_ARC_LAST) != 0;
}
/**
 * Tells whether the arc at the given offset is final, i.e. whether its flags
 * carry {@code BIT_ARC_FINAL}.
 */
private boolean isArcFinal(int arc) {
    final int flags = serialized[arc + FLAGS_OFFSET];
    return (flags & BIT_ARC_FINAL) != 0;
}
/**
 * Returns the label byte of the arc at the given offset.
 */
private byte getArcLabel(int arc) {
    final int labelIndex = arc + LABEL_OFFSET;
    return serialized[labelIndex];
}
/**
 * Stores the target state address of an arc. The address occupies
 * {@code TARGET_ADDRESS_SIZE} bytes starting at the arc's address offset,
 * most significant byte first.
 *
 * @param a offset of the arc
 * @param st address of the target state
 */
private void setArcTarget(int a, int st) {
    final int addressStart = a + ADDRESS_OFFSET;
    int remaining = st;
    // Write the least significant byte into the highest position and walk backwards.
    for (int i = TARGET_ADDRESS_SIZE - 1; i >= 0; i--) {
        serialized[addressStart + i] = (byte) remaining;
        remaining >>>= 8;
    }
}
/**
 * Returns the target state address stored in an arc. Reads the same
 * {@code TARGET_ADDRESS_SIZE} bytes, most significant first, that
 * {@link #setArcTarget(int, int)} writes, instead of assuming a fixed width
 * of four bytes.
 *
 * @param a offset of the arc
 * @return address of the arc's target state
 */
private int getArcTarget(int a) {
    final int addressStart = a + ADDRESS_OFFSET;
    int target = 0;
    for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
        // Mask to treat each stored byte as unsigned before merging it in.
        target = (target << 8) | (serialized[addressStart + i] & 0xff);
    }
    return target;
}
/**
 * Counts how many leading bytes of the given sequence match the labels of the
 * most recently added arcs along the active path, i.e. the prefix shared with
 * the previous sequence.
 *
 * @return the number of common prefix bytes with the previous sequence
 */
private int commonPrefix(byte[] sequence, int start, int len) {
    final int limit = Math.min(len, activePathLen);
    int matched = 0;
    while (matched < limit) {
        // The last arc written to this state belongs to the previous sequence.
        final int lastArc = nextArcOffset[matched] - ARC_SIZE;
        if (sequence[start + matched] != getArcLabel(lastArc)) {
            break;
        }
        matched++;
    }
    return matched;
}
/**
 * Freezes a state: looks for an equivalent state among the already interned
 * ones and returns it if found; otherwise serializes the mutable state at
 * <code>activePathIndex</code>, registers it in the hash set and returns the
 * address of the new copy.
 * <p>
 * Collisions are resolved with the same probe sequence as
 * {@link #expandAndRehash()} (steps of 1, 2, 3, ...), so that every slot is
 * examined only once per lookup.
 *
 * @param activePathIndex index of the state on the active path; the state
 *        must have at least one arc
 * @return address of the interned state
 */
private int freezeState(final int activePathIndex) {
    final int start = activePath[activePathIndex];
    final int end = nextArcOffset[activePathIndex];
    final int len = end - start;
    // Set the last arc flag on the current active path's state.
    serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;
    // Try to locate a state with an identical content in the hash set.
    final int bucketMask = hashSet.length - 1;
    int slot = hash(start, len) & bucketMask;
    for (int i = 0; ; ) {
        final int state = hashSet[slot];
        if (state == 0) {
            // Free slot: no equivalent state exists yet, intern this one.
            final int interned = serialize(activePathIndex);
            hashSet[slot] = interned;
            // Keep the load factor at or below one half.
            if (++hashSize > hashSet.length / 2) {
                expandAndRehash();
            }
            return interned;
        }
        if (equivalent(state, start, len)) {
            return state;
        }
        // Advance before probing so the colliding slot is not checked twice.
        slot = (slot + (++i)) & bucketMask;
    }
}
/**
 * Doubles the hash set and re-inserts every registered state address into the
 * new table, resolving collisions with probe steps of 1, 2, 3, ...
 */
private void expandAndRehash() {
    final int[] grown = new int[hashSet.length * 2];
    final int mask = grown.length - 1;
    for (int index = 0; index < hashSet.length; index++) {
        final int state = hashSet[index];
        if (state <= 0) {
            // Unoccupied slot.
            continue;
        }
        int slot = hash(state, stateLength(state)) & mask;
        for (int step = 1; grown[slot] > 0; step++) {
            slot = (slot + step) & mask;
        }
        grown[slot] = state;
    }
    this.hashSet = grown;
}
/**
 * Computes the total number of bytes occupied by a serialized state, i.e. all
 * of its arcs up to and including the one flagged as last.
 */
private int stateLength(int state) {
    int length = ARC_SIZE;
    // Extend the span arc by arc until its final arc carries the "last" flag.
    while (!isArcLast(state + length - ARC_SIZE)) {
        length += ARC_SIZE;
    }
    return length;
}
/**
 * Checks whether two regions of {@link #serialized} with the given length hold
 * identical bytes. Regions reaching beyond the used part of the buffer are
 * never considered equal.
 *
 * @return <code>true</code> if both regions are identical
 */
private boolean equivalent(int start1, int start2, int l) {
    if (start1 + l > size || start2 + l > size) {
        return false;
    }
    for (int offset = 0; offset < l; offset++) {
        if (serialized[start1 + offset] != serialized[start2 + offset]) {
            return false;
        }
    }
    return true;
}
/**
 * Copies the arcs of a state on the active path to the end of the used part
 * of {@link #serialized}, making it a permanent state.
 *
 * @return address of the copied state
 */
private int serialize(final int activePathIndex) {
    // May replace the buffer, so it has to run before any offsets are used.
    expandBuffers();
    final int source = activePath[activePathIndex];
    final int byteCount = nextArcOffset[activePathIndex] - source;
    final int target = size;
    System.arraycopy(serialized, source, serialized, target, byteCount);
    size = target + byteCount;
    return target;
}
/**
 * Computes the hash code of a state stored in {@link #serialized}. Each arc
 * contributes its label, its target address and whether it is final.
 *
 * @param st offset of the state's first arc
 * @param byteCount length of the state in bytes; must be a multiple of the arc size
 * @throws IllegalArgumentException if {@code byteCount} is not a multiple of the arc size
 */
private int hash(int st, int byteCount) {
    if (byteCount % ARC_SIZE != 0) {
        throw new IllegalArgumentException("not an arc multiply?");
    }
    int code = 0;
    int arc = st;
    for (int remaining = byteCount / ARC_SIZE; remaining > 0; remaining--, arc += ARC_SIZE) {
        code = 17 * code + getArcLabel(arc);
        code = 17 * code + getArcTarget(arc);
        code += isArcFinal(arc) ? 17 : 0;
    }
    return code;
}
/**
 * Makes sure the active path can hold at least the given number of states,
 * allocating a new mutable state (with room for the maximum number of labels)
 * for every added position. Does nothing if the path is already long enough.
 */
private void expandActivePath(int size) {
    final int oldLength = activePath.length;
    if (oldLength >= size) {
        return;
    }
    activePath = Arrays.copyOf(activePath, size);
    nextArcOffset = Arrays.copyOf(nextArcOffset, size);
    for (int i = oldLength; i < size; i++) {
        // A fresh state has no arcs yet: its next arc goes to its first slot.
        final int state = allocateState(MAX_LABELS);
        activePath[i] = state;
        nextArcOffset[i] = state;
    }
}
/**
 * Grows {@link #serialized} by {@link #bufferGrowthSize} bytes when it cannot
 * hold one more state with the maximum number of arcs, and counts the
 * reallocation.
 */
private void expandBuffers() {
    final int required = size + ARC_SIZE * MAX_LABELS;
    if (required <= serialized.length) {
        return;
    }
    serialized = Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
    serializationBufferReallocations++;
}
/**
 * Reserves space at the end of the used part of {@link #serialized} for a
 * state with the given number of outgoing labels.
 *
 * @param labels number of arcs to reserve room for
 * @return offset of the reserved state
 */
private int allocateState(int labels) {
    expandBuffers();
    final int offset = size;
    size = offset + labels * ARC_SIZE;
    return offset;
}
/**
 * Remembers the given sequence as the previously added one by copying it into
 * the {@link #previous} buffer, which is reallocated only when too small.
 *
 * @return always <code>true</code>
 */
private boolean setPrevious(byte[] sequence, int start, int length) {
    final boolean bufferTooSmall = previous == null || previous.length < length;
    if (bufferTooSmall) {
        previous = new byte[length];
    }
    System.arraycopy(sequence, start, previous, 0, length);
    previousLength = length;
    return true;
}
/**
 * Keys of the debug and statistics entries reported for a compiled automaton.
 * Each constant's {@link #toString()} yields a human-readable description.
 *
 * @see FSABuilder#getInfo()
 */
public enum InfoEntry {
    SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
    SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
    CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
    MAX_ACTIVE_PATH_LENGTH("Max active path"),
    STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
    STATE_REGISTRY_SIZE("Registry hash entries"),
    ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");

    /** Human-readable description of the entry. */
    private final String description;

    InfoEntry(String description) {
        this.description = description;
    }

    /**
     * @return the human-readable description of this entry
     */
    @Override
    public String toString() {
        return this.description;
    }
}
}