package org.apache.lucene.util.fst; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.fst.Builder.UnCompiledNode; import org.apache.lucene.util.packed.GrowableWriter; import org.apache.lucene.util.packed.PackedInts; // TODO: break this into WritableFST and ReadOnlyFST.. then // we can have subclasses of ReadOnlyFST to handle the // different byte[] level encodings (packed or // not)... and things like nodeCount, arcCount are read only // TODO: if FST is pure prefix trie we can do a more compact // job, ie, once we are at a 'suffix only', just store the // completion labels as a string not as a series of arcs. // TODO: maybe make an explicit thread state that holds // reusable stuff eg BytesReader, a scratch arc // NOTE: while the FST is able to represent a non-final // dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! /** Represents an finite state machine (FST), using a * compact byte[] format. * <p> The format is similar to what's used by Morfologik * (http://sourceforge.net/projects/morfologik). * * <p> See the {@link org.apache.lucene.util.fst package * documentation} for some simple examples. * <p><b>NOTE</b>: the FST cannot be larger than ~2.1 GB * because it uses int to address the byte[]. * * @lucene.experimental */ public final class FST<T> { /** Specifies allowed range of each int input label for * this FST. */ public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; public final INPUT_TYPE inputType; final static int BIT_FINAL_ARC = 1 << 0; final static int BIT_LAST_ARC = 1 << 1; final static int BIT_TARGET_NEXT = 1 << 2; // TODO: we can free up a bit if we can nuke this: final static int BIT_STOP_NODE = 1 << 3; final static int BIT_ARC_HAS_OUTPUT = 1 << 4; final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; // Arcs are stored as fixed-size (per entry) array, so // that we can find an arc using binary search. We do // this when number of arcs is > NUM_ARCS_ARRAY: // If set, the target node is delta coded vs current // position: private final static int BIT_TARGET_DELTA = 1 << 6; private final static byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT; /** * @see #shouldExpand(UnCompiledNode) */ final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node. /** * @see #shouldExpand(UnCompiledNode) */ final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5; /** * @see #shouldExpand(UnCompiledNode) */ final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10; private int[] bytesPerArc = new int[0]; // Increment version to change it private final static String FILE_FORMAT_NAME = "FST"; private final static int VERSION_START = 0; /** Changed numBytesPerArc for array'd case from byte to int. */ private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1; /** Write BYTE2 labels as 2-byte short, not vInt. */ private final static int VERSION_SHORT_BYTE2_LABELS = 2; /** Added optional packed format. */ private final static int VERSION_PACKED = 3; private final static int VERSION_CURRENT = VERSION_PACKED; // Never serialized; just used to represent the virtual // final node w/ no arcs: private final static int FINAL_END_NODE = -1; // Never serialized; just used to represent the virtual // non-final node w/ no arcs: private final static int NON_FINAL_END_NODE = 0; // if non-null, this FST accepts the empty string and // produces this output T emptyOutput; private byte[] emptyOutputBytes; // Not private to avoid synthetic access$NNN methods: byte[] bytes; int byteUpto = 0; private int startNode = -1; public final Outputs<T> outputs; private int lastFrozenNode; private final T NO_OUTPUT; public int nodeCount; public int arcCount; public int arcWithOutputCount; private final boolean packed; private PackedInts.Reader nodeRefToAddress; /** If arc has this label then that arc is final/accepted */ public static final int END_LABEL = -1; private boolean allowArrayArcs = true; private Arc<T> cachedRootArcs[]; /** Represents a single arc. */ public final static class Arc<T> { public int label; public T output; // From node (ord or address); currently only used when // building an FST w/ willPackFST=true: int node; /** To node (ord or address) */ public int target; byte flags; public T nextFinalOutput; // address (into the byte[]), or ord/address if label == END_LABEL int nextArc; // This is non-zero if current arcs are fixed array: int posArcsStart; int bytesPerArc; int arcIdx; int numArcs; /** Returns this */ public Arc<T> copyFrom(Arc<T> other) { node = other.node; label = other.label; target = other.target; flags = other.flags; output = other.output; nextFinalOutput = other.nextFinalOutput; nextArc = other.nextArc; bytesPerArc = other.bytesPerArc; if (bytesPerArc != 0) { posArcsStart = other.posArcsStart; arcIdx = other.arcIdx; numArcs = other.numArcs; } return this; } boolean flag(int flag) { return FST.flag(flags, flag); } public boolean isLast() { return flag(BIT_LAST_ARC); } public boolean isFinal() { return flag(BIT_FINAL_ARC); } @Override public String toString() { StringBuilder b = new StringBuilder(); b.append("node=" + node); b.append(" target=" + target); b.append(" label=" + label); if (flag(BIT_LAST_ARC)) { b.append(" last"); } if (flag(BIT_FINAL_ARC)) { b.append(" final"); } if (flag(BIT_TARGET_NEXT)) { b.append(" targetNext"); } if (flag(BIT_ARC_HAS_OUTPUT)) { b.append(" output=" + output); } if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { b.append(" nextFinalOutput=" + nextFinalOutput); } if (bytesPerArc != 0) { b.append(" arcArray(idx=" + arcIdx + " of " + numArcs + ")"); } return b.toString(); } }; private static boolean flag(int flags, int bit) { return (flags & bit) != 0; } private final BytesWriter writer; private GrowableWriter nodeAddress; // TODO: we could be smarter here, and prune periodically // as we go; high in-count nodes will "usually" become // clear early on: private GrowableWriter inCounts; // make a new empty FST, for building; Builder invokes // this ctor FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio) { this.inputType = inputType; this.outputs = outputs; bytes = new byte[128]; NO_OUTPUT = outputs.getNoOutput(); if (willPackFST) { nodeAddress = new GrowableWriter(PackedInts.bitsRequired(bytes.length - 1), 8, acceptableOverheadRatio); inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio); } else { nodeAddress = null; inCounts = null; } writer = new BytesWriter(); emptyOutput = null; packed = false; nodeRefToAddress = null; } /** Load a previously saved FST. */ public FST(DataInput in, Outputs<T> outputs) throws IOException { this.outputs = outputs; writer = null; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string int numBytes = in.readVInt(); // messy bytes = new byte[numBytes]; in.readBytes(bytes, 0, numBytes); if (packed) { emptyOutput = outputs.read(getBytesReader(0)); } else { emptyOutput = outputs.read(getBytesReader(numBytes-1)); } } else { emptyOutput = null; } final byte t = in.readByte(); switch(t) { case 0: inputType = INPUT_TYPE.BYTE1; break; case 1: inputType = INPUT_TYPE.BYTE2; break; case 2: inputType = INPUT_TYPE.BYTE4; break; default: throw new IllegalStateException("invalid input type " + t); } if (packed) { nodeRefToAddress = PackedInts.getReader(in); } else { nodeRefToAddress = null; } startNode = in.readVInt(); nodeCount = in.readVInt(); arcCount = in.readVInt(); arcWithOutputCount = in.readVInt(); bytes = new byte[in.readVInt()]; in.readBytes(bytes, 0, bytes.length); NO_OUTPUT = outputs.getNoOutput(); cacheRootArcs(); } public INPUT_TYPE getInputType() { return inputType; } /** Returns bytes used to represent the FST */ public int sizeInBytes() { int size = bytes.length; if (packed) { size += nodeRefToAddress.ramBytesUsed(); } else if (nodeAddress != null) { size += nodeAddress.ramBytesUsed(); size += inCounts.ramBytesUsed(); } return size; } void finish(int startNode) throws IOException { if (startNode == FINAL_END_NODE && emptyOutput != null) { startNode = 0; } if (this.startNode != -1) { throw new IllegalStateException("already finished"); } byte[] finalBytes = new byte[writer.posWrite]; System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite); bytes = finalBytes; this.startNode = startNode; cacheRootArcs(); } private int getNodeAddress(int node) { if (nodeAddress != null) { // Deref return (int) nodeAddress.get(node); } else { // Straight return node; } } // Caches first 128 labels @SuppressWarnings({"rawtypes","unchecked"}) private void cacheRootArcs() throws IOException { cachedRootArcs = (Arc<T>[]) new Arc[0x80]; final Arc<T> arc = new Arc<T>(); getFirstArc(arc); final BytesReader in = getBytesReader(0); if (targetHasArcs(arc)) { readFirstRealTargetArc(arc.target, arc, in); while(true) { assert arc.label != END_LABEL; if (arc.label < cachedRootArcs.length) { cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc); } else { break; } if (arc.isLast()) { break; } readNextRealArc(arc, in); } } } public T getEmptyOutput() { return emptyOutput; } void setEmptyOutput(T v) throws IOException { if (emptyOutput != null) { emptyOutput = outputs.merge(emptyOutput, v); } else { emptyOutput = v; } // TODO: this is messy -- replace with sillyBytesWriter; maybe make // bytes private final int posSave = writer.posWrite; outputs.write(emptyOutput, writer); emptyOutputBytes = new byte[writer.posWrite-posSave]; if (!packed) { // reverse final int stopAt = (writer.posWrite - posSave)/2; int upto = 0; while(upto < stopAt) { final byte b = bytes[posSave + upto]; bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; bytes[writer.posWrite-upto-1] = b; upto++; } } System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave); writer.posWrite = posSave; } public void save(DataOutput out) throws IOException { if (startNode == -1) { throw new IllegalStateException("call finish first"); } if (nodeAddress != null) { throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed"); } if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) { throw new IllegalStateException("cannot save a FST which has been loaded from disk "); } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); if (packed) { out.writeByte((byte) 1); } else { out.writeByte((byte) 0); } // TODO: really we should encode this as an arc, arriving // to the root node, instead of special casing here: if (emptyOutput != null) { out.writeByte((byte) 1); out.writeVInt(emptyOutputBytes.length); out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); } else { out.writeByte((byte) 0); } final byte t; if (inputType == INPUT_TYPE.BYTE1) { t = 0; } else if (inputType == INPUT_TYPE.BYTE2) { t = 1; } else { t = 2; } out.writeByte(t); if (packed) { ((PackedInts.Mutable) nodeRefToAddress).save(out); } out.writeVInt(startNode); out.writeVInt(nodeCount); out.writeVInt(arcCount); out.writeVInt(arcWithOutputCount); out.writeVInt(bytes.length); out.writeBytes(bytes, 0, bytes.length); } /** * Writes an automaton to a file. */ public void save(final File file) throws IOException { boolean success = false; OutputStream os = new BufferedOutputStream(new FileOutputStream(file)); try { save(new OutputStreamDataOutput(os)); success = true; } finally { if (success) { IOUtils.close(os); } else { IOUtils.closeWhileHandlingException(os); } } } /** * Reads an automaton from a file. */ public static <T> FST<T> read(File file, Outputs<T> outputs) throws IOException { InputStream is = new BufferedInputStream(new FileInputStream(file)); boolean success = false; try { FST<T> fst = new FST<T>(new InputStreamDataInput(is), outputs); success = true; return fst; } finally { if (success) { IOUtils.close(is); } else { IOUtils.closeWhileHandlingException(is); } } } private void writeLabel(int v) throws IOException { assert v >= 0: "v=" + v; if (inputType == INPUT_TYPE.BYTE1) { assert v <= 255: "v=" + v; writer.writeByte((byte) v); } else if (inputType == INPUT_TYPE.BYTE2) { assert v <= 65535: "v=" + v; writer.writeShort((short) v); } else { //writeInt(v); writer.writeVInt(v); } } int readLabel(DataInput in) throws IOException { final int v; if (inputType == INPUT_TYPE.BYTE1) { // Unsigned byte: v = in.readByte()&0xFF; } else if (inputType == INPUT_TYPE.BYTE2) { // Unsigned short: v = in.readShort()&0xFFFF; } else { v = in.readVInt(); } return v; } /** returns true if the node at this address has any * outgoing arcs */ public static<T> boolean targetHasArcs(Arc<T> arc) { return arc.target > 0; } // serializes new node by appending its bytes to the end // of the current byte[] int addNode(Builder.UnCompiledNode<T> nodeIn) throws IOException { //System.out.println("FST.addNode pos=" + writer.posWrite + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { return FINAL_END_NODE; } else { return NON_FINAL_END_NODE; } } int startAddress = writer.posWrite; //System.out.println(" startAddr=" + startAddress); final boolean doFixedArray = shouldExpand(nodeIn); final int fixedArrayStart; if (doFixedArray) { if (bytesPerArc.length < nodeIn.numArcs) { bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } // write a "false" first arc: writer.writeByte(ARCS_AS_FIXED_ARRAY); writer.writeVInt(nodeIn.numArcs); // placeholder -- we'll come back and write the number // of bytes per arc (int) here: // TODO: we could make this a vInt instead writer.writeInt(0); fixedArrayStart = writer.posWrite; //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); } else { fixedArrayStart = 0; } arcCount += nodeIn.numArcs; final int lastArc = nodeIn.numArcs-1; int lastArcStart = writer.posWrite; int maxBytesPerArc = 0; for(int arcIdx=0;arcIdx<nodeIn.numArcs;arcIdx++) { final Builder.Arc<T> arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; if (arcIdx == lastArc) { flags += BIT_LAST_ARC; } if (lastFrozenNode == target.node && !doFixedArray) { // TODO: for better perf (but more RAM used) we // could avoid this except when arc is "near" the // last arc: flags += BIT_TARGET_NEXT; } if (arc.isFinal) { flags += BIT_FINAL_ARC; if (arc.nextFinalOutput != NO_OUTPUT) { flags += BIT_ARC_HAS_FINAL_OUTPUT; } } else { assert arc.nextFinalOutput == NO_OUTPUT; } boolean targetHasArcs = target.node > 0; if (!targetHasArcs) { flags += BIT_STOP_NODE; } else if (inCounts != null) { inCounts.set(target.node, inCounts.get(target.node) + 1); } if (arc.output != NO_OUTPUT) { flags += BIT_ARC_HAS_OUTPUT; } writer.writeByte((byte) flags); writeLabel(arc.label); // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + writer.posWrite + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, writer); //System.out.println(" write output"); arcWithOutputCount++; } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); outputs.write(arc.nextFinalOutput, writer); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); writer.writeInt(target.node); } // just write the arcs "like normal" on first pass, // but record how many bytes each one took, and max // byte size: if (doFixedArray) { bytesPerArc[arcIdx] = writer.posWrite - lastArcStart; lastArcStart = writer.posWrite; maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); //System.out.println(" bytes=" + bytesPerArc[arcIdx]); } } // TODO: if arc'd arrays will be "too wasteful" by some // measure, eg if arcs have vastly different sized // outputs, then we should selectively disable array for // such cases if (doFixedArray) { //System.out.println(" doFixedArray"); assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size final int sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; assert ((long) fixedArrayStart) + ((long) nodeIn.numArcs) * maxBytesPerArc < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; bytes = ArrayUtil.grow(bytes, sizeNeeded); // TODO: we could make this a vInt instead bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; // expand the arcs in place, backwards int srcPos = writer.posWrite; int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; writer.posWrite = destPos; for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); destPos -= maxBytesPerArc; srcPos -= bytesPerArc[arcIdx]; if (srcPos != destPos) { assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " bytesPerArc[arcIdx]=" + bytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); } } } // reverse bytes in-place; we do this so that the // "BIT_TARGET_NEXT" opto can work, ie, it reads the // node just before the current one final int endAddress = writer.posWrite - 1; int left = startAddress; int right = endAddress; while (left < right) { final byte b = bytes[left]; bytes[left++] = bytes[right]; bytes[right--] = b; } //System.out.println(" endAddress=" + endAddress); nodeCount++; final int node; if (nodeAddress != null) { // Nodes are addressed by 1+ord: if (nodeCount == nodeAddress.size()) { nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); } nodeAddress.set(nodeCount, endAddress); // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); node = nodeCount; } else { node = endAddress; } lastFrozenNode = node; return node; } /** Fills virtual 'start' arc, ie, an empty incoming arc to * the FST's start node */ public Arc<T> getFirstArc(Arc<T> arc) { if (emptyOutput != null) { arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = emptyOutput; } else { arc.flags = BIT_LAST_ARC; arc.nextFinalOutput = NO_OUTPUT; } arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the // empty string, then startNode is 0 arc.target = startNode; return arc; } /** Follows the <code>follow</code> arc and reads the last * arc of its target; this changes the provided * <code>arc</code> (2nd arg) in-place and returns it. * * @return Returns the second argument * (<code>arc</code>). */ public Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, FST.BytesReader in) throws IOException { //System.out.println("readLast"); if (!targetHasArcs(follow)) { //System.out.println(" end node"); assert follow.isFinal(); arc.label = END_LABEL; arc.target = FINAL_END_NODE; arc.output = follow.nextFinalOutput; arc.flags = BIT_LAST_ARC; return arc; } else { in.pos = getNodeAddress(follow.target); arc.node = follow.target; final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { // array: jump straight to end arc.numArcs = in.readVInt(); if (packed) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); } //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); arc.posArcsStart = in.pos; arc.arcIdx = arc.numArcs - 2; } else { arc.flags = b; // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); while(!arc.isLast()) { // skip this arc: readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { outputs.read(in); } if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { outputs.read(in); } if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { } else { if (packed) { in.readVInt(); } else { in.skip(4); } } arc.flags = in.readByte(); } // Undo the byte flags we read: in.skip(-1); arc.nextArc = in.pos; } readNextRealArc(arc, in); assert arc.isLast(); return arc; } } /** * Follow the <code>follow</code> arc and read the first arc of its target; * this changes the provided <code>arc</code> (2nd arg) in-place and returns * it. * * @return Returns the second argument (<code>arc</code>). */ public Arc<T> readFirstTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { //int pos = address; //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); if (follow.isFinal()) { // Insert "fake" final first arc: arc.label = END_LABEL; arc.output = follow.nextFinalOutput; arc.flags = BIT_FINAL_ARC; if (follow.target <= 0) { arc.flags |= BIT_LAST_ARC; } else { arc.node = follow.target; // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; } arc.target = FINAL_END_NODE; //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); return arc; } else { return readFirstRealTargetArc(follow.target, arc, in); } } public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException { assert in.bytes == bytes; final int address = getNodeAddress(node); in.pos = address; //System.out.println(" readFirstRealTargtArc address=" //+ address); //System.out.println(" flags=" + arc.flags); arc.node = node; if (in.readByte() == ARCS_AS_FIXED_ARRAY) { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); if (packed) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); } arc.arcIdx = -1; arc.nextArc = arc.posArcsStart = in.pos; //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { //arc.flags = b; arc.nextArc = address; arc.bytesPerArc = 0; } return readNextRealArc(arc, in); } /** * Checks if <code>arc</code>'s target state is in expanded (or vector) format. * * @return Returns <code>true</code> if <code>arc</code> points to a state in an * expanded array format. */ boolean isExpandedTarget(Arc<T> follow, FST.BytesReader in) throws IOException { if (!targetHasArcs(follow)) { return false; } else { in.pos = getNodeAddress(follow.target); return in.readByte() == ARCS_AS_FIXED_ARRAY; } } /** In-place read; returns the arc. */ public Arc<T> readNextArc(Arc<T> arc, BytesReader in) throws IOException { if (arc.label == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc <= 0) { throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); } return readFirstRealTargetArc(arc.nextArc, arc, in); } else { return readNextRealArc(arc, in); } } /** Peeks at next arc's label; does not alter arc. Do * not call this if arc.isLast()! */ public int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException { assert !arc.isLast(); if (arc.label == END_LABEL) { //System.out.println(" nextArc fake " + arc.nextArc); in.pos = getNodeAddress(arc.nextArc); final byte b = bytes[in.pos]; if (b == ARCS_AS_FIXED_ARRAY) { //System.out.println(" nextArc fake array"); in.skip(1); in.readVInt(); if (packed) { in.readVInt(); } else { in.readInt(); } } } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are at fixed entries in.pos = arc.posArcsStart; in.skip((1+arc.arcIdx)*arc.bytesPerArc); } else { // arcs are packed //System.out.println(" nextArc real packed"); in.pos = arc.nextArc; } } // skip flags in.readByte(); return readLabel(in); } /** Never returns null, but you should never call this if * arc.isLast() is true. */ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException { assert in.bytes == bytes; // TODO: can't assert this because we call from readFirstArc // assert !flag(arc.flags, BIT_LAST_ARC); // this is a continuing arc in a fixed array if (arc.bytesPerArc != 0) { // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; in.skip(arc.posArcsStart, arc.arcIdx*arc.bytesPerArc); } else { // arcs are packed in.pos = arc.nextArc; } arc.flags = in.readByte(); arc.label = readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { arc.output = outputs.read(in); } else { arc.output = outputs.getNoOutput(); } if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { arc.nextFinalOutput = outputs.read(in); } else { arc.nextFinalOutput = outputs.getNoOutput(); } if (arc.flag(BIT_STOP_NODE)) { if (arc.flag(BIT_FINAL_ARC)) { arc.target = FINAL_END_NODE; } else { arc.target = NON_FINAL_END_NODE; } arc.nextArc = in.pos; } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.pos; // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... if (nodeAddress == null) { if (!arc.flag(BIT_LAST_ARC)) { if (arc.bytesPerArc == 0) { // must scan seekToNextNode(in); } else { in.skip(arc.posArcsStart, arc.bytesPerArc * arc.numArcs); } } arc.target = in.pos; } else { arc.target = arc.node - 1; assert arc.target > 0; } } else { if (packed) { final int pos = in.pos; final int code = in.readVInt(); if (arc.flag(BIT_TARGET_DELTA)) { // Address is delta-coded from current address: arc.target = pos + code; //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); } else if (code < nodeRefToAddress.size()) { // Deref arc.target = (int) nodeRefToAddress.get(code); //System.out.println(" deref code=" + code + " target=" + arc.target); } else { // Absolute arc.target = code; //System.out.println(" abs code=" + code + " derefLen=" + nodeRefToAddress.length); } } else { arc.target = in.readInt(); } arc.nextArc = in.pos; } return arc; } /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. */ public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { assert cachedRootArcs != null; assert in.bytes == bytes; if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target <= 0) { arc.flags = BIT_LAST_ARC; } else { arc.flags = 0; // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; arc.node = follow.target; } arc.output = follow.nextFinalOutput; arc.label = END_LABEL; return arc; } else { return null; } } // Short-circuit if this arc is in the root arc cache: if (follow.target == startNode && labelToMatch < cachedRootArcs.length) { final Arc<T> result = cachedRootArcs[labelToMatch]; if (result == null) { return result; } else { arc.copyFrom(result); return arc; } } if (!targetHasArcs(follow)) { return null; } in.pos = getNodeAddress(follow.target); arc.node = follow.target; // System.out.println("fta label=" + (char) labelToMatch); if (in.readByte() == ARCS_AS_FIXED_ARRAY) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); if (packed) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); } arc.posArcsStart = in.pos; int low = 0; int high = arc.numArcs-1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; in.skip(arc.posArcsStart, arc.bytesPerArc*mid + 1); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; if (cmp < 0) { low = mid + 1; } else if (cmp > 0) { high = mid - 1; } else { arc.arcIdx = mid-1; //System.out.println(" found!"); return readNextRealArc(arc, in); } } return null; } // Linear scan readFirstRealTargetArc(follow.target, arc, in); while(true) { //System.out.println(" non-bs cycle"); // TODO: we should fix this code to not have to create // object for the output of every arc we scan... only // for the matching arc, if found if (arc.label == labelToMatch) { //System.out.println(" found!"); return arc; } else if (arc.label > labelToMatch) { return null; } else if (arc.isLast()) { return null; } else { readNextRealArc(arc, in); } } } private void seekToNextNode(BytesReader in) throws IOException { while(true) { final int flags = in.readByte(); readLabel(in); if (flag(flags, BIT_ARC_HAS_OUTPUT)) { outputs.read(in); } if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { outputs.read(in); } if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { if (packed) { in.readVInt(); } else { in.readInt(); } } if (flag(flags, BIT_LAST_ARC)) { return; } } } public int getNodeCount() { // 1+ in order to count the -1 implicit final node return 1+nodeCount; } public int getArcCount() { return arcCount; } public int getArcWithOutputCount() { return arcWithOutputCount; } public void setAllowArrayArcs(boolean v) { allowArrayArcs = v; } /** * Nodes will be expanded if their depth (distance from the root node) is * <= this value and their number of arcs is >= * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}. * * <p> * Fixed array consumes more RAM but enables binary search on the arcs * (instead of a linear scan) on lookup by arc label. * * @return <code>true</code> if <code>node</code> should be stored in an * expanded (array) form. * * @see #FIXED_ARRAY_NUM_ARCS_DEEP * @see Builder.UnCompiledNode#depth */ private boolean shouldExpand(UnCompiledNode<T> node) { return allowArrayArcs && ((node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP); } // Non-static: writes to FST's byte[] class BytesWriter extends DataOutput { int posWrite; public BytesWriter() { // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs posWrite = 1; } @Override public void writeByte(byte b) { assert posWrite <= bytes.length; if (bytes.length == posWrite) { assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; bytes = ArrayUtil.grow(bytes); } assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; bytes[posWrite++] = b; } public void setPosWrite(int posWrite) { this.posWrite = posWrite; if (bytes.length < posWrite) { assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; bytes = ArrayUtil.grow(bytes, posWrite); } } @Override public void writeBytes(byte[] b, int offset, int length) { final int size = posWrite + length; assert bytes.length < Integer.MAX_VALUE: "FST too large (> 2.1 GB)"; bytes = ArrayUtil.grow(bytes, size); System.arraycopy(b, offset, bytes, posWrite, length); posWrite += length; } } public BytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? if (packed) { return new ForwardBytesReader(bytes, pos); } else { return new ReverseBytesReader(bytes, pos); } } /** Reads the bytes from this FST. Use {@link * #getBytesReader(int)} to obtain an instance for this * FST; re-use across calls (but only within a single * thread) for better performance. */ public static abstract class BytesReader extends DataInput { protected int pos; protected final byte[] bytes; protected BytesReader(byte[] bytes, int pos) { this.bytes = bytes; this.pos = pos; } abstract void skip(int byteCount); abstract void skip(int base, int byteCount); } final static class ReverseBytesReader extends BytesReader { public ReverseBytesReader(byte[] bytes, int pos) { super(bytes, pos); } @Override public byte readByte() { return bytes[pos--]; } @Override public void readBytes(byte[] b, int offset, int len) { for(int i=0;i<len;i++) { b[offset+i] = bytes[pos--]; } } public void skip(int count) { pos -= count; } public void skip(int base, int count) { pos = base - count; } } // TODO: can we use just ByteArrayDataInput...? need to // add a .skipBytes to DataInput.. hmm and .setPosition final static class ForwardBytesReader extends BytesReader { public ForwardBytesReader(byte[] bytes, int pos) { super(bytes, pos); } @Override public byte readByte() { return bytes[pos++]; } @Override public void readBytes(byte[] b, int offset, int len) { System.arraycopy(bytes, pos, b, offset, len); pos += len; } public void skip(int count) { pos += count; } public void skip(int base, int count) { pos = base + count; } } private static class ArcAndState<T> { final Arc<T> arc; final IntsRef chain; public ArcAndState(Arc<T> arc, IntsRef chain) { this.arc = arc; this.chain = chain; } } /* public void countSingleChains() throws IOException { // TODO: must assert this FST was built with // "willRewrite" final List<ArcAndState<T>> queue = new ArrayList<ArcAndState<T>>(); // TODO: use bitset to not revisit nodes already // visited FixedBitSet seen = new FixedBitSet(1+nodeCount); int saved = 0; queue.add(new ArcAndState<T>(getFirstArc(new Arc<T>()), new IntsRef())); Arc<T> scratchArc = new Arc<T>(); while(queue.size() > 0) { //System.out.println("cycle size=" + queue.size()); //for(ArcAndState<T> ent : queue) { // System.out.println(" " + Util.toBytesRef(ent.chain, new BytesRef())); // } final ArcAndState<T> arcAndState = queue.get(queue.size()-1); seen.set(arcAndState.arc.node); final BytesRef br = Util.toBytesRef(arcAndState.chain, new BytesRef()); if (br.length > 0 && br.bytes[br.length-1] == -1) { br.length--; } //System.out.println(" top node=" + arcAndState.arc.target + " chain=" + br.utf8ToString()); if (targetHasArcs(arcAndState.arc) && !seen.get(arcAndState.arc.target)) { // push readFirstTargetArc(arcAndState.arc, scratchArc); //System.out.println(" push label=" + (char) scratchArc.label); //System.out.println(" tonode=" + scratchArc.target + " last?=" + scratchArc.isLast()); final IntsRef chain = IntsRef.deepCopyOf(arcAndState.chain); chain.grow(1+chain.length); // TODO //assert scratchArc.label != END_LABEL; chain.ints[chain.length] = scratchArc.label; chain.length++; if (scratchArc.isLast()) { if (scratchArc.target != -1 && inCounts[scratchArc.target] == 1) { //System.out.println(" append"); } else { if (arcAndState.chain.length > 1) { saved += chain.length-2; try { System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef()).utf8ToString()); } catch (AssertionError ae) { System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef())); } } chain.length = 0; } } else { //System.out.println(" reset"); if (arcAndState.chain.length > 1) { saved += arcAndState.chain.length-2; try { System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); } catch (AssertionError ae) { System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef())); } } if (scratchArc.target != -1 && inCounts[scratchArc.target] != 1) { chain.length = 0; } else { chain.ints[0] = scratchArc.label; chain.length = 1; } } // TODO: instead of new Arc() we can re-use from // a by-depth array queue.add(new ArcAndState<T>(new Arc<T>().copyFrom(scratchArc), chain)); } else if (!arcAndState.arc.isLast()) { // next readNextArc(arcAndState.arc); //System.out.println(" next label=" + (char) arcAndState.arc.label + " len=" + arcAndState.chain.length); if (arcAndState.chain.length != 0) { arcAndState.chain.ints[arcAndState.chain.length-1] = arcAndState.arc.label; } } else { if (arcAndState.chain.length > 1) { saved += arcAndState.chain.length-2; System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); } // pop //System.out.println(" pop"); queue.remove(queue.size()-1); while(queue.size() > 0 && queue.get(queue.size()-1).arc.isLast()) { queue.remove(queue.size()-1); } if (queue.size() > 0) { final ArcAndState<T> arcAndState2 = queue.get(queue.size()-1); readNextArc(arcAndState2.arc); //System.out.println(" read next=" + (char) arcAndState2.arc.label + " queue=" + queue.size()); assert arcAndState2.arc.label != END_LABEL; if (arcAndState2.chain.length != 0) { arcAndState2.chain.ints[arcAndState2.chain.length-1] = arcAndState2.arc.label; } } } } System.out.println("TOT saved " + saved); } */ // Creates a packed FST private FST(INPUT_TYPE inputType, PackedInts.Reader nodeRefToAddress, Outputs<T> outputs) { packed = true; this.inputType = inputType; bytes = new byte[128]; this.nodeRefToAddress = nodeRefToAddress; this.outputs = outputs; NO_OUTPUT = outputs.getNoOutput(); writer = new BytesWriter(); } /** Expert: creates an FST by packing this one. This * process requires substantial additional RAM (currently * up to ~8 bytes per node depending on * <code>acceptableOverheadRatio</code>), but then should * produce a smaller FST. * * <p>The implementation of this method uses ideas from * <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>, * which describes techniques to reduce the size of a FST. * However, this is not a strict implementation of the * algorithms described in this paper. */ public FST<T> pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { // TODO: other things to try // - renumber the nodes to get more next / better locality? // - allow multiple input labels on an arc, so // singular chain of inputs can take one arc (on // wikipedia terms this could save another ~6%) // - in the ord case, the output '1' is presumably // very common (after NO_OUTPUT)... maybe use a bit // for it..? // - use spare bits in flags.... for top few labels / // outputs / targets if (nodeAddress == null) { throw new IllegalArgumentException("this FST was not built with willPackFST=true"); } Arc<T> arc = new Arc<T>(); final BytesReader r = getBytesReader(0); final int topN = Math.min(maxDerefNodes, inCounts.size()); // Find top nodes with highest number of incoming arcs: NodeQueue q = new NodeQueue(topN); // TODO: we could use more RAM efficient selection algo here... NodeAndInCount bottom = null; for(int node=0; node<inCounts.size(); node++) { if (inCounts.get(node) >= minInCountDeref) { if (bottom == null) { q.add(new NodeAndInCount(node, (int) inCounts.get(node))); if (q.size() == topN) { bottom = q.top(); } } else if (inCounts.get(node) > bottom.count) { q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node))); } } } // Free up RAM: inCounts = null; final Map<Integer,Integer> topNodeMap = new HashMap<Integer,Integer>(); for(int downTo=q.size()-1;downTo>=0;downTo--) { NodeAndInCount n = q.pop(); topNodeMap.put(n.node, downTo); //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); } final FST<T> fst = new FST<T>(inputType, null, outputs); final BytesWriter writer = fst.writer; // +1 because node ords start at 1 (0 is reserved as stop node): final GrowableWriter newNodeAddress = new GrowableWriter( PackedInts.bitsRequired(bytes.length), 1 + nodeCount, acceptableOverheadRatio); // Fill initial coarse guess: for(int node=1;node<=nodeCount;node++) { newNodeAddress.set(node, 1 + bytes.length - nodeAddress.get(node)); } int absCount; int deltaCount; int topCount; int nextCount; // Iterate until we converge: while(true) { //System.out.println("\nITER"); boolean changed = false; // for assert: boolean negDelta = false; writer.posWrite = 0; // Skip 0 byte since 0 is reserved target: writer.writeByte((byte) 0); fst.arcWithOutputCount = 0; fst.nodeCount = 0; fst.arcCount = 0; absCount = deltaCount = topCount = nextCount = 0; int changedCount = 0; int addressError = 0; //int totWasted = 0; // Since we re-reverse the bytes, we now write the // nodes backwards, so that BIT_TARGET_NEXT is // unchanged: for(int node=nodeCount;node>=1;node--) { fst.nodeCount++; final int address = writer.posWrite; //System.out.println(" node: " + node + " address=" + address); if (address != newNodeAddress.get(node)) { addressError = address - (int) newNodeAddress.get(node); //System.out.println(" change: " + (address - newNodeAddress[node])); changed = true; newNodeAddress.set(node, address); changedCount++; } int nodeArcCount = 0; int bytesPerArc = 0; boolean retry = false; // for assert: boolean anyNegDelta = false; // Retry loop: possibly iterate more than once, if // this is an array'd node and bytesPerArc changes: writeNode: while(true) { // retry writing this node readFirstRealTargetArc(node, arc, r); final boolean useArcArray = arc.bytesPerArc != 0; if (useArcArray) { // Write false first arc: if (bytesPerArc == 0) { bytesPerArc = arc.bytesPerArc; } writer.writeByte(ARCS_AS_FIXED_ARRAY); writer.writeVInt(arc.numArcs); writer.writeVInt(bytesPerArc); //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); } int maxBytesPerArc = 0; //int wasted = 0; while(true) { // iterate over all arcs for this node //System.out.println(" arc label=" + arc.label + " target=" + arc.target + " pos=" + writer.posWrite); final int arcStartPos = writer.posWrite; nodeArcCount++; byte flags = 0; if (arc.isLast()) { flags += BIT_LAST_ARC; } /* if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) { flags += BIT_TARGET_NEXT; } */ if (!useArcArray && node != 1 && arc.target == node-1) { flags += BIT_TARGET_NEXT; if (!retry) { nextCount++; } } if (arc.isFinal()) { flags += BIT_FINAL_ARC; if (arc.nextFinalOutput != NO_OUTPUT) { flags += BIT_ARC_HAS_FINAL_OUTPUT; } } else { assert arc.nextFinalOutput == NO_OUTPUT; } if (!targetHasArcs(arc)) { flags += BIT_STOP_NODE; } if (arc.output != NO_OUTPUT) { flags += BIT_ARC_HAS_OUTPUT; } final Integer ptr; final int absPtr; final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; if (doWriteTarget) { ptr = topNodeMap.get(arc.target); if (ptr != null) { absPtr = ptr; } else { absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; } int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.posWrite - 2; if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; delta = 0; } if (delta < absPtr) { flags |= BIT_TARGET_DELTA; } } else { ptr = null; absPtr = 0; } writer.writeByte(flags); fst.writeLabel(arc.label); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, writer); if (!retry) { fst.arcWithOutputCount++; } } if (arc.nextFinalOutput != NO_OUTPUT) { outputs.write(arc.nextFinalOutput, writer); } if (doWriteTarget) { int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.posWrite; if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); delta = 0; } if (flag(flags, BIT_TARGET_DELTA)) { //System.out.println(" delta"); writer.writeVInt(delta); if (!retry) { deltaCount++; } } else { /* if (ptr != null) { System.out.println(" deref"); } else { System.out.println(" abs"); } */ writer.writeVInt(absPtr); if (!retry) { if (absPtr >= topNodeMap.size()) { absCount++; } else { topCount++; } } } } if (useArcArray) { final int arcBytes = writer.posWrite - arcStartPos; //System.out.println(" " + arcBytes + " bytes"); maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); // NOTE: this may in fact go "backwards", if // somehow (rarely, possibly never) we use // more bytesPerArc in this rewrite than the // incoming FST did... but in this case we // will retry (below) so it's OK to ovewrite // bytes: //wasted += bytesPerArc - arcBytes; writer.setPosWrite(arcStartPos + bytesPerArc); } if (arc.isLast()) { break; } readNextRealArc(arc, r); } if (useArcArray) { if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) { // converged //System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted); //totWasted += wasted; break; } } else { break; } //System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc); // Retry: bytesPerArc = maxBytesPerArc; writer.posWrite = address; nodeArcCount = 0; retry = true; anyNegDelta = false; } negDelta |= anyNegDelta; fst.arcCount += nodeArcCount; } if (!changed) { // We don't renumber the nodes (just reverse their // order) so nodes should only point forward to // other nodes because we only produce acyclic FSTs // w/ nodes only pointing "forwards": assert !negDelta; //System.out.println("TOT wasted=" + totWasted); // Converged! break; } //System.out.println(" " + changedCount + " of " + fst.nodeCount + " changed; retry"); } long maxAddress = 0; for (int key : topNodeMap.keySet()) { maxAddress = Math.max(maxAddress, newNodeAddress.get(key)); } PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(), PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio); for(Map.Entry<Integer,Integer> ent : topNodeMap.entrySet()) { nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey())); } fst.nodeRefToAddress = nodeRefToAddressIn; fst.startNode = (int) newNodeAddress.get(startNode); //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); if (emptyOutput != null) { fst.setEmptyOutput(emptyOutput); } assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount; assert fst.arcCount == arcCount; assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount; final byte[] finalBytes = new byte[writer.posWrite]; //System.out.println("resize " + fst.bytes.length + " down to " + writer.posWrite); System.arraycopy(fst.bytes, 0, finalBytes, 0, writer.posWrite); fst.bytes = finalBytes; fst.cacheRootArcs(); //final int size = fst.sizeInBytes(); //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount); return fst; } private static class NodeAndInCount implements Comparable<NodeAndInCount> { final int node; final int count; public NodeAndInCount(int node, int count) { this.node = node; this.count = count; } @Override public int compareTo(NodeAndInCount other) { if (count > other.count) { return 1; } else if (count < other.count) { return -1; } else { // Tie-break: smaller node compares as greater than return other.node - node; } } } private static class NodeQueue extends PriorityQueue<NodeAndInCount> { public NodeQueue(int topN) { super(topN, false); } @Override public boolean lessThan(NodeAndInCount a, NodeAndInCount b) { final int cmp = a.compareTo(b); assert cmp != 0; return cmp < 0; } } }