package org.apache.lucene.util.fst;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import java.util.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
/** Static helper methods
*
* @lucene.experimental */
public final class Util {
private Util() {
}
/** Looks up the output for this input, or null if the
* input is not accepted. FST must be
* INPUT_TYPE.BYTE4. */
public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE4;
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else {
return output;
}
}
/** Logically casts input to UTF32 ints then looks up the output
* or null if the input is not accepted. FST must be
* INPUT_TYPE.BYTE4. */
public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE4;
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
int charIdx = offset;
final int charLimit = offset + length;
// Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
while(charIdx < charLimit) {
final int utf32 = Character.codePointAt(input, charIdx);
charIdx += Character.charCount(utf32);
if (fst.findTargetArc(utf32, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else {
return output;
}
}
/** Logically casts input to UTF32 ints then looks up the output
* or null if the input is not accepted. FST must be
* INPUT_TYPE.BYTE4. */
public static<T> T get(FST<T> fst, CharSequence input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE4;
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
int charIdx = 0;
final int charLimit = input.length();
// Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
while(charIdx < charLimit) {
final int utf32 = Character.codePointAt(input, charIdx);
charIdx += Character.charCount(utf32);
if (fst.findTargetArc(utf32, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else {
return output;
}
}
/** Looks up the output for this input, or null if the
* input is not accepted */
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go
final T NO_OUTPUT = fst.outputs.getNoOutput();
T output = NO_OUTPUT;
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output);
}
}
if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return fst.outputs.add(output, arc.output);
} else {
return output;
}
}
/**
* Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
* for visualization. Example of use:
*
* <pre>
* PrintStream ps = new PrintStream("out.dot");
* fst.toDot(ps);
* ps.close();
* </pre>
*
* and then, from command line:
*
* <pre>
* dot -Tpng -o out.png out.dot
* </pre>
*
* <p>
* Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
*
* @param sameRank
* If <code>true</code>, the resulting <code>dot</code> file will try
* to order states in layers of breadth-first traversal. This may
* mess up arcs, but makes the output FST's structure a bit clearer.
*
* @param labelStates
* If <code>true</code> states will have labels equal to their offsets in their
* binary format. Expands the graph considerably.
*
* @see "http://www.graphviz.org/"
*/
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
throws IOException {
final String expandedNodeColor = "blue";
// This is the start arc in the automaton (from the epsilon state to the first state
// with outgoing transitions.
final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
// A queue of transitions to consider for the next level.
final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();
// A queue of transitions to consider when processing the next level.
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
nextLevelQueue.add(startArc);
// A list of states on the same level (for ranking).
final List<Integer> sameLevelStates = new ArrayList<Integer>();
// A bitset of already seen states (target offset).
final BitSet seen = new BitSet();
seen.set(startArc.target);
// Shape for states.
final String stateShape = "circle";
// Emit DOT prologue.
out.write("digraph FST {\n");
out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
if (!labelStates) {
out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
}
emitDotState(out, "initial", "point", "white", "");
emitDotState(out, Integer.toString(startArc.target), stateShape,
fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
"");
out.write(" initial -> " + startArc.target + "\n");
final T NO_OUTPUT = fst.outputs.getNoOutput();
int level = 0;
while (!nextLevelQueue.isEmpty()) {
// we could double buffer here, but it doesn't matter probably.
thisLevelQueue.addAll(nextLevelQueue);
nextLevelQueue.clear();
level++;
out.write("\n // Transitions and states at level: " + level + "\n");
while (!thisLevelQueue.isEmpty()) {
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
if (fst.targetHasArcs(arc)) {
// scan all arcs
final int node = arc.target;
fst.readFirstTargetArc(arc, arc);
while (true) {
// Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get(arc.target)) {
final boolean isExpanded = fst.isExpandedTarget(arc);
emitDotState(out, Integer.toString(arc.target), stateShape,
isExpanded ? expandedNodeColor : null,
labelStates ? Integer.toString(arc.target) : "");
seen.set(arc.target);
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
sameLevelStates.add(arc.target);
}
String outs;
if (arc.output != NO_OUTPUT) {
outs = "/" + fst.outputs.outputToString(arc.output);
} else {
outs = "";
}
final String cl;
if (arc.label == FST.END_LABEL) {
cl = "~";
} else {
cl = printableLabel(arc.label);
}
out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");
// Break the loop if we're on the last arc of this state.
if (arc.isLast()) {
break;
}
fst.readNextArc(arc);
}
}
}
// Emit state ranking information.
if (sameRank && sameLevelStates.size() > 1) {
out.write(" {rank=same; ");
for (int state : sameLevelStates) {
out.write(state + "; ");
}
out.write(" }\n");
}
sameLevelStates.clear();
}
// Emit terminating state (always there anyway).
out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
out.write(" {rank=sink; -1 }\n");
out.write("}\n");
out.flush();
}
/**
* Emit a single state in the <code>dot</code> language.
*/
private static void emitDotState(Writer out, String name, String shape,
String color, String label) throws IOException {
out.write(" " + name
+ " ["
+ (shape != null ? "shape=" + shape : "") + " "
+ (color != null ? "color=" + color : "") + " "
+ (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " "
+ "]\n");
}
/**
* Ensures an arc's label is indeed printable (dot uses US-ASCII).
*/
private static String printableLabel(int label) {
if (label >= 0x20 && label <= 0x7d) {
return Character.toString((char) label);
} else {
return "0x" + Integer.toHexString(label);
}
}
}