package org.apache.lucene.util.automaton;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.PrefixTermsEnum;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.util.BytesRef;

/**
 * Immutable class holding compiled details for a given
 * Automaton. The Automaton is deterministic, must not have
 * dead states but is not necessarily minimal.
 *
 * @lucene.experimental
 */
public class CompiledAutomaton {
  /**
   * Automata are compiled into different internal forms for the
   * most efficient execution depending upon the language they accept.
   */
  public enum AUTOMATON_TYPE {
    /** Automaton that accepts no strings. */
    NONE,
    /** Automaton that accepts all possible strings. */
    ALL,
    /** Automaton that accepts only a single fixed string. */
    SINGLE,
    /** Automaton that matches all Strings with a constant prefix. */
    PREFIX,
    /** Catch-all for any other automata. */
    NORMAL
  }

  /** The internal form this automaton was compiled into. */
  public final AUTOMATON_TYPE type;

  /**
   * For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term;
   * for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
   */
  public final BytesRef term;

  /**
   * Matcher for quickly determining if a byte[] is accepted.
   * only valid for {@link AUTOMATON_TYPE#NORMAL}.
   */
  public final ByteRunAutomaton runAutomaton;

  // TODO: would be nice if these sortedTransitions had "int
  // to;" instead of "State to;" somehow:
  /**
   * Two dimensional array of transitions, indexed by state
   * number for traversal. The state numbering is consistent with
   * {@link #runAutomaton}.
   * Only valid for {@link AUTOMATON_TYPE#NORMAL}.
   */
  public final Transition[][] sortedTransitions;

  /**
   * Shared common suffix accepted by the automaton. Only valid
   * for {@link AUTOMATON_TYPE#NORMAL}, and only when the
   * automaton accepts an infinite language.
   */
  public final BytesRef commonSuffixRef;

  /**
   * Indicates if the automaton accepts a finite set of strings.
   * Null if this was not computed.
   * Only valid for {@link AUTOMATON_TYPE#NORMAL}.
   */
  public final Boolean finite;

  /**
   * Compiles the automaton with simplification enabled and with
   * finiteness computed on demand; equivalent to
   * {@code new CompiledAutomaton(automaton, null, true)}.
   *
   * @param automaton the automaton to compile
   */
  public CompiledAutomaton(Automaton automaton) {
    this(automaton, null, true);
  }

  /**
   * Compiles the automaton.
   *
   * @param automaton the automaton to compile
   * @param finite whether the automaton accepts a finite language, or
   *        null to have this computed when the {@link AUTOMATON_TYPE#NORMAL}
   *        form is built
   * @param simplify if true, first test whether the automaton is one of
   *        the simple forms ({@link AUTOMATON_TYPE#NONE},
   *        {@link AUTOMATON_TYPE#ALL}, {@link AUTOMATON_TYPE#SINGLE},
   *        {@link AUTOMATON_TYPE#PREFIX}); if false, always build the
   *        {@link AUTOMATON_TYPE#NORMAL} form
   */
  public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {

    if (simplify) {
      // Test whether the automaton is a "simple" form and
      // if so, don't create a runAutomaton. Note that on a
      // large automaton these tests could be costly:
      if (BasicOperations.isEmpty(automaton)) {
        // matches nothing
        type = AUTOMATON_TYPE.NONE;
        term = null;
        commonSuffixRef = null;
        runAutomaton = null;
        sortedTransitions = null;
        this.finite = null;
        return;
      } else if (BasicOperations.isTotal(automaton)) {
        // matches all possible strings
        type = AUTOMATON_TYPE.ALL;
        term = null;
        commonSuffixRef = null;
        runAutomaton = null;
        sortedTransitions = null;
        this.finite = null;
        return;
      } else {
        final String commonPrefix;
        final String singleton;
        if (automaton.getSingleton() == null) {
          // Expanded representation: it may still accept exactly one
          // string, namely its own common prefix.
          commonPrefix = SpecialOperations.getCommonPrefix(automaton);
          if (commonPrefix.length() > 0
              && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
            singleton = commonPrefix;
          } else {
            singleton = null;
          }
        } else {
          // commonPrefix is never read on this path because singleton
          // is non-null and the SINGLE branch below returns.
          commonPrefix = null;
          singleton = automaton.getSingleton();
        }

        if (singleton != null) {
          // matches a fixed string in singleton or expanded
          // representation
          type = AUTOMATON_TYPE.SINGLE;
          term = new BytesRef(singleton);
          commonSuffixRef = null;
          runAutomaton = null;
          sortedTransitions = null;
          this.finite = null;
          return;
        } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
            BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
          // matches a constant prefix
          type = AUTOMATON_TYPE.PREFIX;
          term = new BytesRef(commonPrefix);
          commonSuffixRef = null;
          runAutomaton = null;
          sortedTransitions = null;
          this.finite = null;
          return;
        }
      }
    }

    type = AUTOMATON_TYPE.NORMAL;
    term = null;
    if (finite == null) {
      this.finite = SpecialOperations.isFinite(automaton);
    } else {
      this.finite = finite;
    }
    Automaton utf8 = new UTF32ToUTF8().convert(automaton);
    if (this.finite) {
      commonSuffixRef = null;
    } else {
      commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
    }
    runAutomaton = new ByteRunAutomaton(utf8, true);
    sortedTransitions = utf8.getSortedTransitions();
  }

  /**
   * Completes a floor term: at position {@code idx} appends the largest
   * label below {@code leadLabel} that leaves {@code state}, then keeps
   * following the last (largest) transition of each state until a state
   * with no outgoing transitions is reached.
   *
   * @param state state whose outgoing transitions are searched
   * @param term buffer holding the already-matched prefix in
   *        {@code bytes[0..idx)}; it is extended in place
   * @param idx position in {@code term.bytes} of the next byte to write
   * @param leadLabel exclusive upper bound for the first appended label
   * @return {@code term}, with its length set to the completed term
   */
  private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {

    // Find biggest transition that's < label
    // TODO: use binary search here
    Transition maxTransition = null;
    for (Transition transition : sortedTransitions[state]) {
      if (transition.min < leadLabel) {
        maxTransition = transition;
      }
    }

    assert maxTransition != null;

    // Append floorLabel: the top of the chosen range, capped just below
    // leadLabel
    final int floorLabel = Math.min(maxTransition.max, leadLabel - 1);
    if (idx >= term.bytes.length) {
      term.grow(1 + idx);
    }
    term.bytes[idx] = (byte) floorLabel;

    state = maxTransition.to.getNumber();
    idx++;

    // Push down to last accept state
    while (true) {
      Transition[] transitions = sortedTransitions[state];
      if (transitions.length == 0) {
        assert runAutomaton.isAccept(state);
        term.length = idx;
        return term;
      } else {
        // We are pushing "top" -- so get last label of
        // last transition:
        Transition lastTransition = transitions[transitions.length - 1];
        if (idx >= term.bytes.length) {
          term.grow(1 + idx);
        }
        term.bytes[idx] = (byte) lastTransition.max;
        state = lastTransition.to.getNumber();
        idx++;
      }
    }
  }

  // TODO: should this take startTerm too? This way
  // Terms.intersect could forward to this method if type !=
  // NORMAL:
  /**
   * Returns a {@link TermsEnum} over the terms of {@code terms} selected
   * according to this automaton's {@link #type}.
   *
   * @param terms the terms to enumerate
   * @return {@link TermsEnum#EMPTY} for {@link AUTOMATON_TYPE#NONE},
   *         otherwise an enum obtained from {@code terms}
   * @throws IOException if obtaining the underlying enum fails
   */
  public TermsEnum getTermsEnum(Terms terms) throws IOException {
    switch (type) {
      case NONE:
        return TermsEnum.EMPTY;
      case ALL:
        return terms.iterator(null);
      case SINGLE:
        return new SingleTermsEnum(terms.iterator(null), term);
      case PREFIX:
        // TODO: this is very likely faster than .intersect,
        // but we should test and maybe cutover
        return new PrefixTermsEnum(terms.iterator(null), term);
      case NORMAL:
        return terms.intersect(this, null);
      default:
        // unreachable
        throw new RuntimeException("unhandled case");
    }
  }

  /** Finds largest term accepted by this Automaton, that's
   *  &lt;= the provided input term. The result is placed in
   *  output; it's fine for output and input to point to
   *  the same BytesRef. The returned result is either the
   *  provided output, or null if there is no floor term
   *  (ie, the provided input term is before the first term
   *  accepted by this Automaton).
   *
   *  <p>This reads {@link #runAutomaton} and
   *  {@link #sortedTransitions}, which are only set for
   *  {@link AUTOMATON_TYPE#NORMAL}. The output's offset is
   *  always reset to 0, even when null is returned.
   *
   *  @param input the term to find the floor of; its bytes
   *         are not modified unless it shares its backing
   *         array with output
   *  @param output receives the floor term starting at offset 0
   *  @return output, or null if there is no floor term
   */
  public BytesRef floor(BytesRef input, BytesRef output) {

    // Snapshot the input before touching output. input and output may
    // be the same BytesRef, in which case resetting output.offset below
    // would also reset input.offset and we would read the wrong bytes.
    final int inputLength = input.length;
    byte[] inputBytes = input.bytes;
    int inputOffset = input.offset;
    if (inputOffset != 0 && inputBytes == output.bytes) {
      // Output is written from offset 0 of the shared array and would
      // overwrite input bytes that the pop loop may still re-read, so
      // work from a private copy of the input slice.
      final byte[] copy = new byte[inputLength];
      System.arraycopy(inputBytes, inputOffset, copy, 0, inputLength);
      inputBytes = copy;
      inputOffset = 0;
    }

    output.offset = 0;

    int state = runAutomaton.getInitialState();

    // Special case empty string:
    if (inputLength == 0) {
      if (runAutomaton.isAccept(state)) {
        output.length = 0;
        return output;
      } else {
        return null;
      }
    }

    // States visited so far; stack.get(i) is the state before consuming
    // input byte i.
    final List<Integer> stack = new ArrayList<>();

    int idx = 0;
    while (true) {
      int label = inputBytes[inputOffset + idx] & 0xff;
      int nextState = runAutomaton.step(state, label);

      if (idx == inputLength - 1) {
        if (nextState != -1 && runAutomaton.isAccept(nextState)) {
          // Input string is accepted
          if (idx >= output.bytes.length) {
            output.grow(1 + idx);
          }
          output.bytes[idx] = (byte) label;
          output.length = inputLength;
          return output;
        } else {
          // Whole input consumed without acceptance: treat it as a
          // mismatch so we fall back to a smaller term below.
          nextState = -1;
        }
      }

      if (nextState == -1) {

        // Pop back to a state that has a transition
        // <= our label:
        while (true) {
          Transition[] transitions = sortedTransitions[state];
          if (transitions.length == 0) {
            assert runAutomaton.isAccept(state);
            output.length = idx;
            return output;
          } else if (label - 1 < transitions[0].min) {

            if (runAutomaton.isAccept(state)) {
              output.length = idx;
              return output;
            }
            // pop
            if (stack.size() == 0) {
              return null;
            } else {
              state = stack.remove(stack.size() - 1);
              idx--;
              label = inputBytes[inputOffset + idx] & 0xff;
            }

          } else {
            break;
          }
        }

        return addTail(state, output, idx, label);

      } else {
        if (idx >= output.bytes.length) {
          output.grow(1 + idx);
        }
        output.bytes[idx] = (byte) label;
        stack.add(state);
        state = nextState;
        idx++;
      }
    }
  }

  /**
   * Renders the compiled automaton as a Graphviz "digraph" description,
   * with one node per state and the edges appended by
   * {@link Transition#appendDot}.
   *
   * <p>This reads {@link #runAutomaton} and {@link #sortedTransitions},
   * which are only set for {@link AUTOMATON_TYPE#NORMAL}.
   *
   * @return the dot source text
   */
  public String toDot() {
    StringBuilder b = new StringBuilder("digraph CompiledAutomaton {\n");
    b.append(" rankdir = LR;\n");
    int initial = runAutomaton.getInitialState();
    for (int i = 0; i < sortedTransitions.length; i++) {
      b.append(" ").append(i);
      if (runAutomaton.isAccept(i)) {
        b.append(" [shape=doublecircle,label=\"\"];\n");
      } else {
        b.append(" [shape=circle,label=\"\"];\n");
      }
      if (i == initial) {
        b.append(" initial [shape=plaintext,label=\"\"];\n");
        b.append(" initial -> ").append(i).append("\n");
      }
      for (int j = 0; j < sortedTransitions[i].length; j++) {
        b.append(" ").append(i);
        sortedTransitions[i][j].appendDot(b);
      }
    }
    return b.append("}\n").toString();
  }
}