package org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Comparator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; /** * A FilteredTermsEnum that enumerates terms based upon what is accepted by a * DFA. * <p> * The algorithm is such: * <ol> * <li>As long as matches are successful, keep reading sequentially. * <li>When a match fails, skip to the next string in lexicographic order that * does not enter a reject state. * </ol> * <p> * The algorithm does not attempt to actually skip to the next string that is * completely accepted. This is not possible when the language accepted by the * FSM is not finite (i.e. * operator). * </p> * @lucene.experimental */ public class AutomatonTermsEnum extends FilteredTermsEnum { // the object-oriented form of the DFA private final Automaton automaton; // a tableized array-based form of the DFA private final ByteRunAutomaton runAutomaton; // common suffix of the automaton private final BytesRef commonSuffixRef; // true if the automaton accepts a finite language private final boolean finite; // array of sorted transitions for each state, indexed by state number private final Transition[][] allTransitions; // for path tracking: each long records gen when we last // visited the state; we use gens to avoid having to clear private final long[] visited; private long curGen; // the reference used for seeking forwards through the term dictionary private final BytesRef seekBytesRef = new BytesRef(10); // true if we are enumerating an infinite portion of the DFA. // in this case it is faster to drive the query based on the terms dictionary. // when this is true, linearUpperBound indicate the end of range // of terms where we should simply do sequential reads instead. private boolean linear = false; private final BytesRef linearUpperBound = new BytesRef(10); private final Comparator<BytesRef> termComp; /** * Expert ctor: * Construct an enumerator based upon an automaton, enumerating the specified * field, working on a supplied reader. * <p> * @lucene.internal Use the public ctor instead. * <p> * @param runAutomaton pre-compiled ByteRunAutomaton * @param finite true if the automaton accepts a finite language */ AutomatonTermsEnum(ByteRunAutomaton runAutomaton, String field, IndexReader reader, boolean finite, BytesRef commonSuffixRef) throws IOException { super(reader, field); this.automaton = runAutomaton.getAutomaton(); this.finite = finite; this.runAutomaton = runAutomaton; if (finite) { // don't use suffix w/ finite DFAs this.commonSuffixRef = null; } else if (commonSuffixRef == null) { // compute now this.commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(automaton); } else { // precomputed this.commonSuffixRef = commonSuffixRef; } // build a cache of sorted transitions for every state allTransitions = new Transition[runAutomaton.getSize()][]; for (State state : this.automaton.getNumberedStates()) { state.sortTransitions(Transition.CompareByMinMaxThenDest); state.trimTransitionsArray(); allTransitions[state.getNumber()] = state.transitionsArray; } // used for path tracking, where each bit is a numbered state. visited = new long[runAutomaton.getSize()]; setUseTermsCache(finite); termComp = getComparator(); } /** * Construct an enumerator based upon an automaton, enumerating the specified * field, working on a supplied reader. * <p> * It will automatically calculate whether or not the automaton is finite */ public AutomatonTermsEnum(Automaton automaton, String field, IndexReader reader) throws IOException { this(new ByteRunAutomaton(automaton), field, reader, SpecialOperations.isFinite(automaton), null); } /** * Returns true if the term matches the automaton. Also stashes away the term * to assist with smart enumeration. */ @Override protected AcceptStatus accept(final BytesRef term) { if (commonSuffixRef == null || term.endsWith(commonSuffixRef)) { if (runAutomaton.run(term.bytes, term.offset, term.length)) return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; else return (linear && termComp.compare(term, linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; } else { return (linear && termComp.compare(term, linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; } } @Override protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { if (term == null) { seekBytesRef.copy(""); // return the empty term, as its valid if (runAutomaton.run(seekBytesRef.bytes, seekBytesRef.offset, seekBytesRef.length)) { return seekBytesRef; } } else { seekBytesRef.copy(term); } // seek to the next possible string; if (nextString()) { // reposition if (linear) setLinear(infinitePosition); return seekBytesRef; } // no more possible strings can match return null; } // this instance prevents unicode conversion during backtracking, // we can just call setLinear once at the end. int infinitePosition; /** * Sets the enum to operate in linear fashion, as we have found * a looping transition at position */ private void setLinear(int position) { int state = runAutomaton.getInitialState(); int maxInterval = 0xef; for (int i = 0; i < position; i++) { state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff); assert state >= 0: "state=" + state; } for (int i = 0; i < allTransitions[state].length; i++) { Transition t = allTransitions[state][i]; if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { maxInterval = t.getMax(); break; } } // 0xff terms don't get the optimization... not worth the trouble. if (maxInterval != 0xff) maxInterval = incrementUTF8(maxInterval); int length = position + 1; /* position + maxTransition */ if (linearUpperBound.bytes.length < length) linearUpperBound.bytes = new byte[length]; System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position); linearUpperBound.bytes[position] = (byte) maxInterval; linearUpperBound.length = length; } /** * Increments the utf16 buffer to the next String in lexicographic order after s that will not put * the machine into a reject state. If such a string does not exist, returns * false. * * The correctness of this method depends upon the automaton being deterministic, * and having no transitions to dead states. * * @return true if more possible solutions exist for the DFA */ private boolean nextString() { int state; int pos = 0; while (true) { curGen++; linear = false; state = runAutomaton.getInitialState(); // walk the automaton until a character is rejected. for (pos = 0; pos < seekBytesRef.length; pos++) { visited[state] = curGen; int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff); if (nextState == -1) break; // we found a loop, record it for faster enumeration if (!finite && !linear && visited[nextState] == curGen) { linear = true; infinitePosition = pos; } state = nextState; } // take the useful portion, and the last non-reject state, and attempt to // append characters that will match. if (nextString(state, pos)) { return true; } else { /* no more solutions exist from this useful portion, backtrack */ if (!backtrack(pos)) /* no more solutions at all */ return false; else if (runAutomaton.run(seekBytesRef.bytes, 0, seekBytesRef.length)) /* String is good to go as-is */ return true; /* else advance further */ } } } /** * Returns the next String in lexicographic order that will not put * the machine into a reject state. * * This method traverses the DFA from the given position in the String, * starting at the given state. * * If this cannot satisfy the machine, returns false. This method will * walk the minimal path, in lexicographic order, as long as possible. * * If this method returns false, then there might still be more solutions, * it is necessary to backtrack to find out. * * @param state current non-reject state * @param position useful portion of the string * @return true if more possible solutions exist for the DFA from this * position */ private boolean nextString(int state, int position) { /* * the next lexicographic character must be greater than the existing * character, if it exists. */ int c = 0; if (position < seekBytesRef.length) { c = seekBytesRef.bytes[position] & 0xff; // if the next character is U+FFFF and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. c = incrementUTF8(c); if (c == -1) return false; } seekBytesRef.length = position; visited[state] = curGen; Transition transitions[] = allTransitions[state]; // find the minimal path (lexicographic order) that is >= c for (int i = 0; i < transitions.length; i++) { Transition transition = transitions[i]; if (transition.getMax() >= c) { int nextChar = Math.max(c, transition.getMin()); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar; state = transition.getDest().getNumber(); /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. */ while (visited[state] != curGen && !runAutomaton.isAccept(state)) { visited[state] = curGen; /* * Note: we work with a DFA with no transitions to dead states. * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. */ transition = allTransitions[state][0]; state = transition.getDest().getNumber(); // we found a loop, record it for faster enumeration if (!finite && !linear && visited[state] == curGen) { linear = true; infinitePosition = seekBytesRef.length; } // append the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin(); } return true; } } return false; } /** * Attempts to backtrack thru the string after encountering a dead end * at some given position. Returns false if no more possible strings * can match. * * @param position current position in the input String * @return true if more possible solutions exist for the DFA */ private boolean backtrack(int position) { while (position > 0) { int nextChar = seekBytesRef.bytes[position - 1] & 0xff; // if a character is 0xff its a dead-end too, // because there is no higher character in UTF-8 sort order. nextChar = incrementUTF8(nextChar); if (nextChar != -1) { seekBytesRef.bytes[position - 1] = (byte) nextChar; seekBytesRef.length = position; return true; } position--; } return false; /* all solutions exhausted */ } /* return the next utf8 byte in utf8 order, or -1 if exhausted */ private final int incrementUTF8(int utf8) { switch(utf8) { case 0xff: return -1; default: return utf8 + 1; } } }