/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.codecs.blocktreeords; import java.io.IOException; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.RunAutomaton; import org.apache.lucene.util.fst.FST; // NOTE: cannot seek! final class OrdsIntersectTermsEnum extends TermsEnum { final IndexInput in; private OrdsIntersectTermsEnumFrame[] stack; @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<Output>[] arcs = new FST.Arc[5]; final RunAutomaton runAutomaton; final CompiledAutomaton compiledAutomaton; private OrdsIntersectTermsEnumFrame currentFrame; private final BytesRef term = new BytesRef(); private final FST.BytesReader fstReader; final OrdsFieldReader fr; private BytesRef savedStartTerm; // TODO: in some cases we can filter by length? eg // regexp foo*bar must be at least length 6 bytes public OrdsIntersectTermsEnum(OrdsFieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) throws IOException { // if (DEBUG) { // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef)); // } this.fr = fr; runAutomaton = compiled.runAutomaton; compiledAutomaton = compiled; in = fr.parent.in.clone(); stack = new OrdsIntersectTermsEnumFrame[5]; for(int idx=0;idx<stack.length;idx++) { stack[idx] = new OrdsIntersectTermsEnumFrame(this, idx); } for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) { arcs[arcIdx] = new FST.Arc<>(); } if (fr.index == null) { fstReader = null; } else { fstReader = fr.index.getBytesReader(); } // TODO: if the automaton is "smallish" we really // should use the terms index to seek at least to // the initial term and likely to subsequent terms // (or, maybe just fallback to ATE for such cases). // Else the seek cost of loading the frames will be // too costly. final FST.Arc<Output> arc = fr.index.getFirstArc(arcs[0]); // Empty string prefix must have an output in the index! assert arc.isFinal(); // Special pushFrame since it's the first one: final OrdsIntersectTermsEnumFrame f = stack[0]; f.fp = f.fpOrig = fr.rootBlockFP; f.prefix = 0; f.setState(0); f.arc = arc; f.outputPrefix = arc.output; f.load(fr.rootCode); // for assert: assert setSavedStartTerm(startTerm); currentFrame = f; if (startTerm != null) { seekToStartTerm(startTerm); } } // only for assert: private boolean setSavedStartTerm(BytesRef startTerm) { savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm); return true; } @Override public TermState termState() throws IOException { currentFrame.decodeMetaData(); return currentFrame.termState.clone(); } private OrdsIntersectTermsEnumFrame getFrame(int ord) throws IOException { if (ord >= stack.length) { final OrdsIntersectTermsEnumFrame[] next = new OrdsIntersectTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(stack, 0, next, 0, stack.length); for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) { next[stackOrd] = new OrdsIntersectTermsEnumFrame(this, stackOrd); } stack = next; } assert stack[ord].ord == ord; return stack[ord]; } private FST.Arc<Output> getArc(int ord) { if (ord >= arcs.length) { @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Output>[] next = new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(arcs, 0, next, 0, arcs.length); for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { next[arcOrd] = new FST.Arc<>(); } arcs = next; } return arcs[ord]; } private OrdsIntersectTermsEnumFrame pushFrame(int state) throws IOException { final OrdsIntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord); f.fp = f.fpOrig = currentFrame.lastSubFP; f.prefix = currentFrame.prefix + currentFrame.suffix; // if (DEBUG) System.out.println(" pushFrame state=" + state + " prefix=" + f.prefix); f.setState(state); // Walk the arc through the index -- we only // "bother" with this so we can get the floor data // from the index and skip floor blocks when // possible: FST.Arc<Output> arc = currentFrame.arc; int idx = currentFrame.prefix; assert currentFrame.suffix > 0; Output output = currentFrame.outputPrefix; while (idx < f.prefix) { final int target = term.bytes[idx] & 0xff; // TODO: we could be more efficient for the next() // case by using current arc as starting point, // passed to findTargetArc arc = fr.index.findTargetArc(target, arc, getArc(1+idx), fstReader); assert arc != null; output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output); idx++; } f.arc = arc; f.outputPrefix = output; assert arc.isFinal(); f.load(OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput)); return f; } @Override public BytesRef term() { return term; } // TODO: do we need ord() here? OrdsIntersectTermsEnumFrame tracks termOrd but it may be buggy! @Override public int docFreq() throws IOException { //if (DEBUG) System.out.println("BTIR.docFreq"); currentFrame.decodeMetaData(); //if (DEBUG) System.out.println(" return " + currentFrame.termState.docFreq); return currentFrame.termState.docFreq; } @Override public long totalTermFreq() throws IOException { currentFrame.decodeMetaData(); return currentFrame.termState.totalTermFreq; } @Override public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { currentFrame.decodeMetaData(); return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags); } private int getState() { int state = currentFrame.state; for(int idx=0;idx<currentFrame.suffix;idx++) { state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff); assert state != -1; } return state; } // NOTE: specialized to only doing the first-time // seek, but we could generalize it to allow // arbitrary seekExact/Ceil. Note that this is a // seekFloor! private void seekToStartTerm(BytesRef target) throws IOException { //if (DEBUG) System.out.println("seek to startTerm=" + target.utf8ToString()); assert currentFrame.ord == 0; if (term.length < target.length) { term.bytes = ArrayUtil.grow(term.bytes, target.length); } FST.Arc<Output> arc = arcs[0]; assert arc == currentFrame.arc; for(int idx=0;idx<=target.length;idx++) { while (true) { final int savePos = currentFrame.suffixesReader.getPosition(); final int saveStartBytePos = currentFrame.startBytePos; final int saveSuffix = currentFrame.suffix; final long saveLastSubFP = currentFrame.lastSubFP; final int saveTermBlockOrd = currentFrame.termState.termBlockOrd; final boolean isSubBlock = currentFrame.next(); //if (DEBUG) System.out.println(" cycle ent=" + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") prefix=" + currentFrame.prefix + " suffix=" + currentFrame.suffix + " isBlock=" + isSubBlock + " firstLabel=" + (currentFrame.suffix == 0 ? "" : (currentFrame.suffixBytes[currentFrame.startBytePos])&0xff)); term.length = currentFrame.prefix + currentFrame.suffix; if (term.bytes.length < term.length) { term.bytes = ArrayUtil.grow(term.bytes, term.length); } System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); if (isSubBlock && StringHelper.startsWith(target, term)) { // Recurse //if (DEBUG) System.out.println(" recurse!"); currentFrame = pushFrame(getState()); break; } else { final int cmp = term.compareTo(target); if (cmp < 0) { if (currentFrame.nextEnt == currentFrame.entCount) { if (!currentFrame.isLastInFloor) { //if (DEBUG) System.out.println(" load floorBlock"); currentFrame.loadNextFloorBlock(); continue; } else { //if (DEBUG) System.out.println(" return term=" + brToString(term)); return; } } continue; } else if (cmp == 0) { //if (DEBUG) System.out.println(" return term=" + brToString(term)); return; } else { // Fallback to prior entry: the semantics of // this method is that the first call to // next() will return the term after the // requested term currentFrame.nextEnt--; currentFrame.lastSubFP = saveLastSubFP; currentFrame.startBytePos = saveStartBytePos; currentFrame.suffix = saveSuffix; currentFrame.suffixesReader.setPosition(savePos); currentFrame.termState.termBlockOrd = saveTermBlockOrd; System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); term.length = currentFrame.prefix + currentFrame.suffix; // If the last entry was a block we don't // need to bother recursing and pushing to // the last term under it because the first // next() will simply skip the frame anyway return; } } } } assert false; } @Override public BytesRef next() throws IOException { // if (DEBUG) { // System.out.println("\nintEnum.next seg=" + segment); // System.out.println(" frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); // } nextTerm: while(true) { // Pop finished frames while (currentFrame.nextEnt == currentFrame.entCount) { if (!currentFrame.isLastInFloor) { //if (DEBUG) System.out.println(" next-floor-block"); currentFrame.loadNextFloorBlock(); //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); } else { //if (DEBUG) System.out.println(" pop frame"); if (currentFrame.ord == 0) { return null; } final long lastFP = currentFrame.fpOrig; currentFrame = stack[currentFrame.ord-1]; assert currentFrame.lastSubFP == lastFP; //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); } } final boolean isSubBlock = currentFrame.next(); // if (DEBUG) { // final BytesRef suffixRef = new BytesRef(); // suffixRef.bytes = currentFrame.suffixBytes; // suffixRef.offset = currentFrame.startBytePos; // suffixRef.length = currentFrame.suffix; // System.out.println(" " + (isSubBlock ? "sub-block" : "term") + " " + currentFrame.nextEnt + " (of " + currentFrame.entCount + ") suffix=" + brToString(suffixRef)); // } if (currentFrame.suffix != 0) { final int label = currentFrame.suffixBytes[currentFrame.startBytePos] & 0xff; while (label > currentFrame.curTransitionMax) { if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) { // Stop processing this frame -- no further // matches are possible because we've moved // beyond what the max transition will allow //if (DEBUG) System.out.println(" break: trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex])); // sneaky! forces a pop above currentFrame.isLastInFloor = true; currentFrame.nextEnt = currentFrame.entCount; continue nextTerm; } currentFrame.transitionIndex++; compiledAutomaton.automaton.getNextTransition(currentFrame.transition); currentFrame.curTransitionMax = currentFrame.transition.max; //if (DEBUG) System.out.println(" next trans=" + currentFrame.transitions[currentFrame.transitionIndex]); } } // First test the common suffix, if set: if (compiledAutomaton.commonSuffixRef != null && !isSubBlock) { final int termLen = currentFrame.prefix + currentFrame.suffix; if (termLen < compiledAutomaton.commonSuffixRef.length) { // No match // if (DEBUG) { // System.out.println(" skip: common suffix length"); // } continue nextTerm; } final byte[] suffixBytes = currentFrame.suffixBytes; final byte[] commonSuffixBytes = compiledAutomaton.commonSuffixRef.bytes; final int lenInPrefix = compiledAutomaton.commonSuffixRef.length - currentFrame.suffix; assert compiledAutomaton.commonSuffixRef.offset == 0; int suffixBytesPos; int commonSuffixBytesPos = 0; if (lenInPrefix > 0) { // A prefix of the common suffix overlaps with // the suffix of the block prefix so we first // test whether the prefix part matches: final byte[] termBytes = term.bytes; int termBytesPos = currentFrame.prefix - lenInPrefix; assert termBytesPos >= 0; final int termBytesPosEnd = currentFrame.prefix; while (termBytesPos < termBytesPosEnd) { if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { // if (DEBUG) { // System.out.println(" skip: common suffix mismatch (in prefix)"); // } continue nextTerm; } } suffixBytesPos = currentFrame.startBytePos; } else { suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - compiledAutomaton.commonSuffixRef.length; } // Test overlapping suffix part: final int commonSuffixBytesPosEnd = compiledAutomaton.commonSuffixRef.length; while (commonSuffixBytesPos < commonSuffixBytesPosEnd) { if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { // if (DEBUG) { // System.out.println(" skip: common suffix mismatch"); // } continue nextTerm; } } } // TODO: maybe we should do the same linear test // that AutomatonTermsEnum does, so that if we // reach a part of the automaton where .* is // "temporarily" accepted, we just blindly .next() // until the limit // See if the term prefix matches the automaton: int state = currentFrame.state; for (int idx=0;idx<currentFrame.suffix;idx++) { state = runAutomaton.step(state, currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff); if (state == -1) { // No match //System.out.println(" no s=" + state); continue nextTerm; } else { //System.out.println(" c s=" + state); } } if (isSubBlock) { // Match! Recurse: //if (DEBUG) System.out.println(" sub-block match to state=" + state + "; recurse fp=" + currentFrame.lastSubFP); copyTerm(); currentFrame = pushFrame(state); //if (DEBUG) System.out.println("\n frame ord=" + currentFrame.ord + " prefix=" + brToString(new BytesRef(term.bytes, term.offset, currentFrame.prefix)) + " state=" + currentFrame.state + " lastInFloor?=" + currentFrame.isLastInFloor + " fp=" + currentFrame.fp + " trans=" + (currentFrame.transitions.length == 0 ? "n/a" : currentFrame.transitions[currentFrame.transitionIndex]) + " outputPrefix=" + currentFrame.outputPrefix); } else if (runAutomaton.isAccept(state)) { copyTerm(); //if (DEBUG) System.out.println(" term match to state=" + state + "; return term=" + brToString(term)); assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString(); return term; } else { //System.out.println(" no s=" + state); } } } private void copyTerm() { //System.out.println(" copyTerm cur.prefix=" + currentFrame.prefix + " cur.suffix=" + currentFrame.suffix + " first=" + (char) currentFrame.suffixBytes[currentFrame.startBytePos]); final int len = currentFrame.prefix + currentFrame.suffix; if (term.bytes.length < len) { term.bytes = ArrayUtil.grow(term.bytes, len); } System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); term.length = len; } @Override public boolean seekExact(BytesRef text) { throw new UnsupportedOperationException(); } @Override public void seekExact(long ord) { throw new UnsupportedOperationException(); } @Override public long ord() { throw new UnsupportedOperationException(); } @Override public SeekStatus seekCeil(BytesRef text) { throw new UnsupportedOperationException(); } }