/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.codecs.idversion; import java.io.IOException; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs.Pair; final class IDVersionSegmentTermsEnumFrame { // Our index in stack[]: final int ord; boolean hasTerms; boolean hasTermsOrig; boolean isFloor; // static boolean DEBUG = IDVersionSegmentTermsEnum.DEBUG; /** Highest version of any term in this block. */ long maxIDVersion; FST.Arc<Pair<BytesRef,Long>> arc; // File pointer where this block was loaded from long fp; long fpOrig; long fpEnd; byte[] suffixBytes = new byte[128]; final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); byte[] floorData = new byte[32]; final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block int prefix; // Number of entries (term or sub-block) in this block int entCount; // Which term we will next read, or -1 if the block // isn't loaded yet int nextEnt; // True if this block is either not a floor block, // or, it's the last sub-block of a floor block boolean isLastInFloor; // True if all entries are terms boolean isLeafBlock; long lastSubFP; int nextFloorLabel; int numFollowFloorBlocks; // Next term to decode metaData; we decode metaData // lazily so that scanning to find the matching term is // fast and only if you find a match and app wants the // stats or docs/positions enums, will we decode the // metaData int metaDataUpto; final BlockTermState state; // metadata buffer, holding monotonic values public long[] longs; // metadata buffer, holding general values public byte[] bytes; ByteArrayDataInput bytesReader; private final IDVersionSegmentTermsEnum ste; public IDVersionSegmentTermsEnumFrame(IDVersionSegmentTermsEnum ste, int ord) throws IOException { this.ste = ste; this.ord = ord; this.state = ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { final int numBytes = source.length - (in.getPosition() - source.offset); if (numBytes > floorData.length) { floorData = new byte[ArrayUtil.oversize(numBytes, 1)]; } System.arraycopy(source.bytes, source.offset+in.getPosition(), floorData, 0, numBytes); floorDataReader.reset(floorData, 0, numBytes); numFollowFloorBlocks = floorDataReader.readVInt(); nextFloorLabel = floorDataReader.readByte() & 0xff; //if (DEBUG) { //System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel)); //} } public int getTermBlockOrd() { return isLeafBlock ? nextEnt : state.termBlockOrd; } void loadNextFloorBlock() throws IOException { //if (DEBUG) { //System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd); //} assert arc == null || isFloor: "arc=" + arc + " isFloor=" + isFloor; fp = fpEnd; nextEnt = -1; loadBlock(); } /* Does initial decode of next block of terms; this doesn't actually decode the docFreq, totalTermFreq, postings details (frq/prx offset, etc.) metadata; it just loads them as byte[] blobs which are then decoded on-demand if the metadata is ever requested for any term in this block. This enables terms-only intensive consumes (eg certain MTQs, respelling) to not pay the price of decoding metadata they won't use. */ void loadBlock() throws IOException { // Clone the IndexInput lazily, so that consumers // that just pull a TermsEnum to // seekExact(TermState) don't pay this cost: ste.initIndexInput(); if (nextEnt != -1) { // Already loaded return; } //System.out.println("blc=" + blockLoadCount); ste.in.seek(fp); int code = ste.in.readVInt(); entCount = code >>> 1; assert entCount > 0; isLastInFloor = (code & 1) != 0; assert arc == null || (isLastInFloor || isFloor); // TODO: if suffixes were stored in random-access // array structure, then we could do binary search // instead of linear scan to find target term; eg // we could have simple array of offsets // term suffixes: code = ste.in.readVInt(); isLeafBlock = (code & 1) != 0; int numBytes = code >>> 1; if (suffixBytes.length < numBytes) { suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; } ste.in.readBytes(suffixBytes, 0, numBytes); suffixesReader.reset(suffixBytes, 0, numBytes); /*if (DEBUG) { if (arc == null) { System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); } else { System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); } }*/ metaDataUpto = 0; state.termBlockOrd = 0; nextEnt = 0; lastSubFP = -1; // TODO: we could skip this if !hasTerms; but // that's rare so won't help much // metadata numBytes = ste.in.readVInt(); if (bytes == null) { bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; bytesReader = new ByteArrayDataInput(); } else if (bytes.length < numBytes) { bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; } ste.in.readBytes(bytes, 0, numBytes); bytesReader.reset(bytes, 0, numBytes); // Sub-blocks of a single floor block are always // written one after another -- tail recurse: fpEnd = ste.in.getFilePointer(); // if (DEBUG) { // System.out.println(" fpEnd=" + fpEnd); // } } void rewind() { // Force reload: fp = fpOrig; nextEnt = -1; hasTerms = hasTermsOrig; if (isFloor) { floorDataReader.rewind(); numFollowFloorBlocks = floorDataReader.readVInt(); nextFloorLabel = floorDataReader.readByte() & 0xff; } /* //System.out.println("rewind"); // Keeps the block loaded, but rewinds its state: if (nextEnt > 0 || fp != fpOrig) { if (DEBUG) { System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix); } if (fp != fpOrig) { fp = fpOrig; nextEnt = -1; } else { nextEnt = 0; } hasTerms = hasTermsOrig; if (isFloor) { floorDataReader.rewind(); numFollowFloorBlocks = floorDataReader.readVInt(); nextFloorLabel = floorDataReader.readByte() & 0xff; } assert suffixBytes != null; suffixesReader.rewind(); assert statBytes != null; statsReader.rewind(); metaDataUpto = 0; state.termBlockOrd = 0; // TODO: skip this if !hasTerms? Then postings // impl wouldn't have to write useless 0 byte postingsReader.resetTermsBlock(fieldInfo, state); lastSubFP = -1; } else if (DEBUG) { System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord); } */ } public boolean next() { return isLeafBlock ? nextLeaf() : nextNonLeaf(); } // Decodes next entry; returns true if it's a sub-block public boolean nextLeaf() { //if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; nextEnt++; suffix = suffixesReader.readVInt(); startBytePos = suffixesReader.getPosition(); ste.term.setLength(prefix + suffix); ste.term.grow(ste.term.length()); suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); // A normal term ste.termExists = true; return false; } public boolean nextNonLeaf() { // if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + entCount); assert nextEnt != -1 && nextEnt < entCount: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; nextEnt++; final int code = suffixesReader.readVInt(); suffix = code >>> 1; startBytePos = suffixesReader.getPosition(); ste.term.setLength(prefix + suffix); ste.term.grow(ste.term.length()); suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); if ((code & 1) == 0) { // A normal term ste.termExists = true; subCode = 0; state.termBlockOrd++; return false; } else { // A sub-block; make sub-FP absolute: ste.termExists = false; subCode = suffixesReader.readVLong(); lastSubFP = fp - subCode; // if (DEBUG) { // System.out.println(" lastSubFP=" + lastSubFP); // } return true; } } // TODO: make this array'd so we can do bin search? // likely not worth it? need to measure how many // floor blocks we "typically" get public void scanToFloorFrame(BytesRef target) { if (!isFloor || target.length <= prefix) { // if (DEBUG) { // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + target.length + " vs prefix=" + prefix); // } return; } final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; // if (DEBUG) { // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char) targetLabel) + " vs nextFloorLabel=" + ((char) nextFloorLabel) + " numFollowFloorBlocks=" + numFollowFloorBlocks); // } if (targetLabel < nextFloorLabel) { // if (DEBUG) { // System.out.println(" already on correct block"); // } return; } assert numFollowFloorBlocks != 0; long newFP = fpOrig; while (true) { final long code = floorDataReader.readVLong(); newFP = fpOrig + (code >>> 1); hasTerms = (code & 1) != 0; // if (DEBUG) { // System.out.println(" label=" + ((char) nextFloorLabel) + " fp=" + newFP + " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks); // } isLastInFloor = numFollowFloorBlocks == 1; numFollowFloorBlocks--; if (isLastInFloor) { nextFloorLabel = 256; // if (DEBUG) { // System.out.println(" stop! last block nextFloorLabel=" + ((char) nextFloorLabel)); // } break; } else { nextFloorLabel = floorDataReader.readByte() & 0xff; if (targetLabel < nextFloorLabel) { // if (DEBUG) { // System.out.println(" stop! nextFloorLabel=" + ((char) nextFloorLabel)); // } break; } } } if (newFP != fp) { // Force re-load of the block: // if (DEBUG) { // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp); // } nextEnt = -1; fp = newFP; } else { // if (DEBUG) { // System.out.println(" stay on same fp=" + newFP); // } } } public void decodeMetaData() throws IOException { //if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + ste.fr.parent.segment + " mdUpto=" + metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd); assert nextEnt >= 0; // lazily catch up on metadata decode: final int limit = getTermBlockOrd(); boolean absolute = metaDataUpto == 0; // TODO: better API would be "jump straight to term=N"??? while (metaDataUpto < limit) { // TODO: we could make "tiers" of metadata, ie, // decode docFreq/totalTF but don't decode postings // metadata; this way caller could get // docFreq/totalTF w/o paying decode cost for // postings // TODO: if docFreq were bulk decoded we could // just skipN here: // stats state.docFreq = 1; state.totalTermFreq = 1; //if (DEBUG) System.out.println(" dF=" + state.docFreq); // metadata for (int i = 0; i < ste.fr.longsSize; i++) { longs[i] = bytesReader.readVLong(); } ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; } state.termBlockOrd = metaDataUpto; } // Used only by assert private boolean prefixMatches(BytesRef target) { for(int bytePos=0;bytePos<prefix;bytePos++) { if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { return false; } } return true; } // Scans to sub-block that has this target fp; only // called by next(); NOTE: does not set // startBytePos/suffix as a side effect public void scanToSubBlock(long subFP) { assert !isLeafBlock; //if (DEBUG) System.out.println(" scanToSubBlock fp=" + fp + " subFP=" + subFP + " entCount=" + entCount + " lastSubFP=" + lastSubFP); //assert nextEnt == 0; if (lastSubFP == subFP) { //if (DEBUG) System.out.println(" already positioned"); return; } assert subFP < fp : "fp=" + fp + " subFP=" + subFP; final long targetSubCode = fp - subFP; //if (DEBUG) System.out.println(" targetSubCode=" + targetSubCode); while(true) { assert nextEnt < entCount; nextEnt++; final int code = suffixesReader.readVInt(); suffixesReader.skipBytes(isLeafBlock ? code : code >>> 1); //if (DEBUG) System.out.println(" " + nextEnt + " (of " + entCount + ") ent isSubBlock=" + ((code&1)==1)); if ((code & 1) != 0) { final long subCode = suffixesReader.readVLong(); //if (DEBUG) System.out.println(" subCode=" + subCode); if (targetSubCode == subCode) { //if (DEBUG) System.out.println(" match!"); lastSubFP = subFP; return; } } else { state.termBlockOrd++; } } } // NOTE: sets startBytePos/suffix as a side effect public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException { return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly); } private int startBytePos; private int suffix; private long subCode; // Target's prefix matches this block's prefix; we // scan the entries check if the suffix matches. public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException { // if (DEBUG) System.out.println(" scanToTermLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + IDVersionSegmentTermsEnum.brToString(target) + " term=" + IDVersionSegmentTermsEnum.brToString(ste.term)); assert nextEnt != -1; ste.termExists = true; subCode = 0; if (nextEnt == entCount) { if (exactOnly) { fillTerm(); } return SeekStatus.END; } assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: //nextTerm: while(nextEnt < entCount) { nextTerm: while (true) { nextEnt++; suffix = suffixesReader.readVInt(); // if (DEBUG) { // BytesRef suffixBytesRef = new BytesRef(); // suffixBytesRef.bytes = suffixBytes; // suffixBytesRef.offset = suffixesReader.getPosition(); // suffixBytesRef.length = suffix; // System.out.println(" cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + IDVersionSegmentTermsEnum.brToString(suffixBytesRef)); // } final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); suffixesReader.skipBytes(suffix); final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); int targetPos = target.offset + prefix; // Loop over bytes in the suffix, comparing to // the target int bytePos = startBytePos; while(true) { final int cmp; final boolean stop; if (targetPos < targetLimit) { cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF); stop = false; } else { assert targetPos == targetLimit; cmp = termLen - target.length; stop = true; } if (cmp < 0) { // Current entry is still before the target; // keep scanning if (nextEnt == entCount) { if (exactOnly) { fillTerm(); } // We are done scanning this block break nextTerm; } else { continue nextTerm; } } else if (cmp > 0) { // Done! Current entry is after target -- // return NOT_FOUND: fillTerm(); if (!exactOnly && !ste.termExists) { // We are on a sub-block, and caller wants // us to position to the next term after // the target, so we must recurse into the // sub-frame(s): ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); ste.currentFrame.loadBlock(); while (ste.currentFrame.next()) { ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); ste.currentFrame.loadBlock(); } } //if (DEBUG) System.out.println(" not found"); return SeekStatus.NOT_FOUND; } else if (stop) { // Exact match! // This cannot be a sub-block because we // would have followed the index to this // sub-block from the start: assert ste.termExists; fillTerm(); //if (DEBUG) System.out.println(" found!"); return SeekStatus.FOUND; } } } // It is possible (and OK) that terms index pointed us // at this block, but, we scanned the entire block and // did not find the term to position to. This happens // when the target is after the last term in the block // (but, before the next term in the index). EG // target could be foozzz, and terms index pointed us // to the foo* block, but the last term in this block // was fooz (and, eg, first term in the next block will // bee fop). //if (DEBUG) System.out.println(" block end"); if (exactOnly) { fillTerm(); } // TODO: not consistent that in the // not-exact case we don't next() into the next // frame here return SeekStatus.END; } // Target's prefix matches this block's prefix; we // scan the entries check if the suffix matches. public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException { // if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + IDVersionSegmentTermsEnum.brToString(target) + " term=" + IDVersionSegmentTermsEnum.brToString(ste.term)); assert nextEnt != -1; if (nextEnt == entCount) { if (exactOnly) { fillTerm(); ste.termExists = subCode == 0; } return SeekStatus.END; } assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: //nextTerm: while(nextEnt < entCount) { nextTerm: while (true) { nextEnt++; final int code = suffixesReader.readVInt(); suffix = code >>> 1; // if (DEBUG) { // BytesRef suffixBytesRef = new BytesRef(); // suffixBytesRef.bytes = suffixBytes; // suffixBytesRef.offset = suffixesReader.getPosition(); // suffixBytesRef.length = suffix; // System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef)); // } ste.termExists = (code & 1) == 0; final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); suffixesReader.skipBytes(suffix); if (ste.termExists) { state.termBlockOrd++; subCode = 0; } else { subCode = suffixesReader.readVLong(); lastSubFP = fp - subCode; } final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); int targetPos = target.offset + prefix; // Loop over bytes in the suffix, comparing to // the target int bytePos = startBytePos; while(true) { final int cmp; final boolean stop; if (targetPos < targetLimit) { cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF); stop = false; } else { assert targetPos == targetLimit; cmp = termLen - target.length; stop = true; } if (cmp < 0) { // Current entry is still before the target; // keep scanning if (nextEnt == entCount) { if (exactOnly) { fillTerm(); //termExists = true; } // We are done scanning this block break nextTerm; } else { continue nextTerm; } } else if (cmp > 0) { // Done! Current entry is after target -- // return NOT_FOUND: fillTerm(); if (!exactOnly && !ste.termExists) { // We are on a sub-block, and caller wants // us to position to the next term after // the target, so we must recurse into the // sub-frame(s): ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); ste.currentFrame.loadBlock(); while (ste.currentFrame.next()) { ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); ste.currentFrame.loadBlock(); } } //if (DEBUG) System.out.println(" not found"); return SeekStatus.NOT_FOUND; } else if (stop) { // Exact match! // This cannot be a sub-block because we // would have followed the index to this // sub-block from the start: assert ste.termExists; fillTerm(); //if (DEBUG) System.out.println(" found!"); return SeekStatus.FOUND; } } } // It is possible (and OK) that terms index pointed us // at this block, but, we scanned the entire block and // did not find the term to position to. This happens // when the target is after the last term in the block // (but, before the next term in the index). EG // target could be foozzz, and terms index pointed us // to the foo* block, but the last term in this block // was fooz (and, eg, first term in the next block will // bee fop). //if (DEBUG) System.out.println(" block end"); if (exactOnly) { fillTerm(); } // TODO: not consistent that in the // not-exact case we don't next() into the next // frame here return SeekStatus.END; } private void fillTerm() { final int termLength = prefix + suffix; ste.term.setLength(prefix + suffix); ste.term.grow(termLength); System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); } }