package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; import org.apache.lucene.util.packed.PackedInts; /** * {@link DocIdSet} implementation based on pfor-delta encoding. * <p>This implementation is inspired from LinkedIn's Kamikaze * (http://data.linkedin.com/opensource/kamikaze) and Daniel Lemire's JavaFastPFOR * (https://github.com/lemire/JavaFastPFOR).</p> * <p>On the contrary to the original PFOR paper, exceptions are encoded with * FOR instead of Simple16.</p> */ public final class PForDeltaDocIdSet extends DocIdSet { static final int BLOCK_SIZE = 128; static final int MAX_EXCEPTIONS = 24; // no more than 24 exceptions per block static final PackedInts.Decoder[] DECODERS = new PackedInts.Decoder[32]; static final int[] ITERATIONS = new int[32]; static final int[] BYTE_BLOCK_COUNTS = new int[32]; static final int MAX_BYTE_BLOCK_COUNT; static final MonotonicAppendingLongBuffer SINGLE_ZERO_BUFFER = new MonotonicAppendingLongBuffer(0, 64, PackedInts.COMPACT); static final PForDeltaDocIdSet EMPTY = new PForDeltaDocIdSet(null, 0, Integer.MAX_VALUE, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER); static final int LAST_BLOCK = 1 << 5; // flag to indicate the last block static final int HAS_EXCEPTIONS = 1 << 6; static final int UNARY = 1 << 7; static { SINGLE_ZERO_BUFFER.add(0); SINGLE_ZERO_BUFFER.freeze(); int maxByteBLockCount = 0; for (int i = 1; i < ITERATIONS.length; ++i) { DECODERS[i] = PackedInts.getDecoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, i); assert BLOCK_SIZE % DECODERS[i].byteValueCount() == 0; ITERATIONS[i] = BLOCK_SIZE / DECODERS[i].byteValueCount(); BYTE_BLOCK_COUNTS[i] = ITERATIONS[i] * DECODERS[i].byteBlockCount(); maxByteBLockCount = Math.max(maxByteBLockCount, DECODERS[i].byteBlockCount()); } MAX_BYTE_BLOCK_COUNT = maxByteBLockCount; } /** A builder for {@link PForDeltaDocIdSet}. */ public static class Builder { final GrowableByteArrayDataOutput data; final int[] buffer = new int[BLOCK_SIZE]; final int[] exceptionIndices = new int[BLOCK_SIZE]; final int[] exceptions = new int[BLOCK_SIZE]; int bufferSize; int previousDoc; int cardinality; int indexInterval; int numBlocks; // temporary variables used when compressing blocks final int[] freqs = new int[32]; int bitsPerValue; int numExceptions; int bitsPerException; /** Sole constructor. */ public Builder() { data = new GrowableByteArrayDataOutput(128); bufferSize = 0; previousDoc = -1; indexInterval = 2; cardinality = 0; numBlocks = 0; } /** Set the index interval. Every <code>indexInterval</code>-th block will * be stored in the index. Set to {@link Integer#MAX_VALUE} to disable indexing. */ public Builder setIndexInterval(int indexInterval) { if (indexInterval < 1) { throw new IllegalArgumentException("indexInterval must be >= 1"); } this.indexInterval = indexInterval; return this; } /** Add a document to this builder. Documents must be added in order. */ public Builder add(int doc) { if (doc <= previousDoc) { throw new IllegalArgumentException("Doc IDs must be provided in order, but previousDoc=" + previousDoc + " and doc=" + doc); } buffer[bufferSize++] = doc - previousDoc - 1; if (bufferSize == BLOCK_SIZE) { encodeBlock(); bufferSize = 0; } previousDoc = doc; ++cardinality; return this; } /** Convenience method to add the content of a {@link DocIdSetIterator} to this builder. */ public Builder add(DocIdSetIterator it) throws IOException { for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { add(doc); } return this; } void computeFreqs() { Arrays.fill(freqs, 0); for (int i = 0; i < bufferSize; ++i) { ++freqs[32 - Integer.numberOfLeadingZeros(buffer[i])]; } } int pforBlockSize(int bitsPerValue, int numExceptions, int bitsPerException) { final PackedInts.Format format = PackedInts.Format.PACKED; long blockSize = 1 // header: number of bits per value + format.byteCount(PackedInts.VERSION_CURRENT, BLOCK_SIZE, bitsPerValue); if (numExceptions > 0) { blockSize += 2 // 2 additional bytes in case of exceptions: numExceptions and bitsPerException + numExceptions // indices of the exceptions + format.byteCount(PackedInts.VERSION_CURRENT, numExceptions, bitsPerException); } if (bufferSize < BLOCK_SIZE) { blockSize += 1; // length of the block } return (int) blockSize; } int unaryBlockSize() { int deltaSum = 0; for (int i = 0; i < BLOCK_SIZE; ++i) { deltaSum += 1 + buffer[i]; } int blockSize = (deltaSum + 0x07) >>> 3; // round to the next byte ++blockSize; // header if (bufferSize < BLOCK_SIZE) { blockSize += 1; // length of the block } return blockSize; } int computeOptimalNumberOfBits() { computeFreqs(); bitsPerValue = 31; numExceptions = 0; while (bitsPerValue > 0 && freqs[bitsPerValue] == 0) { --bitsPerValue; } final int actualBitsPerValue = bitsPerValue; int blockSize = pforBlockSize(bitsPerValue, numExceptions, bitsPerException); // Now try different values for bitsPerValue and pick the best one for (int bitsPerValue = this.bitsPerValue - 1, numExceptions = freqs[this.bitsPerValue]; bitsPerValue >= 0 && numExceptions <= MAX_EXCEPTIONS; numExceptions += freqs[bitsPerValue--]) { final int newBlockSize = pforBlockSize(bitsPerValue, numExceptions, actualBitsPerValue - bitsPerValue); if (newBlockSize < blockSize) { this.bitsPerValue = bitsPerValue; this.numExceptions = numExceptions; blockSize = newBlockSize; } } this.bitsPerException = actualBitsPerValue - bitsPerValue; assert bufferSize < BLOCK_SIZE || numExceptions < bufferSize; return blockSize; } void pforEncode() { if (numExceptions > 0) { final int mask = (1 << bitsPerValue) - 1; int ex = 0; for (int i = 0; i < bufferSize; ++i) { if (buffer[i] > mask) { exceptionIndices[ex] = i; exceptions[ex++] = buffer[i] >>> bitsPerValue; buffer[i] &= mask; } } assert ex == numExceptions; Arrays.fill(exceptions, numExceptions, BLOCK_SIZE, 0); } if (bitsPerValue > 0) { final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsPerValue); final int numIterations = ITERATIONS[bitsPerValue]; encoder.encode(buffer, 0, data.bytes, data.length, numIterations); data.length += encoder.byteBlockCount() * numIterations; } if (numExceptions > 0) { assert bitsPerException > 0; data.writeByte((byte) numExceptions); data.writeByte((byte) bitsPerException); final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsPerException); final int numIterations = (numExceptions + encoder.byteValueCount() - 1) / encoder.byteValueCount(); encoder.encode(exceptions, 0, data.bytes, data.length, numIterations); data.length += PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, numExceptions, bitsPerException); for (int i = 0; i < numExceptions; ++i) { data.writeByte((byte) exceptionIndices[i]); } } } void unaryEncode() { int current = 0; for (int i = 0, doc = -1; i < BLOCK_SIZE; ++i) { doc += 1 + buffer[i]; while (doc >= 8) { data.writeByte((byte) current); current = 0; doc -= 8; } current |= 1 << doc; } if (current != 0) { data.writeByte((byte) current); } } void encodeBlock() { final int originalLength = data.length; Arrays.fill(buffer, bufferSize, BLOCK_SIZE, 0); final int unaryBlockSize = unaryBlockSize(); final int pforBlockSize = computeOptimalNumberOfBits(); final int blockSize; if (pforBlockSize <= unaryBlockSize) { // use pfor blockSize = pforBlockSize; data.bytes = ArrayUtil.grow(data.bytes, data.length + blockSize + MAX_BYTE_BLOCK_COUNT); int token = bufferSize < BLOCK_SIZE ? LAST_BLOCK : 0; token |= bitsPerValue; if (numExceptions > 0) { token |= HAS_EXCEPTIONS; } data.writeByte((byte) token); pforEncode(); } else { // use unary blockSize = unaryBlockSize; final int token = UNARY | (bufferSize < BLOCK_SIZE ? LAST_BLOCK : 0); data.writeByte((byte) token); unaryEncode(); } if (bufferSize < BLOCK_SIZE) { data.writeByte((byte) bufferSize); } ++numBlocks; assert data.length - originalLength == blockSize : (data.length - originalLength) + " <> " + blockSize; } /** Build the {@link PForDeltaDocIdSet} instance. */ public PForDeltaDocIdSet build() { assert bufferSize < BLOCK_SIZE; if (cardinality == 0) { assert previousDoc == -1; return EMPTY; } encodeBlock(); final byte[] dataArr = Arrays.copyOf(data.bytes, data.length + MAX_BYTE_BLOCK_COUNT); final int indexSize = (numBlocks - 1) / indexInterval + 1; final MonotonicAppendingLongBuffer docIDs, offsets; if (indexSize <= 1) { docIDs = offsets = SINGLE_ZERO_BUFFER; } else { final int pageSize = 128; final int initialPageCount = (indexSize + pageSize - 1) / pageSize; docIDs = new MonotonicAppendingLongBuffer(initialPageCount, pageSize, PackedInts.COMPACT); offsets = new MonotonicAppendingLongBuffer(initialPageCount, pageSize, PackedInts.COMPACT); // Now build the index final Iterator it = new Iterator(dataArr, cardinality, Integer.MAX_VALUE, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER); index: for (int k = 0; k < indexSize; ++k) { docIDs.add(it.docID() + 1); offsets.add(it.offset); for (int i = 0; i < indexInterval; ++i) { it.skipBlock(); if (it.docID() == DocIdSetIterator.NO_MORE_DOCS) { break index; } } } docIDs.freeze(); offsets.freeze(); } return new PForDeltaDocIdSet(dataArr, cardinality, indexInterval, docIDs, offsets); } } final byte[] data; final MonotonicAppendingLongBuffer docIDs, offsets; // for the index final int cardinality, indexInterval; PForDeltaDocIdSet(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer docIDs, MonotonicAppendingLongBuffer offsets) { this.data = data; this.cardinality = cardinality; this.indexInterval = indexInterval; this.docIDs = docIDs; this.offsets = offsets; } @Override public boolean isCacheable() { return true; } @Override public DocIdSetIterator iterator() { if (data == null) { return null; } else { return new Iterator(data, cardinality, indexInterval, docIDs, offsets); } } static class Iterator extends DocIdSetIterator { // index final int indexInterval; final MonotonicAppendingLongBuffer docIDs, offsets; final int cardinality; final byte[] data; int offset; // offset in data final int[] nextDocs; int i; // index in nextDeltas final int[] nextExceptions; int blockIdx; int docID; Iterator(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer docIDs, MonotonicAppendingLongBuffer offsets) { this.data = data; this.cardinality = cardinality; this.indexInterval = indexInterval; this.docIDs = docIDs; this.offsets = offsets; offset = 0; nextDocs = new int[BLOCK_SIZE]; Arrays.fill(nextDocs, -1); i = BLOCK_SIZE; nextExceptions = new int[BLOCK_SIZE]; blockIdx = -1; docID = -1; } @Override public int docID() { return docID; } void pforDecompress(byte token) { final int bitsPerValue = token & 0x1F; if (bitsPerValue == 0) { Arrays.fill(nextDocs, 0); } else { DECODERS[bitsPerValue].decode(data, offset, nextDocs, 0, ITERATIONS[bitsPerValue]); offset += BYTE_BLOCK_COUNTS[bitsPerValue]; } if ((token & HAS_EXCEPTIONS) != 0) { // there are exceptions final int numExceptions = data[offset++]; final int bitsPerException = data[offset++]; final int numIterations = (numExceptions + DECODERS[bitsPerException].byteValueCount() - 1) / DECODERS[bitsPerException].byteValueCount(); DECODERS[bitsPerException].decode(data, offset, nextExceptions, 0, numIterations); offset += PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, numExceptions, bitsPerException); for (int i = 0; i < numExceptions; ++i) { nextDocs[data[offset++]] |= nextExceptions[i] << bitsPerValue; } } for (int previousDoc = docID, i = 0; i < BLOCK_SIZE; ++i) { final int doc = previousDoc + 1 + nextDocs[i]; previousDoc = nextDocs[i] = doc; } } void unaryDecompress(byte token) { assert (token & HAS_EXCEPTIONS) == 0; int docID = this.docID; for (int i = 0; i < BLOCK_SIZE; ) { final byte b = data[offset++]; for (int bitList = BitUtil.bitList(b); bitList != 0; ++i, bitList >>>= 4) { nextDocs[i] = docID + (bitList & 0x0F); } docID += 8; } } void decompressBlock() { final byte token = data[offset++]; if ((token & UNARY) != 0) { unaryDecompress(token); } else { pforDecompress(token); } if ((token & LAST_BLOCK) != 0) { final int blockSize = data[offset++]; Arrays.fill(nextDocs, blockSize, BLOCK_SIZE, NO_MORE_DOCS); } ++blockIdx; } void skipBlock() { assert i == BLOCK_SIZE; decompressBlock(); docID = nextDocs[BLOCK_SIZE - 1]; } @Override public int nextDoc() { if (i == BLOCK_SIZE) { decompressBlock(); i = 0; } return docID = nextDocs[i++]; } int forwardBinarySearch(int target) { // advance forward and double the window at each step final int indexSize = (int) docIDs.size(); int lo = Math.max(blockIdx / indexInterval, 0), hi = lo + 1; assert blockIdx == -1 || docIDs.get(lo) <= docID; assert lo + 1 == docIDs.size() || docIDs.get(lo + 1) > docID; while (true) { if (hi >= indexSize) { hi = indexSize - 1; break; } else if (docIDs.get(hi) >= target) { break; } final int newLo = hi; hi += (hi - lo) << 1; lo = newLo; } // we found a window containing our target, let's binary search now while (lo <= hi) { final int mid = (lo + hi) >>> 1; final int midDocID = (int) docIDs.get(mid); if (midDocID <= target) { lo = mid + 1; } else { hi = mid - 1; } } assert docIDs.get(hi) <= target; assert hi + 1 == docIDs.size() || docIDs.get(hi + 1) > target; return hi; } @Override public int advance(int target) throws IOException { assert target > docID; if (nextDocs[BLOCK_SIZE - 1] < target) { // not in the next block, now use the index final int index = forwardBinarySearch(target); final int offset = (int) offsets.get(index); if (offset > this.offset) { this.offset = offset; docID = (int) docIDs.get(index) - 1; blockIdx = index * indexInterval - 1; while (true) { decompressBlock(); if (nextDocs[BLOCK_SIZE - 1] >= target) { break; } docID = nextDocs[BLOCK_SIZE - 1]; } i = 0; } } return slowAdvance(target); } @Override public long cost() { return cardinality; } } /** Return the number of documents in this {@link DocIdSet} in constant time. */ public int cardinality() { return cardinality; } /** Return the memory usage of this instance. */ public long ramBytesUsed() { return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF) + docIDs.ramBytesUsed() + offsets.ramBytesUsed(); } }