/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.util;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;

import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
import org.apache.lucene.util.packed.PackedInts;

/**
 * {@link DocIdSet} implementation based on word-aligned hybrid encoding on
 * words of 8 bits.
 * <p>This implementation doesn't support random access but has a fast
 * {@link DocIdSetIterator} which can advance in logarithmic time thanks to
 * an index.</p>
 * <p>The compression scheme is simplistic and should work well with sparse and
 * very dense doc id sets while being only slightly larger than a
 * {@link FixedBitSet} for incompressible sets (overhead &lt; 2% in the worst
 * case) in spite of the index.</p>
 * <p><b>Format</b>: The format is byte-aligned. An 8-bit word is either clean,
 * meaning composed only of zeros or ones, or dirty, meaning that it contains
 * between 1 and 7 bits set. The idea is to encode sequences of clean words
 * using run-length encoding and to leave sequences of dirty words as-is.</p>
 * <table>
 *   <tr><th>Token</th><th>Clean length+</th><th>Dirty length+</th><th>Dirty words</th></tr>
 *   <tr><td>1 byte</td><td>0-n bytes</td><td>0-n bytes</td><td>0-n bytes</td></tr>
 * </table>
 * <ul>
 *   <li><b>Token</b> encodes whether clean means full of zeros or ones in the
 *   first bit, the number of clean words minus 2 on the next 3 bits and the
 *   number of dirty words on the last 4 bits. The higher-order bit of each of
 *   these two fields is a continuation bit, meaning that the number is
 *   incomplete and needs additional bytes to be read.</li>
 *   <li><b>Clean length+</b>: If the clean length has its continuation bit set,
 *   you need to read a {@link DataInput#readVInt() vint}, shift it by 2 bits on
 *   the left side and add it to the 2 bits which have been read in the token.</li>
 *   <li><b>Dirty length+</b> works the same way as <b>Clean length+</b> but
 *   with a 3-bit shift, for the length of dirty words.</li>
 *   <li><b>Dirty words</b> are the dirty words themselves; there are <b>Dirty
 *   length</b> of them.</li>
 * </ul>
 * <p>This format cannot encode sequences of less than 2 clean words and 0 dirty
 * words: if you find a single clean word, you should rather encode it as a dirty
 * word. This takes the same space as starting a new sequence (since you need one
 * byte for the token) but will be lighter to decode. There is however an
 * exception for the first sequence. Since the first sequence may start directly
 * with a dirty word, the clean length is encoded directly, without subtracting 2.</p>
 * <p>There is an additional restriction on the format: the sequence of dirty
 * words is not allowed to contain two consecutive clean words. This restriction
 * exists to make sure no space is wasted and to make sure iterators can read
 * the next doc ID by reading at most 2 dirty words.</p>
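 * <p>A typical way to build and consume such a set (illustrative sketch, the exact
 * calls depend on the surrounding code):</p>
 * <pre>
 * WAH8DocIdSet set = new WAH8DocIdSet.Builder().add(3).add(141).add(1000).build();
 * DocIdSetIterator it = set.iterator();
 * for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
 *   // consume doc
 * }
 * </pre>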
 * @lucene.experimental
 */
public final class WAH8DocIdSet extends DocIdSet {

  // Minimum index interval: intervals below this value can't guarantee anymore
  // that this set implementation won't be significantly larger than a FixedBitSet.
  // The reason is that a single sequence saves at least one byte and an index
  // entry requires at most 8 bytes (2 ints), so there shouldn't be more than one
  // index entry every 8 sequences.
  private static final int MIN_INDEX_INTERVAL = 8;

  /** Default index interval. */
  public static final int DEFAULT_INDEX_INTERVAL = 24;

  private static final MonotonicAppendingLongBuffer SINGLE_ZERO_BUFFER = new MonotonicAppendingLongBuffer(1, 64, PackedInts.COMPACT);
  private static final WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], 0, 1, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
  static {
    SINGLE_ZERO_BUFFER.add(0L);
    SINGLE_ZERO_BUFFER.freeze();
  }

  private static final Comparator<Iterator> SERIALIZED_LENGTH_COMPARATOR = new Comparator<Iterator>() {
    @Override
    public int compare(Iterator wi1, Iterator wi2) {
      return wi1.in.length() - wi2.in.length();
    }
  };

  /** Same as {@link #intersect(Collection, int)} with the default index interval. */
  public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets) {
    return intersect(docIdSets, DEFAULT_INDEX_INTERVAL);
  }

  /**
   * Compute the intersection of the provided sets. This method is much faster than
   * computing the intersection manually since it operates directly at the byte level.
   */
  public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
    switch (docIdSets.size()) {
      case 0:
        throw new IllegalArgumentException("There must be at least one set to intersect");
      case 1:
        return docIdSets.iterator().next();
    }
    // The logic below is similar to ConjunctionScorer
    final int numSets = docIdSets.size();
    final Iterator[] iterators = new Iterator[numSets];
    int i = 0;
    for (WAH8DocIdSet set : docIdSets) {
      final Iterator it = set.iterator();
      iterators[i++] = it;
    }
    Arrays.sort(iterators, SERIALIZED_LENGTH_COMPARATOR);
    final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
    int wordNum = 0;
    main:
    while (true) {
      // Advance the least costly iterator first
      iterators[0].advanceWord(wordNum);
      wordNum = iterators[0].wordNum;
      if (wordNum == DocIdSetIterator.NO_MORE_DOCS) {
        break;
      }
      byte word = iterators[0].word;
      for (i = 1; i < numSets; ++i) {
        if (iterators[i].wordNum < wordNum) {
          iterators[i].advanceWord(wordNum);
        }
        if (iterators[i].wordNum > wordNum) {
          wordNum = iterators[i].wordNum;
          continue main;
        }
        assert iterators[i].wordNum == wordNum;
        word &= iterators[i].word;
        if (word == 0) {
          // There are common words, but they don't share any bit
          ++wordNum;
          continue main;
        }
      }
      // Found a common word
      assert word != 0;
      builder.addWord(wordNum, word);
      ++wordNum;
    }
    return builder.build();
  }
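  // For illustration: intersecting two small sets keeps only the documents present in both, e.g.
  //   WAH8DocIdSet a = new WAH8DocIdSet.Builder().add(1).add(8).add(9).build();
  //   WAH8DocIdSet b = new WAH8DocIdSet.Builder().add(8).add(10).build();
  //   WAH8DocIdSet both = WAH8DocIdSet.intersect(Arrays.asList(a, b)); // only doc 8 remains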
  /** Same as {@link #union(Collection, int)} with the default index interval. */
  public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets) {
    return union(docIdSets, DEFAULT_INDEX_INTERVAL);
  }

  /**
   * Compute the union of the provided sets. This method is much faster than
   * computing the union manually since it operates directly at the byte level.
   */
  public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
    switch (docIdSets.size()) {
      case 0:
        return EMPTY;
      case 1:
        return docIdSets.iterator().next();
    }
    // The logic below is very similar to DisjunctionScorer
    final int numSets = docIdSets.size();
    final PriorityQueue<Iterator> iterators = new PriorityQueue<WAH8DocIdSet.Iterator>(numSets) {
      @Override
      protected boolean lessThan(Iterator a, Iterator b) {
        return a.wordNum < b.wordNum;
      }
    };
    for (WAH8DocIdSet set : docIdSets) {
      Iterator iterator = set.iterator();
      iterator.nextWord();
      iterators.add(iterator);
    }
    Iterator top = iterators.top();
    if (top.wordNum == Integer.MAX_VALUE) {
      return EMPTY;
    }
    int wordNum = top.wordNum;
    byte word = top.word;
    final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
    while (true) {
      top.nextWord();
      iterators.updateTop();
      top = iterators.top();
      if (top.wordNum == wordNum) {
        word |= top.word;
      } else {
        builder.addWord(wordNum, word);
        if (top.wordNum == Integer.MAX_VALUE) {
          break;
        }
        wordNum = top.wordNum;
        word = top.word;
      }
    }
    return builder.build();
  }

  static int wordNum(int docID) {
    assert docID >= 0;
    return docID >>> 3;
  }
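  // Worked example (illustrative, values derived from the format described above): the set
  // {0, 1000} serializes to 5 bytes. Doc 0 lives in word 0 (0x01) and doc 1000 in word 125
  // (0x01), so the builder emits two sequences:
  //   0x01   token of the 1st sequence: 0 clean words, 1 dirty word
  //   0x01   dirty word holding doc 0
  //   0x61   token of the 2nd sequence: clean length continued, 1 dirty word
  //   0x1E   vint continuation: cleanLength - 2 = 2 | (30 << 2) = 122, i.e. 124 clean words
  //   0x01   dirty word holding doc 1000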
  /** Word-based builder. */
  static class WordBuilder {

    final GrowableByteArrayDataOutput out;
    final GrowableByteArrayDataOutput dirtyWords;
    int clean;
    int lastWordNum;
    int numSequences;
    int indexInterval;
    int cardinality;
    boolean reverse;

    WordBuilder() {
      out = new GrowableByteArrayDataOutput(1024);
      dirtyWords = new GrowableByteArrayDataOutput(128);
      clean = 0;
      lastWordNum = -1;
      numSequences = 0;
      indexInterval = DEFAULT_INDEX_INTERVAL;
      cardinality = 0;
    }

    /** Set the index interval. Smaller index intervals improve performance of
     *  {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet}
     *  larger. An index interval <code>i</code> makes the index add an overhead
     *  which is at most <code>4/i</code>, but likely much less. The default index
     *  interval is <code>24</code>, meaning the index has an overhead of at most
     *  about 17%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as
     *  an index interval. */
    public WordBuilder setIndexInterval(int indexInterval) {
      if (indexInterval < MIN_INDEX_INTERVAL) {
        throw new IllegalArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
      }
      this.indexInterval = indexInterval;
      return this;
    }

    // Token layout: bit 7 = reverse (clean words are all ones), bit 6 = clean-length
    // continuation bit, bits 5-4 = low bits of cleanLength - 2, bit 3 = dirty-length
    // continuation bit, bits 2-0 = low bits of dirtyLength.
    void writeHeader(boolean reverse, int cleanLength, int dirtyLength) throws IOException {
      final int cleanLengthMinus2 = cleanLength - 2;
      assert cleanLengthMinus2 >= 0;
      assert dirtyLength >= 0;
      int token = ((cleanLengthMinus2 & 0x03) << 4) | (dirtyLength & 0x07);
      if (reverse) {
        token |= 1 << 7;
      }
      if (cleanLengthMinus2 > 0x03) {
        token |= 1 << 6;
      }
      if (dirtyLength > 0x07) {
        token |= 1 << 3;
      }
      out.writeByte((byte) token);
      if (cleanLengthMinus2 > 0x03) {
        out.writeVInt(cleanLengthMinus2 >>> 2);
      }
      if (dirtyLength > 0x07) {
        out.writeVInt(dirtyLength >>> 3);
      }
    }

    private boolean sequenceIsConsistent() {
      for (int i = 1; i < dirtyWords.length; ++i) {
        assert dirtyWords.bytes[i-1] != 0 || dirtyWords.bytes[i] != 0;
        assert dirtyWords.bytes[i-1] != (byte) 0xFF || dirtyWords.bytes[i] != (byte) 0xFF;
      }
      return true;
    }

    void writeSequence() {
      assert sequenceIsConsistent();
      try {
        writeHeader(reverse, clean, dirtyWords.length);
      } catch (IOException cannotHappen) {
        throw new AssertionError(cannotHappen);
      }
      out.writeBytes(dirtyWords.bytes, 0, dirtyWords.length);
      dirtyWords.length = 0;
      ++numSequences;
    }

    void addWord(int wordNum, byte word) {
      assert wordNum > lastWordNum;
      assert word != 0;
      if (!reverse) {
        if (lastWordNum == -1) {
          clean = 2 + wordNum; // special case for the 1st sequence
          dirtyWords.writeByte(word);
        } else {
          switch (wordNum - lastWordNum) {
            case 1:
              if (word == (byte) 0xFF && dirtyWords.bytes[dirtyWords.length-1] == (byte) 0xFF) {
                --dirtyWords.length;
                writeSequence();
                reverse = true;
                clean = 2;
              } else {
                dirtyWords.writeByte(word);
              }
              break;
            case 2:
              dirtyWords.writeByte((byte) 0);
              dirtyWords.writeByte(word);
              break;
            default:
              writeSequence();
              clean = wordNum - lastWordNum - 1;
              dirtyWords.writeByte(word);
          }
        }
      } else {
        assert lastWordNum >= 0;
        switch (wordNum - lastWordNum) {
          case 1:
            if (word == (byte) 0xFF) {
              if (dirtyWords.length == 0) {
                ++clean;
              } else if (dirtyWords.bytes[dirtyWords.length - 1] == (byte) 0xFF) {
                --dirtyWords.length;
                writeSequence();
                clean = 2;
              } else {
                dirtyWords.writeByte(word);
              }
            } else {
              dirtyWords.writeByte(word);
            }
            break;
          case 2:
            dirtyWords.writeByte((byte) 0);
            dirtyWords.writeByte(word);
            break;
          default:
            writeSequence();
            reverse = false;
            clean = wordNum - lastWordNum - 1;
            dirtyWords.writeByte(word);
        }
      }
      lastWordNum = wordNum;
      cardinality += BitUtil.bitCount(word);
    }
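    // Following the {0, 1000} example above (illustrative): addWord(0, (byte) 0x01) starts the
    // first sequence (clean = 2, one dirty word), then addWord(125, (byte) 0x01) flushes it via
    // writeSequence() and starts a second sequence with clean = 124 and one dirty word.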
    /** Build a new {@link WAH8DocIdSet}. */
    public WAH8DocIdSet build() {
      if (cardinality == 0) {
        assert lastWordNum == -1;
        return EMPTY;
      }
      writeSequence();
      final byte[] data = Arrays.copyOf(out.bytes, out.length);

      // Now build the index
      final int valueCount = (numSequences - 1) / indexInterval + 1;
      final MonotonicAppendingLongBuffer indexPositions, indexWordNums;
      if (valueCount <= 1) {
        indexPositions = indexWordNums = SINGLE_ZERO_BUFFER;
      } else {
        final int pageSize = 128;
        final int initialPageCount = (valueCount + pageSize - 1) / pageSize;
        final MonotonicAppendingLongBuffer positions = new MonotonicAppendingLongBuffer(initialPageCount, pageSize, PackedInts.COMPACT);
        final MonotonicAppendingLongBuffer wordNums = new MonotonicAppendingLongBuffer(initialPageCount, pageSize, PackedInts.COMPACT);

        positions.add(0L);
        wordNums.add(0L);
        final Iterator it = new Iterator(data, cardinality, Integer.MAX_VALUE, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
        assert it.in.getPosition() == 0;
        assert it.wordNum == -1;
        for (int i = 1; i < valueCount; ++i) {
          // skip indexInterval sequences
          for (int j = 0; j < indexInterval; ++j) {
            final boolean readSequence = it.readSequence();
            assert readSequence;
            it.skipDirtyBytes();
          }
          final int position = it.in.getPosition();
          final int wordNum = it.wordNum;
          positions.add(position);
          wordNums.add(wordNum + 1);
        }
        positions.freeze();
        wordNums.freeze();
        indexPositions = positions;
        indexWordNums = wordNums;
      }

      return new WAH8DocIdSet(data, cardinality, indexInterval, indexPositions, indexWordNums);
    }

  }

  /** A builder for {@link WAH8DocIdSet}s. */
  public static final class Builder extends WordBuilder {

    private int lastDocID;
    private int wordNum, word;

    /** Sole constructor */
    public Builder() {
      super();
      lastDocID = -1;
      wordNum = -1;
      word = 0;
    }

    /** Add a document to this builder. Documents must be added in order. */
    public Builder add(int docID) {
      if (docID <= lastDocID) {
        throw new IllegalArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID);
      }
      final int wordNum = wordNum(docID);
      if (this.wordNum == -1) {
        this.wordNum = wordNum;
        word = 1 << (docID & 0x07);
      } else if (wordNum == this.wordNum) {
        word |= 1 << (docID & 0x07);
      } else {
        addWord(this.wordNum, (byte) word);
        this.wordNum = wordNum;
        word = 1 << (docID & 0x07);
      }
      lastDocID = docID;
      return this;
    }
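    // For illustration: add(3) sets bit 3 of word 0 (word == 0x08); a later add(8) moves to
    // word 1, flushing word 0 through addWord(0, (byte) 0x08) first.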
    /** Add the content of the provided {@link DocIdSetIterator}. */
    public Builder add(DocIdSetIterator disi) throws IOException {
      for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
        add(doc);
      }
      return this;
    }

    @Override
    public Builder setIndexInterval(int indexInterval) {
      return (Builder) super.setIndexInterval(indexInterval);
    }

    @Override
    public WAH8DocIdSet build() {
      if (this.wordNum != -1) {
        addWord(wordNum, (byte) word);
      }
      return super.build();
    }

  }

  // where the doc IDs are stored
  private final byte[] data;
  private final int cardinality;
  private final int indexInterval;
  // index for advance(int)
  private final MonotonicAppendingLongBuffer positions, wordNums; // wordNums[i] is the word number at which the sequence at positions[i] starts

  WAH8DocIdSet(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer positions, MonotonicAppendingLongBuffer wordNums) {
    this.data = data;
    this.cardinality = cardinality;
    this.indexInterval = indexInterval;
    this.positions = positions;
    this.wordNums = wordNums;
  }

  @Override
  public boolean isCacheable() {
    return true;
  }

  @Override
  public Iterator iterator() {
    return new Iterator(data, cardinality, indexInterval, positions, wordNums);
  }

  static int readCleanLength(ByteArrayDataInput in, int token) {
    int len = (token >>> 4) & 0x07;
    final int startPosition = in.getPosition();
    if ((len & 0x04) != 0) {
      // continuation bit is set, read the rest of the clean length from a vint
      len = (len & 0x03) | (in.readVInt() << 2);
    }
    if (startPosition != 1) {
      // not the first sequence: the clean length was encoded minus 2
      len += 2;
    }
    return len;
  }

  static int readDirtyLength(ByteArrayDataInput in, int token) {
    int len = token & 0x0F;
    if ((len & 0x08) != 0) {
      // continuation bit is set, read the rest of the dirty length from a vint
      len = (len & 0x07) | (in.readVInt() << 3);
    }
    return len;
  }
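  // Decoding sketch for the {0, 1000} example above: the second token 0x61 has its clean
  // continuation bit set, so readCleanLength reads the following vint (0x1E = 30) and
  // returns (2 | 30 << 2) + 2 = 124, while readDirtyLength returns 0x61 & 0x0F = 1.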
  static class Iterator extends DocIdSetIterator {

    /* Using the index can be costly for close targets. */
    static int indexThreshold(int cardinality, int indexInterval) {
      // Short sequences encode at least 3 words (2 clean words and 1 dirty word):
      // don't use the index if we are going to read less than 3 x indexInterval
      // sequences ahead
      long indexThreshold = 3L * 3 * indexInterval;
      return (int) Math.min(Integer.MAX_VALUE, indexThreshold);
    }

    final ByteArrayDataInput in;
    final int cardinality;
    final int indexInterval;
    final MonotonicAppendingLongBuffer positions, wordNums;
    final int indexThreshold;
    int allOnesLength;
    int dirtyLength;

    int wordNum; // current word number (words are bytes, so this is also a byte offset)
    byte word; // current word
    int bitList; // list of bits set in the current word
    int sequenceNum; // in which sequence are we?

    int docID;

    Iterator(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer positions, MonotonicAppendingLongBuffer wordNums) {
      this.in = new ByteArrayDataInput(data);
      this.cardinality = cardinality;
      this.indexInterval = indexInterval;
      this.positions = positions;
      this.wordNums = wordNums;
      wordNum = -1;
      word = 0;
      bitList = 0;
      sequenceNum = -1;
      docID = -1;
      indexThreshold = indexThreshold(cardinality, indexInterval);
    }

    boolean readSequence() {
      if (in.eof()) {
        wordNum = Integer.MAX_VALUE;
        return false;
      }
      final int token = in.readByte() & 0xFF;
      if ((token & (1 << 7)) == 0) {
        final int cleanLength = readCleanLength(in, token);
        wordNum += cleanLength;
      } else {
        allOnesLength = readCleanLength(in, token);
      }
      dirtyLength = readDirtyLength(in, token);
      assert in.length() - in.getPosition() >= dirtyLength : in.getPosition() + " " + in.length() + " " + dirtyLength;
      ++sequenceNum;
      return true;
    }

    void skipDirtyBytes(int count) {
      assert count >= 0;
      assert count <= allOnesLength + dirtyLength;
      wordNum += count;
      if (count <= allOnesLength) {
        allOnesLength -= count;
      } else {
        count -= allOnesLength;
        allOnesLength = 0;
        in.skipBytes(count);
        dirtyLength -= count;
      }
    }

    void skipDirtyBytes() {
      wordNum += allOnesLength + dirtyLength;
      in.skipBytes(dirtyLength);
      allOnesLength = 0;
      dirtyLength = 0;
    }

    void nextWord() {
      if (allOnesLength > 0) {
        word = (byte) 0xFF;
        ++wordNum;
        --allOnesLength;
        return;
      }
      if (dirtyLength > 0) {
        word = in.readByte();
        ++wordNum;
        --dirtyLength;
        if (word != 0) {
          return;
        }
        if (dirtyLength > 0) {
          word = in.readByte();
          ++wordNum;
          --dirtyLength;
          assert word != 0; // never more than one consecutive 0
          return;
        }
      }
      if (readSequence()) {
        nextWord();
      }
    }

    int forwardBinarySearch(int targetWordNum) {
      // advance forward and double the window at each step
      final int indexSize = (int) wordNums.size();
      int lo = sequenceNum / indexInterval, hi = lo + 1;
      assert sequenceNum == -1 || wordNums.get(lo) <= wordNum;
      assert lo + 1 == wordNums.size() || wordNums.get(lo + 1) > wordNum;
      while (true) {
        if (hi >= indexSize) {
          hi = indexSize - 1;
          break;
        } else if (wordNums.get(hi) >= targetWordNum) {
          break;
        }
        final int newLo = hi;
        hi += (hi - lo) << 1;
        lo = newLo;
      }
      // we found a window containing our target, let's binary search now
      while (lo <= hi) {
        final int mid = (lo + hi) >>> 1;
        final int midWordNum = (int) wordNums.get(mid);
        if (midWordNum <= targetWordNum) {
          lo = mid + 1;
        } else {
          hi = mid - 1;
        }
      }
      assert wordNums.get(hi) <= targetWordNum;
      assert hi + 1 == wordNums.size() || wordNums.get(hi + 1) > targetWordNum;
      return hi;
    }

    void advanceWord(int targetWordNum) {
      assert targetWordNum > wordNum;
      int delta = targetWordNum - wordNum;
      if (delta <= allOnesLength + dirtyLength + 1) {
        skipDirtyBytes(delta - 1);
      } else {
        skipDirtyBytes();
        assert dirtyLength == 0;
        if (delta > indexThreshold) {
          // use the index
          final int i = forwardBinarySearch(targetWordNum);
          final int position = (int) positions.get(i);
          if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move
            wordNum = (int) wordNums.get(i) - 1;
            in.setPosition(position);
            sequenceNum = i * indexInterval - 1;
          }
        }
        while (true) {
          if (!readSequence()) {
            return;
          }
          delta = targetWordNum - wordNum;
          if (delta <= allOnesLength + dirtyLength + 1) {
            if (delta > 1) {
              skipDirtyBytes(delta - 1);
            }
            break;
          }
          skipDirtyBytes();
        }
      }
      nextWord();
    }

    @Override
    public int docID() {
      return docID;
    }
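    // BitUtil.bitList(word) packs the positions of the set bits of the word, each incremented
    // by 1, into successive 4-bit nibbles, lowest bit first. For illustration, with word == 0x05
    // the packed list is 0x31, yielding docs (wordNum << 3) | 0 and then (wordNum << 3) | 2.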
    @Override
    public int nextDoc() throws IOException {
      if (bitList != 0) { // there are remaining bits in the current word
        docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
        bitList >>>= 4;
        return docID;
      }
      nextWord();
      if (wordNum == Integer.MAX_VALUE) {
        return docID = NO_MORE_DOCS;
      }
      bitList = BitUtil.bitList(word);
      assert bitList != 0;
      docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
      bitList >>>= 4;
      return docID;
    }

    @Override
    public int advance(int target) throws IOException {
      assert target > docID;
      final int targetWordNum = wordNum(target);
      if (targetWordNum > wordNum) {
        advanceWord(targetWordNum);
        bitList = BitUtil.bitList(word);
      }
      return slowAdvance(target);
    }

    @Override
    public long cost() {
      return cardinality;
    }

  }

  /** Return the number of documents in this {@link DocIdSet} in constant time. */
  public int cardinality() {
    return cardinality;
  }

  /** Return the memory usage of this class in bytes. */
  public long ramBytesUsed() {
    return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2 * RamUsageEstimator.NUM_BYTES_INT)
        + RamUsageEstimator.sizeOf(data) + positions.ramBytesUsed() + wordNums.ramBytesUsed();
  }

}