/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.packed;
import java.util.Arrays;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.FixedBitSet; // for javadocs
/** Encode a non decreasing sequence of non negative whole numbers in the Elias-Fano encoding
* that was introduced in the 1970's by Peter Elias and Robert Fano.
* <p>
* The Elias-Fano encoding is a high bits / low bits representation of
* a monotonically increasing sequence of <code>numValues > 0</code> natural numbers <code>x[i]</code>
* <p>
* <code>0 <= x[0] <= x[1] <= ... <= x[numValues-2] <= x[numValues-1] <= upperBound</code>
* <p>
* where <code>upperBound > 0</code> is an upper bound on the last value.
* <br>
* The Elias-Fano encoding uses less than half a bit per encoded number more
* than the smallest representation
* that can encode any monotone sequence with the same bounds.
* <p>
* The lower <code>L</code> bits of each <code>x[i]</code> are stored explicitly and contiguously
* in the lower-bits array, with <code>L</code> chosen as (<code>log()</code> base 2):
* <p>
* <code>L = max(0, floor(log(upperBound/numValues)))</code>
* <p>
* The upper bits are stored in the upper-bits array as a sequence of unary-coded gaps (<code>x[-1] = 0</code>):
* <p>
* <code>(x[i]/2**L) - (x[i-1]/2**L)</code>
* <p>
* The unary code encodes a natural number <code>n</code> by <code>n</code> 0 bits followed by a 1 bit:
* <code>0...01</code>. <br>
* In the upper bits the total the number of 1 bits is <code>numValues</code>
* and the total number of 0 bits is:<p>
* <code>floor(x[numValues-1]/2**L) <= upperBound/(2**max(0, floor(log(upperBound/numValues)))) <= 2*numValues</code>
* <p>
* The Elias-Fano encoding uses at most
* <p>
* <code>2 + ceil(log(upperBound/numValues))</code>
* <p>
* bits per encoded number. With <code>upperBound</code> in these bounds (<code>p</code> is an integer):
* <p>
* <code>2**p < x[numValues-1] <= upperBound <= 2**(p+1)</code>
* <p>
* the number of bits per encoded number is minimized.
* <p>
* In this implementation the values in the sequence can be given as <code>long</code>,
* <code>numValues = 0</code> and <code>upperBound = 0</code> are allowed,
* and each of the upper and lower bit arrays should fit in a <code>long[]</code>.
* <br>
* An index of positions of zero's in the upper bits is also built.
* <p>
* This implementation is based on this article:
* <br>
* Sebastiano Vigna, "Quasi Succinct Indices", June 19, 2012, sections 3, 4 and 9.
* Retrieved from http://arxiv.org/pdf/1206.4300 .
*
* <p>The articles originally describing the Elias-Fano representation are:
* <br>Peter Elias, "Efficient storage and retrieval by content and address of static files",
* J. Assoc. Comput. Mach., 21(2):246–260, 1974.
* <br>Robert M. Fano, "On the number of bits required to implement an associative memory",
* Memorandum 61, Computer Structures Group, Project MAC, MIT, Cambridge, Mass., 1971.
*
* @lucene.internal
*/
public class EliasFanoEncoder {
final long numValues;
private final long upperBound;
final int numLowBits;
final long lowerBitsMask;
final long[] upperLongs;
final long[] lowerLongs;
private static final int LOG2_LONG_SIZE = Long.numberOfTrailingZeros(Long.SIZE);
long numEncoded = 0L;
long lastEncoded = 0L;
/** The default index interval for zero upper bits. */
public static final long DEFAULT_INDEX_INTERVAL = 256;
final long numIndexEntries;
final long indexInterval;
final int nIndexEntryBits;
/** upperZeroBitPositionIndex[i] (filled using packValue) will contain the bit position
* just after the zero bit ((i+1) * indexInterval) in the upper bits.
*/
final long[] upperZeroBitPositionIndex;
long currentEntryIndex; // also indicates how many entries in the index are valid.
/**
* Construct an Elias-Fano encoder.
* After construction, call {@link #encodeNext} <code>numValues</code> times to encode
* a non decreasing sequence of non negative numbers.
* @param numValues The number of values that is to be encoded.
* @param upperBound At least the highest value that will be encoded.
* For space efficiency this should not exceed the power of two that equals
* or is the first higher than the actual maximum.
* <br>When <code>numValues >= (upperBound/3)</code>
* a {@link FixedBitSet} will take less space.
* @param indexInterval The number of high zero bits for which a single index entry is built.
* The index will have at most <code>2 * numValues / indexInterval</code> entries
* and each index entry will use at most <code>ceil(log2(3 * numValues))</code> bits,
* see {@link EliasFanoEncoder}.
* @throws IllegalArgumentException when:
* <ul>
* <li><code>numValues</code> is negative, or
* <li><code>numValues</code> is non negative and <code>upperBound</code> is negative, or
* <li>the low bits do not fit in a <code>long[]</code>:
* <code>(L * numValues / 64) > Integer.MAX_VALUE</code>, or
* <li>the high bits do not fit in a <code>long[]</code>:
* <code>(2 * numValues / 64) > Integer.MAX_VALUE</code>, or
* <li><code>indexInterval < 2</code>,
* <li>the index bits do not fit in a <code>long[]</code>:
* <code>(numValues / indexInterval * ceil(2log(3 * numValues)) / 64) > Integer.MAX_VALUE</code>.
* </ul>
*/
public EliasFanoEncoder(long numValues, long upperBound, long indexInterval) {
if (numValues < 0L) {
throw new IllegalArgumentException("numValues should not be negative: " + numValues);
}
this.numValues = numValues;
if ((numValues > 0L) && (upperBound < 0L)) {
throw new IllegalArgumentException("upperBound should not be negative: " + upperBound + " when numValues > 0");
}
this.upperBound = numValues > 0 ? upperBound : -1L; // if there is no value, -1 is the best upper bound
int nLowBits = 0;
if (this.numValues > 0) { // nLowBits = max(0; floor(2log(upperBound/numValues)))
long lowBitsFac = this.upperBound / this.numValues;
if (lowBitsFac > 0) {
nLowBits = 63 - Long.numberOfLeadingZeros(lowBitsFac); // see Long.numberOfLeadingZeros javadocs
}
}
this.numLowBits = nLowBits;
this.lowerBitsMask = Long.MAX_VALUE >>> (Long.SIZE - 1 - this.numLowBits);
long numLongsForLowBits = numLongsForBits(numValues * numLowBits);
if (numLongsForLowBits > Integer.MAX_VALUE) {
throw new IllegalArgumentException("numLongsForLowBits too large to index a long array: " + numLongsForLowBits);
}
this.lowerLongs = new long[(int) numLongsForLowBits];
long numHighBitsClear = ((this.upperBound > 0) ? this.upperBound : 0) >>> this.numLowBits;
assert numHighBitsClear <= (2 * this.numValues);
long numHighBitsSet = this.numValues;
long numLongsForHighBits = numLongsForBits(numHighBitsClear + numHighBitsSet);
if (numLongsForHighBits > Integer.MAX_VALUE) {
throw new IllegalArgumentException("numLongsForHighBits too large to index a long array: " + numLongsForHighBits);
}
this.upperLongs = new long[(int) numLongsForHighBits];
if (indexInterval < 2) {
throw new IllegalArgumentException("indexInterval should at least 2: " + indexInterval);
}
// For the index:
long maxHighValue = upperBound >>> this.numLowBits;
long nIndexEntries = maxHighValue / indexInterval; // no zero value index entry
this.numIndexEntries = (nIndexEntries >= 0) ? nIndexEntries : 0;
long maxIndexEntry = maxHighValue + numValues - 1; // clear upper bits, set upper bits, start at zero
this.nIndexEntryBits = (maxIndexEntry <= 0) ? 0
: (64 - Long.numberOfLeadingZeros(maxIndexEntry));
long numLongsForIndexBits = numLongsForBits(numIndexEntries * nIndexEntryBits);
if (numLongsForIndexBits > Integer.MAX_VALUE) {
throw new IllegalArgumentException("numLongsForIndexBits too large to index a long array: " + numLongsForIndexBits);
}
this.upperZeroBitPositionIndex = new long[(int) numLongsForIndexBits];
this.currentEntryIndex = 0;
this.indexInterval = indexInterval;
}
/**
* Construct an Elias-Fano encoder using {@link #DEFAULT_INDEX_INTERVAL}.
*/
public EliasFanoEncoder(long numValues, long upperBound) {
this(numValues, upperBound, DEFAULT_INDEX_INTERVAL);
}
private static long numLongsForBits(long numBits) { // Note: int version in FixedBitSet.bits2words()
assert numBits >= 0 : numBits;
return (numBits + (Long.SIZE-1)) >>> LOG2_LONG_SIZE;
}
/** Call at most <code>numValues</code> times to encode a non decreasing sequence of non negative numbers.
* @param x The next number to be encoded.
* @throws IllegalStateException when called more than <code>numValues</code> times.
* @throws IllegalArgumentException when:
* <ul>
* <li><code>x</code> is smaller than an earlier encoded value, or
* <li><code>x</code> is larger than <code>upperBound</code>.
* </ul>
*/
public void encodeNext(long x) {
if (numEncoded >= numValues) {
throw new IllegalStateException("encodeNext called more than " + numValues + " times.");
}
if (lastEncoded > x) {
throw new IllegalArgumentException(x + " smaller than previous " + lastEncoded);
}
if (x > upperBound) {
throw new IllegalArgumentException(x + " larger than upperBound " + upperBound);
}
long highValue = x >>> numLowBits;
encodeUpperBits(highValue);
encodeLowerBits(x & lowerBitsMask);
lastEncoded = x;
// Add index entries:
long indexValue = (currentEntryIndex + 1) * indexInterval;
while (indexValue <= highValue) {
long afterZeroBitPosition = indexValue + numEncoded;
packValue(afterZeroBitPosition, upperZeroBitPositionIndex, nIndexEntryBits, currentEntryIndex);
currentEntryIndex += 1;
indexValue += indexInterval;
}
numEncoded++;
}
private void encodeUpperBits(long highValue) {
long nextHighBitNum = numEncoded + highValue; // sequence of unary gaps
upperLongs[(int)(nextHighBitNum >>> LOG2_LONG_SIZE)] |= (1L << (nextHighBitNum & (Long.SIZE-1)));
}
private void encodeLowerBits(long lowValue) {
packValue(lowValue, lowerLongs, numLowBits, numEncoded);
}
private static void packValue(long value, long[] longArray, int numBits, long packIndex) {
if (numBits != 0) {
long bitPos = numBits * packIndex;
int index = (int) (bitPos >>> LOG2_LONG_SIZE);
int bitPosAtIndex = (int) (bitPos & (Long.SIZE-1));
longArray[index] |= (value << bitPosAtIndex);
if ((bitPosAtIndex + numBits) > Long.SIZE) {
longArray[index+1] = (value >>> (Long.SIZE - bitPosAtIndex));
}
}
}
/** Provide an indication that it is better to use an {@link EliasFanoEncoder} than a {@link FixedBitSet}
* to encode document identifiers.
* This indication is not precise and may change in the future.
* <br>An EliasFanoEncoder is favoured when the size of the encoding by the EliasFanoEncoder
* (including some space for its index) is at most about 5/6 of the size of the FixedBitSet,
* this is the same as comparing estimates of the number of bits accessed by a pair of FixedBitSets and
* by a pair of non indexed EliasFanoDocIdSets when determining the intersections of the pairs.
* <br>A bit set is preferred when <code>upperbound <= 256</code>.
* <br>It is assumed that {@link #DEFAULT_INDEX_INTERVAL} is used.
* @param numValues The number of document identifiers that is to be encoded. Should be non negative.
* @param upperBound The maximum possible value for a document identifier. Should be at least <code>numValues</code>.
*/
public static boolean sufficientlySmallerThanBitSet(long numValues, long upperBound) {
/* When (upperBound / 6) == numValues,
* the number of bits per entry for the EliasFanoEncoder is 2 + ceil(2log(upperBound/numValues)) == 5.
*
* For intersecting two bit sets upperBound bits are accessed, roughly half of one, half of the other.
* For intersecting two EliasFano sequences without index on the upper bits,
* all (2 * 3 * numValues) upper bits are accessed.
*/
return (upperBound > (4 * Long.SIZE)) // prefer a bit set when it takes no more than 4 longs.
&& (upperBound / 7) > numValues; // 6 + 1 to allow some room for the index.
}
/**
* Returns an {@link EliasFanoDecoder} to access the encoded values.
* Perform all calls to {@link #encodeNext} before calling {@link #getDecoder}.
*/
public EliasFanoDecoder getDecoder() {
// decode as far as currently encoded as determined by numEncoded.
return new EliasFanoDecoder(this);
}
/** Expert. The low bits. */
public long[] getLowerBits() {
return lowerLongs;
}
/** Expert. The high bits. */
public long[] getUpperBits() {
return upperLongs;
}
/** Expert. The index bits. */
public long[] getIndexBits() {
return upperZeroBitPositionIndex;
}
@Override
public String toString() {
StringBuilder s = new StringBuilder("EliasFanoSequence");
s.append(" numValues " + numValues);
s.append(" numEncoded " + numEncoded);
s.append(" upperBound " + upperBound);
s.append(" lastEncoded " + lastEncoded);
s.append(" numLowBits " + numLowBits);
s.append("\nupperLongs[" + upperLongs.length + "]");
for (int i = 0; i < upperLongs.length; i++) {
s.append(" " + ToStringUtils.longHex(upperLongs[i]));
}
s.append("\nlowerLongs[" + lowerLongs.length + "]");
for (int i = 0; i < lowerLongs.length; i++) {
s.append(" " + ToStringUtils.longHex(lowerLongs[i]));
}
s.append("\nindexInterval: " + indexInterval + ", nIndexEntryBits: " + nIndexEntryBits);
s.append("\nupperZeroBitPositionIndex[" + upperZeroBitPositionIndex.length + "]");
for (int i = 0; i < upperZeroBitPositionIndex.length; i++) {
s.append(" " + ToStringUtils.longHex(upperZeroBitPositionIndex[i]));
}
return s.toString();
}
@Override
public boolean equals(Object other) {
if (! (other instanceof EliasFanoEncoder)) {
return false;
}
EliasFanoEncoder oefs = (EliasFanoEncoder) other;
// no equality needed for upperBound
return (this.numValues == oefs.numValues)
&& (this.numEncoded == oefs.numEncoded)
&& (this.numLowBits == oefs.numLowBits)
&& (this.numIndexEntries == oefs.numIndexEntries)
&& (this.indexInterval == oefs.indexInterval) // no need to check index content
&& Arrays.equals(this.upperLongs, oefs.upperLongs)
&& Arrays.equals(this.lowerLongs, oefs.lowerLongs);
}
@Override
public int hashCode() {
int h = ((int) (31*(numValues + 7*(numEncoded + 5*(numLowBits + 3*(numIndexEntries + 11*indexInterval))))))
^ Arrays.hashCode(upperLongs)
^ Arrays.hashCode(lowerLongs);
return h;
}
}