/*
* Copyright (C) 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.stats.cardinality;
import com.google.common.base.Preconditions;
import javax.annotation.concurrent.NotThreadSafe;
/**
* A hyperloglog-based cardinality estimator that uses exactly 4 bits per bucket, regardless of the
* cardinality being estimated.
* <p/>
* It is based on the observation that for any given cardinality, the majority of all values fall in
* a range that is at most 4-bit wide. Moreover, the window only moves to the "right" because the
* values in a bucket never decrease.
* <p/>
* Whenever a value is seen that falls outside of the current window it is truncated to the window
* upper bound. This introduces a minor error in the estimation that is smaller than 0.01% based on
* experiments.
*/
@NotThreadSafe
class DenseEstimator
    implements Estimator {
  private static final int BITS_PER_BUCKET = 4;
  // largest value (relative to the baseline) that fits in one bucket
  private static final int BUCKET_MAX_VALUE = (1 << BITS_PER_BUCKET) - 1;
  // number of 4-bit buckets packed into each long of the slots array
  private static final int BUCKETS_PER_SLOT = Long.SIZE / BITS_PER_BUCKET;
  private static final long BUCKET_MASK = (1L << BITS_PER_BUCKET) - 1;

  private static final int INSTANCE_SIZE = UnsafeUtil.sizeOf(DenseEstimator.class);

  private final int numberOfBuckets;
  private final long[] slots; // packed bucket values, each stored relative to the baseline

  private double currentSum; // the current sum(2^-(bucket[i] + baseline)) over all buckets
  private byte baseline; // the lower bound of the current window

  // The number of buckets whose value is at the lower bound. Kept as an int: a short cannot
  // represent counts of 32768 or more, which made large estimators mis-detect "no buckets at the
  // lower bound" (premature rescaling) and skip the small-cardinality correction.
  private int baselineCount;

  /**
   * Creates an empty estimator in which every bucket holds 0.
   *
   * @param numberOfBuckets number of buckets; must be a power of 2
   * @throws IllegalArgumentException if the precondition check on numberOfBuckets fails
   */
  public DenseEstimator(int numberOfBuckets) {
    Preconditions.checkArgument(
        Numbers.isPowerOf2(numberOfBuckets),
        "numberOfBuckets must be a power of 2"
    );

    this.numberOfBuckets = numberOfBuckets;
    this.baseline = 0;
    this.baselineCount = numberOfBuckets;
    this.currentSum = numberOfBuckets; // every bucket contributes 2^-0 == 1

    int slotCount = (numberOfBuckets + BUCKETS_PER_SLOT - 1) / BUCKETS_PER_SLOT;
    slots = new long[slotCount];
  }

  /**
   * Creates an estimator pre-populated with the given absolute bucket values.
   * <p/>
   * The baseline becomes the smallest value in the array. Values that lie more than
   * {@code BUCKET_MAX_VALUE} above the baseline are truncated to the window upper bound, exactly
   * as {@link #setIfGreater(int, int)} does.
   *
   * @param bucketValues one value per bucket, each in the range [0, Byte.MAX_VALUE]; the array
   * length must satisfy the same constraint as in {@link #DenseEstimator(int)}
   * @throws IllegalArgumentException if a value is out of range or the length check fails
   */
  public DenseEstimator(int[] bucketValues) {
    this(bucketValues.length);

    // first, compute the baseline and count of baseline values
    baseline = Byte.MAX_VALUE;
    baselineCount = 0;
    for (int value : bucketValues) {
      Preconditions.checkArgument(
          value >= 0 && value <= Byte.MAX_VALUE,
          "values must be >= 0 and <= %s, found %s",
          Byte.MAX_VALUE,
          value
      );

      if (value < baseline) {
        baselineCount = 1;
        baseline = (byte) value;
      } else if (value == baseline) {
        ++baselineCount;
      }
    }

    currentSum = 0;

    // then set all values (rescaled)
    int bucket = 0;
    for (int value : bucketValues) {
      // Clamp to the window: an unclamped value wider than 4 bits would spill into the bits of
      // the neighbouring buckets that share the same slot.
      int relativeValue = Math.min(value - baseline, BUCKET_MAX_VALUE);
      set(bucket, relativeValue);
      // Accumulate the value that was actually stored so that later updates, which subtract the
      // stored value's contribution, keep currentSum consistent.
      currentSum += inversePowerOfTwo(relativeValue + baseline);
      ++bucket;
    }
  }

  /**
   * @return the number of buckets this estimator was created with
   */
  public int getNumberOfBuckets() {
    return numberOfBuckets;
  }

  @Override
  public int getMaxAllowedBucketValue() {
    return Byte.MAX_VALUE;
  }

  /**
   * Raises the value of a bucket if the given absolute value is greater than the stored one.
   * <p/>
   * Values above the current window are truncated to the window upper bound. When the last bucket
   * sitting at the lower bound is raised, the window is shifted to the right.
   *
   * @param bucket index of the bucket to update
   * @param highestBitPosition candidate value for the bucket (absolute, not relative to baseline)
   * @return true if the bucket was changed, false if the stored value was already at least as
   * large (after truncation)
   */
  @Override
  public boolean setIfGreater(int bucket, int highestBitPosition) {
    int relativeHighestBitPosition = highestBitPosition - baseline;
    if (relativeHighestBitPosition > BUCKET_MAX_VALUE) {
      // we can't fit this in BITS_PER_BUCKET, so truncate (it shouldn't affect results
      // significantly due to the low probability of this happening)
      relativeHighestBitPosition = BUCKET_MAX_VALUE;
    }

    int oldValue = get(bucket);
    if (relativeHighestBitPosition <= oldValue) {
      return false;
    }

    set(bucket, relativeHighestBitPosition);

    // replace the old contribution of this bucket with the new one
    currentSum -= inversePowerOfTwo(oldValue + baseline);
    currentSum += inversePowerOfTwo(relativeHighestBitPosition + baseline);

    if (oldValue == 0) {
      // the bucket just left the lower bound of the window
      --baselineCount;
      rescaleAndRecomputeBaseCountIfNeeded();
    }

    return true;
  }

  /**
   * Computes 2^-exponent exactly. Unlike {@code 1.0 / (1L << exponent)} this stays correct for
   * exponents of 63 and above, where a long shift wraps around (the shift distance is taken
   * modulo 64) and yields a negative or otherwise wrong divisor.
   */
  private static double inversePowerOfTwo(int exponent) {
    return Math.scalb(1.0, -exponent);
  }

  /**
   * Stores a value (relative to the current baseline) in the specified bucket. The value must be
   * in the range [0, BUCKET_MAX_VALUE]; it is not masked here.
   */
  private void set(int bucket, int value) {
    int slot = bucket / BUCKETS_PER_SLOT;
    int offset = bucket % BUCKETS_PER_SLOT;

    // clear the old value
    long bucketClearMask = BUCKET_MASK << (offset * BITS_PER_BUCKET);
    slots[slot] &= ~bucketClearMask;

    // set the new value
    long bucketSetMask = ((long) value) << (offset * BITS_PER_BUCKET);
    slots[slot] |= bucketSetMask;
  }

  /**
   * gets the value in the specified bucket relative to the current base
   */
  private int get(int bucket) {
    int slot = bucket / BUCKETS_PER_SLOT;
    int offset = bucket % BUCKETS_PER_SLOT;

    return (int) ((slots[slot] >> (offset * BITS_PER_BUCKET)) & BUCKET_MASK);
  }

  /**
   * Shifts the window to the right for as long as no bucket sits at its lower bound, decrementing
   * every stored (relative) value and recounting the buckets that end up at the new lower bound.
   */
  private void rescaleAndRecomputeBaseCountIfNeeded() {
    while (baselineCount == 0) {
      // no more values at the lower bound, so shift the window to the right
      ++baseline;

      // and re-scale all current buckets; every stored value is >= 1 here because no bucket is
      // at the lower bound, so the decrement cannot go negative
      for (int i = 0; i < numberOfBuckets; ++i) {
        int value = get(i) - 1;
        set(i, value);

        if (value == 0) {
          // re-calculate the number of buckets whose value is at the lower bound
          ++baselineCount;
        }
      }
    }
  }

  /**
   * Estimates the cardinality from the current bucket contents.
   *
   * @return alpha * m^2 / currentSum rounded to the nearest long, where m is the number of
   * buckets and alpha comes from HyperLogLogUtil.computeAlpha; for small results (at most 2.5 * m)
   * with at least one bucket still at 0, m * ln(m / zeroBuckets) is used instead
   */
  @Override
  public long estimate() {
    double alpha = HyperLogLogUtil.computeAlpha(numberOfBuckets);
    double result = alpha * numberOfBuckets * numberOfBuckets / currentSum;

    if (result <= 2.5 * numberOfBuckets) {
      // adjust for small cardinalities
      if (baseline == 0 && baselineCount > 0) {
        // baselineCount is the number of buckets with value 0
        result = numberOfBuckets * Math.log(numberOfBuckets * 1.0 / baselineCount);
      }
    }

    return Math.round(result);
  }

  @Override
  public int estimateSizeInBytes() {
    return estimateSizeInBytes(numberOfBuckets);
  }

  /**
   * Estimates the size of an estimator with the given number of buckets.
   *
   * @param numberOfBuckets number of buckets of the hypothetical estimator
   * @return bytes taken by the packed slots (8 per long) plus INSTANCE_SIZE
   */
  public static int estimateSizeInBytes(int numberOfBuckets) {
    return (numberOfBuckets + BUCKETS_PER_SLOT - 1) / BUCKETS_PER_SLOT * Long.SIZE / 8
        + INSTANCE_SIZE;
  }

  /**
   * @return a new array holding the absolute value (stored value + baseline) of every bucket
   */
  public int[] buckets() {
    int[] result = new int[numberOfBuckets];

    for (int i = 0; i < numberOfBuckets; ++i) {
      result[i] = get(i) + baseline;
    }

    return result;
  }
}