/*
* Copyright (C) 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.stats.cardinality;
import com.google.common.base.Preconditions;
import javax.annotation.concurrent.NotThreadSafe;
import static com.facebook.stats.cardinality.HyperLogLogUtil.computeHash;
/**
* An implementation of the HyperLogLog algorithm:
* <p/>
* http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
*/
@NotThreadSafe
public class HyperLogLog {
private final byte[] buckets;
// The current sum of 1 / (1L << buckets[i]). Updated as new items are added and used for
// estimation
private double currentSum;
private int nonZeroBuckets = 0;
public HyperLogLog(int numberOfBuckets) {
Preconditions.checkArgument(
Numbers.isPowerOf2(numberOfBuckets),
"numberOfBuckets must be a power of 2"
);
Preconditions.checkArgument(numberOfBuckets > 0, "numberOfBuckets must be > 0");
buckets = new byte[numberOfBuckets];
currentSum = buckets.length;
}
public HyperLogLog(int[] buckets) {
this(buckets.length);
currentSum = 0;
for (int i = 0; i < buckets.length; i++) {
int value = buckets[i];
Preconditions.checkArgument(
value >= 0 && value <= Byte.MAX_VALUE,
"values must be > 0 and <= %s, found %s",
Byte.MAX_VALUE,
value
);
this.buckets[i] = (byte) value;
currentSum += 1.0 / (1 << value);
if (value != 0) {
nonZeroBuckets++;
}
}
}
public void add(long value) {
BucketAndHash bucketAndHash = BucketAndHash.fromHash(computeHash(value), buckets.length);
int bucket = bucketAndHash.getBucket();
int lowestBitPosition = Long.numberOfTrailingZeros(bucketAndHash.getHash()) + 1;
int previous = buckets[bucket];
if (previous == 0) {
nonZeroBuckets++;
}
if (lowestBitPosition > previous) {
currentSum -= 1.0 / (1L << previous);
currentSum += 1.0 / (1L << lowestBitPosition);
buckets[bucket] = (byte) lowestBitPosition;
}
}
public long estimate() {
double alpha = HyperLogLogUtil.computeAlpha(buckets.length);
double result = alpha * buckets.length * buckets.length / currentSum;
if (result <= 2.5 * buckets.length) {
// adjust for small cardinalities
int zeroBuckets = buckets.length - nonZeroBuckets;
if (zeroBuckets > 0) {
result = buckets.length * Math.log(buckets.length * 1.0 / zeroBuckets);
}
}
return Math.round(result);
}
public int[] buckets() {
int[] result = new int[buckets.length];
for (int i = 0; i < buckets.length; i++) {
result[i] = buckets[i];
}
return result;
}
}