/*
* Copyright (C) 2012 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.stats.cardinality;
import com.google.common.base.Preconditions;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.util.Random;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
/**
 * Static helpers for working with HyperLogLog bucket arrays: estimating a cardinality from
 * bucket values, generating synthetic bucket arrays, merging two arrays, and hashing values.
 */
public class HyperLogLogUtil {
  private static final HashFunction HASH = Hashing.murmur3_128();

  /**
   * Estimates the cardinality represented by a HyperLogLog bucket array.
   *
   * <p>Computes the raw estimate {@code alpha * m^2 / sum(2^-bucket)} and, when that estimate is
   * at most {@code 2.5 * m} and at least one bucket is zero, replaces it with the linear-counting
   * estimate {@code m * ln(m / zeroBuckets)}.
   *
   * @param bucketValues the per-bucket values; the length must be a power of 2 and the values
   *     are expected to be non-negative
   * @return the estimated cardinality, rounded to the nearest long
   * @throws NullPointerException if {@code bucketValues} is null
   * @throws IllegalArgumentException if the number of buckets is not a power of 2
   */
  public static long estimateCardinality(int[] bucketValues) {
    checkNotNull(bucketValues, "bucketValues is null");
    Preconditions.checkArgument(
      Numbers.isPowerOf2(bucketValues.length),
      "number of buckets must be a power of 2"
    );

    int bucketCount = bucketValues.length;
    int zeroBuckets = 0;
    double sum = 0;

    // iterate over primitives; the array is int[], so boxing each element is pure overhead
    for (int value : bucketValues) {
      // 2^-value computed in floating point. A long shift (1L << value) turns negative at
      // value == 63 and wraps around for value >= 64, which would corrupt the sum.
      sum += Math.scalb(1.0, -value);
      if (value == 0) {
        ++zeroBuckets;
      }
    }

    double alpha = computeAlpha(bucketCount);
    double result = alpha * bucketCount * bucketCount / sum;

    // adjust for small cardinalities: fall back to linear counting over the empty buckets
    if (result <= 2.5 * bucketCount && zeroBuckets > 0) {
      result = bucketCount * Math.log(bucketCount * 1.0 / zeroBuckets);
    }

    return Math.round(result);
  }

  /**
   * Generates a random bucket array by sampling each bucket independently from the cumulative
   * distribution produced by {@link #computeProbabilities} for the given cardinality.
   *
   * @param numberOfBuckets the number of buckets to generate; must be a power of 2
   * @param cardinality the cardinality the generated buckets should correspond to
   * @return a newly allocated array of {@code numberOfBuckets} sampled bucket values
   * @throws IllegalArgumentException if {@code numberOfBuckets} is not a power of 2
   */
  public static int[] generateBuckets(int numberOfBuckets, long cardinality) {
    Preconditions.checkArgument(
      Numbers.isPowerOf2(numberOfBuckets),
      "number of buckets must be a power of 2"
    );

    double[] probabilities = computeProbabilities(numberOfBuckets, cardinality, Byte.MAX_VALUE);
    int lastIndex = probabilities.length - 1;

    Random random = new Random();
    int[] result = new int[numberOfBuckets];

    for (int i = 0; i < numberOfBuckets; ++i) {
      double trial = random.nextDouble();

      // pick the smallest k whose cumulative probability covers the trial; the bound check
      // guarantees the scan can never run off the end of the table
      int k = 0;
      while (k < lastIndex && trial > probabilities[k]) {
        ++k;
      }

      result[i] = k;
    }

    return result;
  }

  /**
   * Probability that a bucket has a value <= k
   */
  private static double cumulativeProbability(int numberOfBuckets, long cardinality, int k) {
    // numberOfBuckets * 2^k computed in floating point; (1L << k) overflows for k >= 63,
    // and callers ask for k well beyond that
    double scaledBuckets = Math.scalb((double) numberOfBuckets, k);
    return Math.pow(1.0 - 1.0 / scaledBuckets, cardinality);
  }

  /**
   * Compute cumulative probabilities for value <= k for all k = 0..maxK-1
   *
   * @return an array of length {@code maxK} whose element k is the cumulative probability for k
   */
  private static double[] computeProbabilities(int numberOfBuckets, long cardinality, int maxK) {
    double[] result = new double[maxK];

    for (int k = 0; k < maxK; ++k) {
      result[k] = cumulativeProbability(numberOfBuckets, cardinality, k);
    }

    return result;
  }

  /**
   * Merges two bucket arrays of equal length by taking the element-wise maximum.
   *
   * @param first the first bucket array
   * @param second the second bucket array
   * @return a newly allocated array where element i is {@code max(first[i], second[i])}
   * @throws NullPointerException if either array is null
   * @throws IllegalArgumentException if the arrays differ in length
   */
  public static int[] mergeBuckets(int[] first, int[] second) {
    checkNotNull(first, "first is null");
    checkNotNull(second, "second is null");
    checkArgument(
      first.length == second.length,
      "Array sizes must match, found %s vs %s",
      first.length,
      second.length
    );

    int[] result = new int[first.length];
    for (int i = 0; i < first.length; i++) {
      result[i] = Math.max(first[i], second[i]);
    }

    return result;
  }

  /**
   * Returns the alpha constant used to scale the raw estimate for the given number of buckets.
   *
   * <p>Fixed constants are used for 16, 32 and 64 buckets; every other size uses
   * {@code 0.7213 / (1 + 1.079 / numberOfBuckets)}.
   *
   * @param numberOfBuckets the number of buckets
   * @return the alpha constant for that bucket count
   */
  public static double computeAlpha(int numberOfBuckets) {
    double alpha;
    switch (numberOfBuckets) {
      case (1 << 4):
        alpha = 0.673;
        break;
      case (1 << 5):
        alpha = 0.697;
        break;
      case (1 << 6):
        alpha = 0.709;
        break;
      default:
        alpha = (0.7213 / (1 + 1.079 / numberOfBuckets));
    }
    return alpha;
  }

  /**
   * Computes a 64-bit hash suitable for adding to a hyperloglog instance.
   *
   * The hyperloglog implementation uses bits from least significant to most significant, so
   * if you need to keep shorter hashes around (e.g., for storage), make sure to drop bits from
   * the most significant side.
   *
   * @param value the value to hash
   * @return a 64-bit hash of {@code value}
   */
  public static long computeHash(long value) {
    return HASH.hashLong(value).asLong();
  }
}