/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.airlift.stats.cardinality;

import io.airlift.slice.Slice;
import org.testng.annotations.Test;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadLocalRandom;

import static io.airlift.slice.testing.SliceAssertions.assertSlicesEqual;
import static io.airlift.stats.cardinality.TestUtils.sequence;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;

/**
 * Tests for {@link HyperLogLog}: the spread of its cardinality estimates,
 * merging of two instances, and serialization round-trips.
 */
public class TestHyperLogLog
{
    /** Bucket count used by the merge and round-trip checks. */
    private static final int BUCKETS = 2048;

    /** Smallest and largest number of index bits exercised by {@link #testEstimates()}. */
    private static final int MIN_INDEX_BITS = 4;
    private static final int MAX_INDEX_BITS = 13;

    /** Number of independent instances filled per bucket count in {@link #testEstimates()}. */
    private static final int TRIALS = 1000;

    /**
     * Fills many instances with random values and checks that, at every sampled
     * cardinality, the standard deviation of the relative error stays within 10%
     * of {@code 1.04 / sqrt(numberOfBuckets)}.
     *
     * <p>The input comes from {@link ThreadLocalRandom}, so each run uses different values.
     */
    @Test
    public void testEstimates()
    {
        for (int indexBits = MIN_INDEX_BITS; indexBits <= MAX_INDEX_BITS; indexBits++) {
            int numberOfBuckets = 1 << indexBits;
            int maxCardinality = numberOfBuckets * 2;

            // Integer division: with fewer than 4 index bits this would be 0 and the
            // modulo below would throw ArithmeticException.
            int sampleInterval = numberOfBuckets / 10;

            // relative errors observed across all trials, keyed by the true cardinality
            Map<Integer, Stats> errors = new HashMap<>();

            for (int trial = 0; trial < TRIALS; trial++) {
                HyperLogLog hll = HyperLogLog.newInstance(numberOfBuckets);

                for (int cardinality = 1; cardinality <= maxCardinality; cardinality++) {
                    hll.add(ThreadLocalRandom.current().nextLong());

                    if (cardinality % sampleInterval == 0) {
                        // only do this a few times, since computing the cardinality is currently not
                        // as cheap as it should be
                        double error = (hll.cardinality() - cardinality) * 1.0 / cardinality;

                        Stats stats = errors.get(cardinality);
                        if (stats == null) {
                            stats = new Stats();
                            errors.put(cardinality, stats);
                        }
                        stats.add(error);
                    }
                }
            }

            double expectedStandardError = 1.04 / Math.sqrt(numberOfBuckets);

            for (Map.Entry<Integer, Stats> entry : errors.entrySet()) {
                double actualStandardError = entry.getValue().stdev();

                // Give an extra error margin. This is mostly a sanity check to catch egregious errors
                assertTrue(actualStandardError <= expectedStandardError * 1.1,
                        String.format("Failed at p = %s, cardinality = %s. Expected std error = %s, actual = %s",
                                indexBits,
                                entry.getKey(),
                                expectedStandardError,
                                actualStandardError));
            }
        }
    }

    /**
     * Merges pairs of instances built from overlapping inputs of different sizes.
     */
    @Test
    public void testMerge()
    {
        // small vs small
        verifyMerge(sequence(0, 100), sequence(50, 150));

        // small vs big
        verifyMerge(sequence(0, 100), sequence(50, 5000));

        // big vs small
        verifyMerge(sequence(50, 5000), sequence(0, 100));

        // big vs big
        verifyMerge(sequence(0, 5000), sequence(3000, 8000));
    }

    /**
     * Checks that merging an instance built from {@code two} into one built from
     * {@code one} gives the same cardinality and serialized form as a single
     * instance that received every value from both lists.
     *
     * @param one values added to the instance that receives the merge
     * @param two values added to the instance that is merged in
     */
    private void verifyMerge(List<Long> one, List<Long> two)
    {
        HyperLogLog hll1 = HyperLogLog.newInstance(BUCKETS);
        HyperLogLog hll2 = HyperLogLog.newInstance(BUCKETS);

        // reference instance that sees the values of both inputs directly
        HyperLogLog expected = HyperLogLog.newInstance(BUCKETS);

        for (long value : one) {
            hll1.add(value);
            expected.add(value);
        }

        for (long value : two) {
            hll2.add(value);
            expected.add(value);
        }

        hll1.verify();
        hll2.verify();

        hll1.mergeWith(hll2);
        hll1.verify();

        assertEquals(hll1.cardinality(), expected.cardinality());
        assertSlicesEqual(hll1.serialize(), expected.serialize());
    }

    /**
     * Round-trips instances built from a short and a long input through serialization.
     */
    @Test
    public void testRoundtrip()
    {
        // small
        verifyRoundtrip(sequence(0, 100));

        // large
        verifyRoundtrip(sequence(0, 20000));
    }

    /**
     * Checks that an instance rebuilt from its serialized form reports the same
     * cardinality as the original and serializes back to the same bytes.
     *
     * @param sequence values added to the instance before it is serialized
     */
    private void verifyRoundtrip(List<Long> sequence)
    {
        HyperLogLog hll = HyperLogLog.newInstance(BUCKETS);

        for (long value : sequence) {
            hll.add(value);
        }

        hll.verify();

        Slice serialized = hll.serialize();
        HyperLogLog deserialized = HyperLogLog.newInstance(serialized);
        deserialized.verify();

        // the deserialized instance is the value under test, so it is passed as "actual"
        assertEquals(deserialized.cardinality(), hll.cardinality());

        Slice reserialized = deserialized.serialize();
        assertSlicesEqual(reserialized, serialized);
    }
}