/* * Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.hazelcast.cardinality.impl.hyperloglog.impl; import com.hazelcast.cardinality.impl.hyperloglog.HyperLogLog; import com.hazelcast.test.HazelcastParametersRunnerFactory; import com.hazelcast.util.HashUtil; import com.hazelcast.util.collection.IntHashSet; import org.HdrHistogram.Histogram; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collection; import java.util.Random; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; @RunWith(Parameterized.class) @Parameterized.UseParametersRunnerFactory(HazelcastParametersRunnerFactory.class) public class HyperLogLogImplTest { private static final int PRIME_PRECISION = 25; @Parameterized.Parameters() public static Collection<Integer[]> params() { return Arrays.asList(new Integer[][]{ {11, 10000000}, {12, 10000000}, {13, 10000000}, {14, 10000000}, {15, 10000000}, {16, 10000000} }); } @Parameterized.Parameter() public int precision; @Parameterized.Parameter(1) public int runLength; private HyperLogLog hyperLogLog; @Before public void setup() { hyperLogLog = new HyperLogLogImpl(precision, PRIME_PRECISION); } @Test public void add() { hyperLogLog.add(1000L); assertEquals(1L, hyperLogLog.estimate()); } @Test public void addAll() { hyperLogLog.addAll(new long[]{1L, 1L, 2000L, 3000, 40000L}); assertEquals(4L, hyperLogLog.estimate()); } /** * - Add up-to runLength() random numbers on both a Set and a HyperLogLog encoder. * - Sample the actual count, and the estimate respectively every 100 operations. * - Compute the error rate, of the measurements and store it in a histogram. * - Assert that the 99th percentile of the histogram is less than the expected max error, * which is the result of std error (1.04 / sqrt(m)) + 3%. * (2% is the typical accuracy, but tests on the implementation showed up rare occurrences of 3%) */ @Test public void testEstimateErrorRateForBigCardinalities() { double stdError = (1.04 / Math.sqrt(1 << precision)) * 100; double maxError = Math.ceil(stdError + 3.0); IntHashSet actualCount = new IntHashSet(runLength, -1); Random random = new Random(); Histogram histogram = new Histogram(5); ByteBuffer bb = ByteBuffer.allocate(4); int sampleStep = 100; long expected; long actual; for (int i = 1; i <= runLength; i++) { int toCount = random.nextInt(); actualCount.add(toCount); bb.clear(); bb.putInt(toCount); hyperLogLog.add(HashUtil.MurmurHash3_x64_64(bb.array(), 0, bb.array().length)); if (i % sampleStep == 0) { expected = actualCount.size(); actual = hyperLogLog.estimate(); double errorPct = ((actual * 100.0) / expected) - 100; histogram.recordValue(Math.abs((long) (errorPct * 100))); } } double errorPerc99 = histogram.getValueAtPercentile(99) / 100.0; if (errorPerc99 > maxError) { fail("For P=" + precision + ", Expected max error=" + maxError + "%." + " Actual error: " + errorPerc99 + "%."); } } }