/* * Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.hazelcast.cardinality.impl.hyperloglog.impl; import com.hazelcast.test.HazelcastParallelClassRunner; import com.hazelcast.test.annotation.ParallelTest; import com.hazelcast.test.annotation.QuickTest; import com.hazelcast.util.HashUtil; import com.hazelcast.util.collection.IntHashSet; import org.HdrHistogram.Histogram; import org.junit.Before; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.runner.RunWith; import java.nio.ByteBuffer; import java.util.Random; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @RunWith(HazelcastParallelClassRunner.class) @Category({QuickTest.class, ParallelTest.class}) public abstract class HyperLogLogEncoderAbstractTest { private HyperLogLogEncoder encoder; public abstract int runLength(); public abstract int precision(); public abstract HyperLogLogEncoder createStore(); @Before public void setup() { encoder = createStore(); } @Test public void add() { assertTrue(encoder.add(1000L)); assertEquals(1L, encoder.estimate()); } /** * - Add up-to runLength() random numbers on both a Set and a HyperLogLog encoder. * - Sample the actual count, and the estimate respectively every 100 operations. * - Compute the error rate, of the measurements and store it in a histogram. * - Assert that the 99th percentile of the histogram is less than the expected max error, * which is the result of std error (1.04 / sqrt(m)) + 3%. * (2% is the typical accuracy, but tests on the implementation showed up rare occurrences of 3%) */ @Test public void testEstimateErrorRateForBigCardinalities() { double stdError = (1.04 / Math.sqrt(1 << precision())) * 100; double maxError = Math.ceil(stdError + 3.0); IntHashSet actualCount = new IntHashSet(runLength(), -1); Random random = new Random(); Histogram histogram = new Histogram(5); ByteBuffer bb = ByteBuffer.allocate(4); int sampleStep = 100; long expected; long actual; for (int i = 1; i <= runLength(); i++) { int toCount = random.nextInt(); actualCount.add(toCount); bb.clear(); bb.putInt(toCount); encoder.add(HashUtil.MurmurHash3_x64_64(bb.array(), 0, bb.array().length)); if (i % sampleStep == 0) { expected = actualCount.size(); actual = encoder.estimate(); double errorPct = ((actual * 100.0) / expected) - 100; histogram.recordValue(Math.abs((long) (errorPct * 100))); } } double errorPerc99 = histogram.getValueAtPercentile(99) / 100.0; if (errorPerc99 > maxError) { fail("For P=" + precision() + ", max error=" + maxError + "% expected." + " Error: " + errorPerc99 + "%."); } } HyperLogLogEncoder getEncoder() { return encoder; } }