/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This file contains original code and/or modifications of original code. * Any modifications made by VoltDB Inc. are licensed under the following * terms and conditions: * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ /* * Copyright (C) 2012 Clearspring Technologies, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* This code was originally sourced from https://github.com/addthis/stream-lib in December 2014. */ package org.voltdb_hll; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.Arrays; import java.util.Random; import org.junit.Ignore; import org.junit.Test; import com.google_voltpatches.common.base.Charsets; import com.google_voltpatches.common.hash.HashFunction; import com.google_voltpatches.common.hash.Hashing; public class TestHyperLogLog { private static Random prng = new Random(); protected static Object streamElement(int i) { return Long.toHexString(prng.nextLong()); } @Test public void testComputeCount() { HyperLogLog hyperLogLog = new HyperLogLog(16); hyperLogLog.offer(0); hyperLogLog.offer(1); hyperLogLog.offer(2); hyperLogLog.offer(3); hyperLogLog.offer(16); hyperLogLog.offer(17); hyperLogLog.offer(18); hyperLogLog.offer(19); hyperLogLog.offer(19); assertEquals(8, hyperLogLog.cardinality()); } @Test public void testSerialization() throws IOException { HyperLogLog hll = new HyperLogLog(8); hll.offer("a"); hll.offer("b"); hll.offer("c"); hll.offer("d"); hll.offer("e"); HyperLogLog hll2 = HyperLogLog.fromBytes(hll.toBytes()); assertEquals(hll.cardinality(), hll2.cardinality()); } @Test public void testHighCardinality() { long start = System.currentTimeMillis(); HyperLogLog hyperLogLog = new HyperLogLog(10); int size = 10000000; for (int i = 0; i < size; i++) { hyperLogLog.offer(streamElement(i)); } System.out.println("time: " + (System.currentTimeMillis() - start)); long estimate = hyperLogLog.cardinality(); double err = Math.abs(estimate - size) / (double) size; System.out.println(err); assertTrue(err < .1); } @Test public void testMerge() { int numToMerge = 5; int bits = 16; int cardinality = 1000000; HyperLogLog[] hyperLogLogs = new HyperLogLog[numToMerge]; HyperLogLog baseline = new HyperLogLog(bits); for (int i = 0; i < numToMerge; i++) { hyperLogLogs[i] = new HyperLogLog(bits); for (int j = 0; j < cardinality; j++) { double val = Math.random(); hyperLogLogs[i].offer(val); baseline.offer(val); } } long expectedCardinality = numToMerge * cardinality; HyperLogLog hll = hyperLogLogs[0]; hyperLogLogs = Arrays.asList(hyperLogLogs).subList(1, hyperLogLogs.length).toArray(new HyperLogLog[0]); long mergedEstimate = hll.merge(hyperLogLogs).cardinality(); long baselineEstimate = baseline.cardinality(); double se = expectedCardinality * (1.04 / Math.sqrt(Math.pow(2, bits))); System.out.println("Baseline estimate: " + baselineEstimate); System.out.println("Expect estimate: " + mergedEstimate + " is between " + (expectedCardinality - (3 * se)) + " and " + (expectedCardinality + (3 * se))); assertTrue(mergedEstimate >= expectedCardinality - (3 * se)); assertTrue(mergedEstimate <= expectedCardinality + (3 * se)); assertEquals(mergedEstimate, baselineEstimate); } /** * should not fail with HyperLogLogMergeException: "Cannot merge estimators of different sizes" */ @Test public void testMergeWithRegisterSet() { HyperLogLog first = new HyperLogLog(16, new RegisterSet(1 << 20)); HyperLogLog second = new HyperLogLog(16, new RegisterSet(1 << 20)); first.offer(0); second.offer(1); first.merge(second); } @Test @Ignore public void testPrecise() { int cardinality = 1000000000; int b = 12; HyperLogLog baseline = new HyperLogLog(b); HyperLogLog guava128 = new HyperLogLog(b); HashFunction hf128 = Hashing.murmur3_128(); for (int j = 0; j < cardinality; j++) { Double val = Math.random(); String valString = val.toString(); baseline.offer(valString); guava128.offerHashed(hf128.hashString(valString, Charsets.UTF_8).asLong()); if (j > 0 && j % 1000000 == 0) { System.out.println("current count: " + j); } } long baselineEstimate = baseline.cardinality(); long g128Estimate = guava128.cardinality(); double se = cardinality * (1.04 / Math.sqrt(Math.pow(2, b))); double baselineError = (baselineEstimate - cardinality) / (double) cardinality; double g128Error = (g128Estimate - cardinality) / (double) cardinality; System.out.format("b: %f g128 %f", baselineError, g128Error); assertTrue("baseline estimate bigger than expected", baselineEstimate >= cardinality - (2 * se)); assertTrue("baseline estimate smaller than expected", baselineEstimate <= cardinality + (2 * se)); assertTrue("g128 estimate bigger than expected", g128Estimate >= cardinality - (2 * se)); assertTrue("g128 estimate smaller than expected", g128Estimate <= cardinality + (2 * se)); } }