/* * Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.hazelcast.cardinality.impl.hyperloglog.impl; import com.hazelcast.cardinality.impl.CardinalityEstimatorDataSerializerHook; import com.hazelcast.nio.ObjectDataInput; import com.hazelcast.nio.ObjectDataOutput; import java.io.IOException; import java.util.Iterator; import java.util.Map; import java.util.NavigableMap; import java.util.TreeMap; /** * 1. http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf * 2. http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf */ @SuppressWarnings("checkstyle:magicnumber") public class DenseHyperLogLogEncoder implements HyperLogLogEncoder { private double[] invPowLookup; private byte[] register; private int numOfEmptyRegs; private int p; private int m; private long pFenseMask; public DenseHyperLogLogEncoder() { } public DenseHyperLogLogEncoder(final int p) { this(p, null); } public DenseHyperLogLogEncoder(final int p, final byte[] register) { this.init(p, register); } private void init(final int p, final byte[] register) { this.p = p; this.m = 1 << p; this.numOfEmptyRegs = m; this.register = register != null ? register : new byte[m]; this.invPowLookup = new double[64 - p + 1]; this.pFenseMask = 1 << (64 - p) - 1; this.prePopulateInvPowLookup(); } @Override public boolean add(long hash) { final int index = (int) hash & (register.length - 1); final int value = Long.numberOfTrailingZeros((hash >>> p) | pFenseMask) + 1; assert index < register.length; assert value <= (1 << 8) - 1; assert value <= 64 - p; if (value > register[index]) { register[index] = (byte) value; return true; } return false; } @Override public long estimate() { final double raw = (1 / computeE()) * alpha() * m * m; return applyRangeCorrection(raw); } @Override public int getFactoryId() { return CardinalityEstimatorDataSerializerHook.F_ID; } @Override public int getId() { return CardinalityEstimatorDataSerializerHook.HLL_DENSE_ENC; } @Override public void writeData(ObjectDataOutput out) throws IOException { out.writeInt(p); out.writeByteArray(register); } @Override public void readData(ObjectDataInput in) throws IOException { init(in.readInt(), null); this.register = in.readByteArray(); } @Override public int getMemoryFootprint() { return m; } @Override public HyperLogLogEncoding getEncodingType() { return HyperLogLogEncoding.DENSE; } private double alpha() { // make sure m is always >= 16 for p = 4 -> m = 16 // if p ∈ [4..16] as of [1] assert m >= 16; if (m >= 128) { return .7213 / (1 + 1.079 / m); } if (m == 64) { return .709; } if (m == 32) { return .697; } if (m == 16) { return .673; } return -1; } private long applyRangeCorrection(double e) { double ePrime = e <= m * 5 ? (e - estimateBias(e)) : e; double h = numOfEmptyRegs != 0 ? linearCounting(m, numOfEmptyRegs) : ePrime; return (long) (exceedsThreshold(h) ? ePrime : h); } private double computeE() { double e = 0; numOfEmptyRegs = 0; for (byte r : register) { if (r > 0) { e += invPow(r); } else { numOfEmptyRegs++; } } return e + numOfEmptyRegs; } /** * [2] We use k nearest neighbor interpolation to get the bias for a given raw estimate * The choice of k = 6 is rather arbitrary. The best value of k could be determined experimentally, * but we found that the choice has only a minuscule influence. */ private long estimateBias(double e) { int i = 0; double[] rawEstimates = DenseHyperLogLogConstants.RAW_ESTIMATE_DATA[p - 4]; double closestToZero = Math.abs(e - rawEstimates[0]); NavigableMap<Double, Integer> distances = new TreeMap<Double, Integer>(); for (double est : rawEstimates) { double distance = e - est; distances.put(distance, i++); if (Math.abs(distance) < closestToZero) { closestToZero = distance; } } // abomination to compute kNN elements (we could ideally use a tree structure) int kNN = 6; double sum = 0; Iterator<Map.Entry<Double, Integer>> firstX = distances.descendingMap().tailMap(closestToZero).entrySet().iterator(); Iterator<Map.Entry<Double, Integer>> lastX = distances.tailMap(closestToZero).entrySet().iterator(); int kNNLeft = kNN; while (kNNLeft-- > kNN / 2 && firstX.hasNext()) { sum += DenseHyperLogLogConstants.BIAS_DATA[p - 4][firstX.next().getValue()]; } while (kNNLeft-- >= 0 && lastX.hasNext()) { sum += DenseHyperLogLogConstants.BIAS_DATA[p - 4][lastX.next().getValue()]; } return (long) (sum / kNN); } private boolean exceedsThreshold(double e) { return e >= DenseHyperLogLogConstants.THRESHOLD[p - 4]; } private double invPow(int index) { assert index <= 64 - p; return invPowLookup[index]; } private long linearCounting(final int total, final int empty) { return (long) (total * Math.log(total / (double) empty)); } private void prePopulateInvPowLookup() { invPowLookup[0] = 1; for (int i = 1; i <= (64 - p); i++) { invPowLookup[i] = Math.pow(2, -i); } } }