/*
* Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.cardinality.impl.hyperloglog.impl;
import com.hazelcast.cardinality.impl.CardinalityEstimatorDataSerializerHook;
import com.hazelcast.nio.Bits;
import com.hazelcast.nio.ObjectDataInput;
import com.hazelcast.nio.ObjectDataOutput;
import java.io.IOException;
import java.util.Arrays;
/**
* 1. http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
*/
@SuppressWarnings("checkstyle:magicnumber")
public class SparseHyperLogLogEncoder implements HyperLogLogEncoder {
private static final long P_PRIME_FENCE_MASK = 0x8000000000L;
private static final int DEFAULT_TEMP_CAPACITY = 200;
private int p;
private int pMask;
private int pPrime;
private int pPrimeMask;
private long pDiffMask;
private long pDiffEncodedMask;
private VariableLengthDiffArray register;
private int[] temp;
private int mPrime;
private int tempIdx;
public SparseHyperLogLogEncoder() {
}
public SparseHyperLogLogEncoder(final int p, final int pPrime) {
init(p, pPrime, new VariableLengthDiffArray());
}
public void init(int p, int pPrime, VariableLengthDiffArray register) {
this.p = p;
this.pPrime = pPrime;
this.pPrimeMask = (1 << pPrime) - 1;
this.mPrime = 1 << pPrime;
this.temp = new int[DEFAULT_TEMP_CAPACITY];
this.pMask = ((1 << p) - 1);
this.pDiffMask = pPrimeMask ^ pMask;
this.pDiffEncodedMask = (1L << (pPrime - p)) - 1;
this.register = register;
}
@Override
public boolean add(long hash) {
int encoded = encodeHash(hash);
temp[tempIdx++] = encoded;
boolean isTempAtCapacity = tempIdx == DEFAULT_TEMP_CAPACITY;
if (isTempAtCapacity) {
mergeAndResetTmp();
}
return true;
}
@Override
public long estimate() {
mergeAndResetTmp();
return linearCounting(mPrime, mPrime - register.total);
}
@Override
public int getFactoryId() {
return CardinalityEstimatorDataSerializerHook.F_ID;
}
@Override
public int getId() {
return CardinalityEstimatorDataSerializerHook.HLL_SPARSE_ENC;
}
@Override
public void writeData(ObjectDataOutput out) throws IOException {
mergeAndResetTmp();
out.writeInt(p);
out.writeInt(pPrime);
out.writeInt(register.total);
out.writeInt(register.mark);
out.writeInt(register.prev);
out.writeByteArray(register.elements);
}
@Override
public void readData(ObjectDataInput in) throws IOException {
int p = in.readInt();
int pPrime = in.readInt();
int total = in.readInt();
int mark = in.readInt();
int prev = in.readInt();
byte[] bytes = in.readByteArray();
init(p, pPrime, new VariableLengthDiffArray(bytes, total, mark, prev));
}
@Override
public HyperLogLogEncoding getEncodingType() {
return HyperLogLogEncoding.SPARSE;
}
@Override
public int getMemoryFootprint() {
return register.mark + (DEFAULT_TEMP_CAPACITY * Bits.INT_SIZE_IN_BYTES);
}
public HyperLogLogEncoder asDense() {
byte[] dense = new byte[1 << this.p];
for (int hash : register.explode()) {
int index = decodeHashPIndex(hash);
dense[index] = (byte) Math.max(dense[index], decodeHashRunOfZeros(hash));
}
return new DenseHyperLogLogEncoder(p, dense);
}
private int encodeHash(long hash) {
int index = (int) (hash & pPrimeMask) << (32 - pPrime);
if ((hash & pDiffMask) == 0) {
return index | Long.numberOfTrailingZeros((hash >>> pPrime) | P_PRIME_FENCE_MASK) << 1 | 0x1;
}
return ((index >>> (32 - pPrime)) & pPrimeMask) << 1;
}
private int decodeHashPPrimeIndex(int hash) {
if (!hasRunOfZerosEncoded(hash)) {
return ((hash >> 1) & pPrimeMask) & mPrime - 1;
}
return (hash >> (32 - pPrime) & pPrimeMask) & mPrime - 1;
}
private int decodeHashPIndex(long hash) {
if (!hasRunOfZerosEncoded(hash)) {
return (int) ((hash >>> 1)) & pMask;
}
return (int) (hash >>> (32 - pPrime)) & pMask;
}
private byte decodeHashRunOfZeros(long hash) {
if (!hasRunOfZerosEncoded(hash)) {
// |-25bits-||-1bit-
// (p - p') || 0
int pDiff = (int) ((hash >>> 1) & pDiffEncodedMask);
return (byte) (Integer.numberOfTrailingZeros(pDiff) + 1);
}
// |-25bits-||-6bits-||-1bit-|
// (p - p') || p(w') || 1
int pW = (int) (hash & ((1 << (32 - pPrime)) - 1)) >>> 1;
return (byte) (pW + (pPrime - p) + 1);
}
private boolean hasRunOfZerosEncoded(long hash) {
// is format (p - p') || p(w') || 1
return ((hash & 0x1) == 1);
}
private long linearCounting(final int total, final int empty) {
return (long) (total * Math.log(total / (double) empty));
}
private void mergeAndResetTmp() {
if (tempIdx == 0) {
return;
}
// merge existing register with temp
int[] old = register.explode();
int[] all = Arrays.copyOf(old, old.length + tempIdx);
System.arraycopy(temp, 0, all, old.length, tempIdx);
Arrays.sort(all);
// clear register, re-inserting will be in different order, due to new values
register.clear();
int previousHash = all[0];
for (int i = 1; i < all.length; i++) {
int hash = all[i];
boolean conflictingIndex = decodeHashPPrimeIndex(hash) == decodeHashPPrimeIndex(previousHash);
if (!conflictingIndex) {
register.add(previousHash);
}
previousHash = hash;
}
register.add(previousHash);
Arrays.fill(temp, 0);
tempIdx = 0;
}
/**
* Variable length difference encoding for sorted integer lists.
*
* Single byte, (7 bits) used to store the value if less or equal to 127,
* or more bytes for larger numbers, having the MSB bit set to 1 to signify
* the next_flag. Also, numbers are stored as a diff from the previous one
* to make the Variable Length algo more efficient. Therefore, the input must
* be sorted first.
*/
private static class VariableLengthDiffArray {
//aka 32
private static final int INITIAL_CAPACITY = 1 << 5;
private byte[] elements = new byte[INITIAL_CAPACITY];
private int prev;
private int total;
private int mark;
VariableLengthDiffArray() {
}
VariableLengthDiffArray(final byte[] elements, final int total, final int mark, final int prev) {
this.elements = elements;
this.total = total;
this.mark = mark;
this.prev = prev;
}
void add(int value) {
append(value - prev);
prev = value;
}
void clear() {
Arrays.fill(elements, (byte) 0);
mark = 0;
total = 0;
prev = 0;
}
int[] explode() {
int[] exploded = new int[total];
int counter = 0;
int last = 0;
for (int i = 0; i < mark; i++) {
int noOfBytes = 0;
byte element;
do {
element = elements[i++];
exploded[counter] |= (element & 0x7F) << (7 * noOfBytes++);
} while (needsMoreBytes(element));
exploded[counter] += last;
last = exploded[counter];
// fix positions
i--;
counter++;
}
return exploded;
}
private void append(int diff) {
while (diff > 0x7F) {
ensureCapacity();
elements[mark++] = (byte) ((diff & 0x7F) | 0x80);
diff >>>= 7;
}
ensureCapacity();
elements[mark++] = (byte) (diff & 0x7F);
total++;
}
private void ensureCapacity() {
if (elements.length == mark) {
int newCapacity = elements.length << 1;
elements = Arrays.copyOf(elements, newCapacity);
}
}
private boolean needsMoreBytes(byte val) {
return (val & 0x80) != 0;
}
}
}