/*
* Copyright (C) 2011 Clearspring Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.streaminer.stream.cardinality;
import org.streaminer.util.hash.MurmurHash;
import org.streaminer.util.IBuilder;
import java.io.Serializable;
import java.util.Arrays;
import static java.lang.Math.*;
/**
 * Linear probabilistic counter: estimates the number of distinct elements offered to it
 * using a fixed-size bitmap.
 * <p>
 * Every offered element is hashed to a single bit of the bitmap, and the cardinality is
 * estimated from the number of bits that are still unset.
 * <p>
 * See <i>A Linear-Time Probabilistic Counting Algorithm for Database Applications</i>
 * by Whang, Vander-Zanden, Taylor
 */
public class LinearCounting implements IRichCardinality
{
    /**
     * Bitmap
     * Hashed stream elements are mapped to bits in this array
     */
    protected byte[] map;

    /**
     * Size of the map in bits
     */
    protected final int length;

    /**
     * Number of bits left unset in the map
     */
    protected int count;

    /**
     * Creates an empty estimator (all bits unset).
     *
     * @param size size of the bit array in bytes
     * @throws IllegalArgumentException if the size in bits ({@code 8 * size}) does not fit in an {@code int}
     */
    public LinearCounting(int size)
    {
        this.length = bitLength(size);
        this.count = this.length;
        map = new byte[size];
    }

    /**
     * Creates an estimator on top of an existing bitmap. The array is used as-is (it is not
     * copied) and the number of unset bits is recomputed from its contents.
     *
     * @param map bitmap to wrap
     * @throws IllegalArgumentException if the size in bits ({@code 8 * map.length}) does not fit in an {@code int}
     */
    public LinearCounting(byte[] map)
    {
        this.map = map;
        this.length = bitLength(map.length);
        this.count = computeCount();
    }

    /**
     * Converts a size in bytes to a size in bits, refusing sizes whose bit count would
     * silently wrap around in {@code int} arithmetic.
     */
    private static int bitLength(int sizeInBytes)
    {
        if (sizeInBytes > Integer.MAX_VALUE / 8)
        {
            throw new IllegalArgumentException("size (" + sizeInBytes + ") is too large: bit length must fit in an int");
        }
        return 8 * sizeInBytes;
    }

    /**
     * Estimates the number of distinct elements offered so far as
     * {@code length * ln(length / count)}, rounded to the nearest long.
     * <p>
     * When a non-empty map is saturated ({@code count == 0}) the logarithm is infinite and the
     * rounded result is {@link Long#MAX_VALUE}.
     *
     * @return the estimated cardinality
     */
    @Override
    public long cardinality()
    {
        double unsetRatio = length / ((double) count);
        return Math.round(length * Math.log(unsetRatio));
    }

    /**
     * @return the backing bitmap itself (not a copy)
     */
    @Override
    public byte[] getBytes()
    {
        return map;
    }

    /**
     * Not supported by this estimator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean offerHashed(long hashedLong)
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Not supported by this estimator.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean offerHashed(int hashedInt)
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Hashes the element and sets the corresponding bit of the map.
     *
     * @param o stream element
     * @return {@code true} if the bit was previously unset (the map was modified), {@code false} otherwise
     */
    @Override
    public boolean offer(Object o)
    {
        long hash = (long) MurmurHash.getInstance().hash(o);
        // Only the low 32 bits are used, read as an unsigned value, so the bit index is never negative.
        int bit = (int) ((hash & 0xFFFFFFFFL) % (long) length);
        int i = bit / 8;
        byte b = map[i];
        byte mask = (byte) (1 << (bit % 8));
        if ((mask & b) != 0)
        {
            return false;
        }
        map[i] = (byte) (b | mask);
        count--;
        return true;
    }

    /**
     * @return size of the map in bytes
     */
    @Override
    public int sizeof()
    {
        return map.length;
    }

    /**
     * Recounts the unset bits by scanning the whole map.
     *
     * @return number of bits that are not set in the map
     */
    public int computeCount()
    {
        int c = 0;
        for (byte b : map)
        {
            // Mask to 8 bits so the sign extension of negative bytes is not counted.
            c += Integer.bitCount(b & 0xFF);
        }
        return length - c;
    }

    /**
     * @return (# set bits) / (total # of bits)
     */
    public double getUtilization()
    {
        return (length - count) / (double) length;
    }

    /**
     * @return number of bits left unset in the map
     */
    public int getCount()
    {
        return count;
    }

    /**
     * @return {@code true} if no unset bits are left in the map
     */
    public boolean isSaturated()
    {
        return (count == 0);
    }

    /**
     * For debug purposes
     *
     * @return the map rendered as a string of '0' and '1' characters, exactly 8 per byte
     *         (most significant bit of each byte first), in array order
     */
    protected String mapAsBitString()
    {
        StringBuilder sb = new StringBuilder();
        for (byte b : map)
        {
            // Mask to the low 8 bits: Integer.toBinaryString on a negative byte would otherwise
            // render the sign-extended 32-bit value and break the 8-characters-per-byte layout.
            String bits = Integer.toBinaryString(b & 0xFF);
            for (int pad = bits.length(); pad < 8; pad++)
            {
                sb.append('0');
            }
            sb.append(bits);
        }
        return sb.toString();
    }

    /**
     * Merges this estimator with the given ones into a new estimator; neither this estimator
     * nor the arguments are modified.
     *
     * @param estimators estimators to merge with this one
     * @return a new estimator for the combined streams; if estimators is null or no arguments
     *         are passed, an independent copy of this estimator
     * @throws LinearCountingMergeException if estimators are not mergeable (all estimators must be instances of LinearCounting of the same size)
     */
    @Override
    public IRichCardinality merge(IRichCardinality... estimators) throws LinearCountingMergeException
    {
        if (estimators == null)
        {
            // Copy the bitmap: sharing it would let later offers on one estimator
            // invalidate the unset-bit count cached by the other.
            return new LinearCounting(map.clone());
        }
        // Validate up front so that a foreign (or null) estimator is reported through the
        // documented exception instead of an ArrayStoreException from Arrays.copyOf.
        for (IRichCardinality estimator : estimators)
        {
            if (!(estimator instanceof LinearCounting))
            {
                throw new LinearCountingMergeException("Cannot merge estimators of different class");
            }
        }
        LinearCounting[] lcs = Arrays.copyOf(estimators, estimators.length + 1, LinearCounting[].class);
        lcs[lcs.length - 1] = this;
        return LinearCounting.mergeEstimators(lcs);
    }

    /**
     * Merges estimators to produce an estimator for their combined streams.
     * The merged bitmap is the bitwise OR of all input bitmaps.
     *
     * @param estimators estimators to merge
     * @return merged estimator or null if no estimators were provided
     * @throws LinearCountingMergeException if estimators are not mergeable (all estimators must be the same size)
     */
    public static LinearCounting mergeEstimators(LinearCounting... estimators) throws LinearCountingMergeException
    {
        if (estimators == null || estimators.length == 0)
        {
            return null;
        }
        int size = estimators[0].map.length;
        byte[] mergedBytes = new byte[size];
        for (LinearCounting estimator : estimators)
        {
            if (estimator.map.length != size)
            {
                throw new LinearCountingMergeException("Cannot merge estimators of different sizes");
            }
            for (int b = 0; b < size; b++)
            {
                mergedBytes[b] |= estimator.map[b];
            }
        }
        return new LinearCounting(mergedBytes);
    }

    /**
     * Thrown when LinearCounting estimators cannot be merged.
     */
    @SuppressWarnings("serial")
    protected static class LinearCountingMergeException extends CardinalityMergeException
    {
        public LinearCountingMergeException(String message)
        {
            super(message);
        }
    }

    /**
     * Builder of {@link LinearCounting} estimators with a fixed bitmap size.
     */
    public static class Builder implements IBuilder<IRichCardinality>, Serializable
    {
        private static final long serialVersionUID = -4245416224034648428L;

        /**
         * Taken from Table II of Whang et al.
         * Nine entries per decade of cardinality, starting at 100.
         */
        protected static final int[] onePercentErrorLength =
        {
            5034, 5067, 5100, 5133, 5166, 5199, 5231, 5264, 5296, // 100 - 900
            5329, 5647, 5957, 6260, 6556, 6847, 7132, 7412, 7688, // 1000 - 9000
            7960, 10506, 12839, 15036, 17134, 19156, 21117, 23029, 24897, // 10000 - 90000
            26729, 43710, 59264, 73999, 88175, 101932, 115359, 128514, 141441, // 100000 - 900000
            154171, 274328, 386798, 494794, 599692, 702246, 802931, 902069, 999894, // 1000000 - 9000000
            1096582 // 10000000
        };

        /**
         * Size in bytes of the bitmap of the estimators produced by this builder
         */
        protected final int size;

        /**
         * Creates a builder of estimators with a 65536-byte bitmap.
         */
        public Builder()
        {
            this(65536);
        }

        /**
         * @param size size in bytes of the bitmap of the estimators to build
         */
        public Builder(int size)
        {
            this.size = size;
        }

        /**
         * @return a new, empty estimator whose bitmap is {@code size} bytes long
         */
        @Override
        public LinearCounting build()
        {
            return new LinearCounting(size);
        }

        /**
         * @return size in bytes of the bitmap of the estimators produced by this builder
         */
        @Override
        public int sizeof()
        {
            return size;
        }

        /**
         * Returns a LinearCounting.Builder that generates an LC
         * estimator which keeps estimates below 1% error on average and has
         * a low likelihood of saturation (0.7%) for any stream with
         * cardinality less than maxCardinality
         *
         * @param maxCardinality maximum expected cardinality
         * @return a builder sized for the given cardinality
         * @throws IllegalArgumentException if maxCardinality is not a positive integer
         */
        public static Builder onePercentError(int maxCardinality)
        {
            if (maxCardinality <= 0)
            {
                throw new IllegalArgumentException("maxCardinality (" + maxCardinality + ") must be a positive integer");
            }

            int length;
            if (maxCardinality < 100)
            {
                length = onePercentErrorLength[0];
            }
            else if (maxCardinality < 10000000)
            {
                // Locate the two neighbouring table entries (same leading digit and the next one
                // within the decade) and interpolate linearly between them.
                int logscale = (int) Math.log10(maxCardinality);
                int scaleValue = (int) Math.pow(10, logscale);
                int scaleIndex = maxCardinality / scaleValue;
                int index = 9 * (logscale - 2) + (scaleIndex - 1);
                int lowerBound = scaleValue * scaleIndex;
                length = lerp(lowerBound, onePercentErrorLength[index], lowerBound + scaleValue, onePercentErrorLength[index + 1], maxCardinality);
            }
            else if (maxCardinality < 50000000)
            {
                length = lerp(10000000, 1096582, 50000000, 4584297, maxCardinality);
            }
            else if (maxCardinality < 100000000)
            {
                length = lerp(50000000, 4584297, 100000000, 8571013, maxCardinality);
            }
            else if (maxCardinality <= 120000000)
            {
                length = lerp(100000000, 8571013, 120000000, 10112529, maxCardinality);
            }
            else
            {
                length = maxCardinality / 12;
            }

            // The table is expressed in bits; the builder takes bytes.
            int sz = (int) Math.ceil(length / 8D);
            return new Builder(sz);
        }

        /**
         * Builds Linear Counter with arbitrary standard error and maximum expected cardinality.
         * <p>
         * This method is more compute intensive than {@link #onePercentError(int)} because it
         * solves the precision inequality at runtime. Therefore, {@link #onePercentError(int)}
         * should be used whenever possible.
         *
         * @param eps            standard error as a fraction (e.g. {@code 0.01} for 1%)
         * @param maxCardinality maximum expected cardinality
         * @return a builder sized for the given error and cardinality
         * @throws IllegalArgumentException if eps is not in the (0, 1) range or maxCardinality is not positive
         */
        public static Builder withError(double eps, int maxCardinality)
        {
            int sz = computeRequiredBitMaskLength(maxCardinality, eps);
            return new Builder((int) Math.ceil(sz / 8D));
        }

        /**
         * Runs binary search to find minimum bit mask length that holds precision inequality.
         * The search is limited to lengths between 1 and 100000000 bits.
         *
         * @param n   expected cardinality
         * @param eps desired standard error
         * @return minimal required bit mask length
         * @throws IllegalArgumentException if eps is not in the (0, 1) range or n is not positive
         */
        private static int computeRequiredBitMaskLength(double n, double eps)
        {
            // Conditions are written in the positive form so that NaN is rejected as well.
            if (!(eps > 0 && eps < 1))
            {
                throw new IllegalArgumentException("Epsilon should be in (0, 1) range");
            }
            if (!(n > 0))
            {
                throw new IllegalArgumentException("Cardinality should be positive");
            }

            int fromM = 1;
            int toM = 100000000;
            int m;
            double eq;
            do
            {
                m = (toM + fromM) / 2;
                eq = precisionInequalityRV(n / m, eps);
                if (m > eq)
                {
                    toM = m;
                }
                else
                {
                    fromM = m + 1;
                }
            } while (toM > fromM);

            return m > eq ? m : m + 1;
        }

        /**
         * Right-hand side of the precision inequality; a bit mask length satisfies the
         * inequality when it is greater than this value.
         *
         * @param t   load factor for linear counter
         * @param eps desired standard error
         */
        private static double precisionInequalityRV(double t, double eps)
        {
            return Math.max(1.0 / Math.pow(eps * t, 2), 5) * (Math.exp(t) - t - 1);
        }

        /**
         * Linear interpolation through the points (x0, y0) and (x1, y1), evaluated at x
         * and rounded up to the next integer.
         *
         * @param x0 abscissa of the first point
         * @param y0 ordinate of the first point
         * @param x1 abscissa of the second point
         * @param y1 ordinate of the second point
         * @param x  abscissa to evaluate at
         * @return linear interpolation
         */
        protected static int lerp(int x0, int y0, int x1, int y1, int x)
        {
            return (int) Math.ceil(y0 + (x - x0) * (double) (y1 - y0) / (x1 - x0));
        }
    }
}