package org.streaminer.stream.cardinality;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
/**
 * Flajolet-Martin algorithm approximates the number of unique objects in a
 * stream or a database in one pass.
 *
 * <p>The sketch keeps one boolean bitmap per hash function. Hash functions are
 * organised in groups: for every offered element each hash function sets the
 * bitmap position given by the number of trailing zero bits of the hash value.
 * To estimate the cardinality, the index of the first unset bit ({@code R}) is
 * averaged inside each group, the median of the group averages is taken, and
 * the estimate is {@code 2^R / PHI}.
 *
 * <p>This class performs no synchronization of its own.
 *
 * <p>Reference:
 * Flajolet, Philippe, and G. Nigel Martin. "Probabilistic counting algorithms
 * for data base applications." Journal of computer and system sciences 31.2
 * (1985): 182-209.
 *
 * <p>Source code: https://github.com/rbhide0/Columbus
 *
 * @author Ravi Bhide
 */
public class FlajoletMartin {
    /** Correction factor applied to {@code 2^R} when estimating the cardinality. */
    private static final double PHI = 0.77351D;

    /** Hash values are {@code long}s, so a bitmap can never use more than 64 positions. */
    private static final int MAX_BITMAP_SIZE = 64;

    private final int numHashGroups;
    private final int numHashFunctionsInHashGroup;
    private final int bitmapSize;

    /** Hash functions indexed by {@code [group][function]}. */
    private final HashFunction[][] hashes;

    /** Bitmaps indexed by {@code [group][function][bit]}. */
    private final boolean[][][] bitmaps;

    /**
     * Creates an empty sketch with randomly generated hash functions.
     *
     * @param bitmapSize number of positions in each bitmap, between 1 and 64
     * @param numHashGroups number of hash groups (the median is taken across groups), at least 1
     * @param numHashFunctionsInEachGroup number of hash functions averaged inside each group, at least 1
     * @throws IllegalArgumentException if any argument is outside its valid range
     */
    public FlajoletMartin(int bitmapSize, int numHashGroups, int numHashFunctionsInEachGroup) {
        // Validate before allocating anything so that bad arguments fail fast
        // instead of surfacing later as array index errors or NaN estimates.
        if (bitmapSize > MAX_BITMAP_SIZE) {
            throw new IllegalArgumentException("bitmap size should be at max. 64");
        }
        if (bitmapSize < 1) {
            throw new IllegalArgumentException("bitmap size should be at least 1, got " + bitmapSize);
        }
        if (numHashGroups < 1) {
            throw new IllegalArgumentException(
                    "number of hash groups should be at least 1, got " + numHashGroups);
        }
        if (numHashFunctionsInEachGroup < 1) {
            throw new IllegalArgumentException(
                    "number of hash functions in each group should be at least 1, got "
                            + numHashFunctionsInEachGroup);
        }

        this.numHashGroups = numHashGroups;
        this.numHashFunctionsInHashGroup = numHashFunctionsInEachGroup;
        this.bitmapSize = bitmapSize;
        this.bitmaps = new boolean[numHashGroups][numHashFunctionsInEachGroup][bitmapSize];
        this.hashes = new HashFunction[numHashGroups][numHashFunctionsInEachGroup];
        generateHashFunctions();
    }

    /** Fills {@link #hashes} with hash functions whose (m, n) coefficient pairs are all distinct. */
    private void generateHashFunctions() {
        Map<Integer, Collection<Integer>> usedPairs = new HashMap<Integer, Collection<Integer>>();
        for (int group = 0; group < numHashGroups; group++) {
            for (int fn = 0; fn < numHashFunctionsInHashGroup; fn++) {
                hashes[group][fn] = generateUniqueHashFunction(usedPairs);
            }
        }
    }

    /**
     * Creates a hash function from a random pair of odd coefficients that has
     * not been handed out before, and records the pair in {@code mnMap}.
     *
     * @param mnMap coefficient pairs already in use, keyed by m with the set of n values
     * @return a hash function using a previously unused (m, n) pair
     */
    private HashFunction generateUniqueHashFunction(Map<Integer, Collection<Integer>> mnMap) {
        int m = randomOddInt();

        // Keep drawing n until the (m, n) pair is one we haven't seen before.
        int n;
        do {
            n = randomOddInt();
        } while (contains(mnMap, m, n));

        // Make a note of the (m, n) pair, so we don't use it again.
        Collection<Integer> seenForM = mnMap.get(m);
        if (seenForM == null) {
            seenForM = new HashSet<Integer>();
            mnMap.put(m, seenForM);
        }
        seenForM.add(n);

        return new HashFunction(m, n);
    }

    /** Returns a random odd integer in the range {@code [1, Integer.MAX_VALUE)}. */
    private static int randomOddInt() {
        int candidate;
        do {
            candidate = (int) (Integer.MAX_VALUE * Math.random());
        } while (candidate % 2 == 0);
        return candidate;
    }

    /** Returns whether the pair (m, n) is already recorded in {@code map}. */
    private static boolean contains(Map<Integer, Collection<Integer>> map, int m, int n) {
        Collection<Integer> valuesForM = map.get(m);
        return (valuesForM != null) && valuesForM.contains(n);
    }

    /**
     * Adds an element to the sketch.
     *
     * @param o the element to add; must not be {@code null}
     * @return {@code true} if at least one bitmap position was newly set,
     *         {@code false} if the sketch is unchanged
     * @throws NullPointerException if {@code o} is {@code null}
     */
    public boolean offer(Object o) {
        boolean affected = false;
        for (int group = 0; group < numHashGroups; group++) {
            for (int fn = 0; fn < numHashFunctionsInHashGroup; fn++) {
                int index = rho(hashes[group][fn].hash(o));
                if (!bitmaps[group][fn][index]) {
                    bitmaps[group][fn][index] = true;
                    affected = true;
                }
            }
        }
        return affected;
    }

    /**
     * Estimates the number of distinct elements offered so far.
     *
     * @return the estimate {@code 2^R / PHI} truncated to a {@code long}, where
     *         {@code R} is the median over groups of the per-group average
     *         index of the first unset bitmap position
     */
    public long cardinality() {
        List<Double> averageR = new ArrayList<Double>(numHashGroups);
        for (int group = 0; group < numHashGroups; group++) {
            int sumR = 0;
            for (int fn = 0; fn < numHashFunctionsInHashGroup; fn++) {
                sumR += getFirstZeroBit(bitmaps[group][fn]);
            }
            averageR.add(sumR / (double) numHashFunctionsInHashGroup);
        }

        // Median of the group averages. For an even count the two middle
        // elements sit at mid-1 and mid; for an odd count the middle is mid.
        Collections.sort(averageR);
        int size = averageR.size();
        int mid = size / 2;
        double r;
        if (size % 2 == 0) {
            r = (averageR.get(mid - 1) + averageR.get(mid)) / 2;
        } else {
            r = averageR.get(mid);
        }
        return (long) (Math.pow(2, r) / PHI);
    }

    /**
     * Returns the number of trailing zero bits of {@code v}, or 0 when the
     * lowest {@code bitmapSize} bits of {@code v} are all zero, so the result
     * is always a valid bitmap index.
     */
    private int rho(long v) {
        int trailingZeros = Long.numberOfTrailingZeros(v);
        return trailingZeros >= bitmapSize ? 0 : trailingZeros;
    }

    /** Returns the index of the first {@code false} entry, or {@code b.length} if every entry is set. */
    private static int getFirstZeroBit(boolean[] b) {
        for (int i = 0; i < b.length; i++) {
            if (!b[i]) {
                return i;
            }
        }
        return b.length;
    }

    /** Linear hash {@code m + n * hashCode} evaluated in {@code long} arithmetic. */
    private static class HashFunction {
        private final int m;
        private final int n;

        HashFunction(int m, int n) {
            this.m = m;
            this.n = n;
        }

        /**
         * Hashes an arbitrary object. Numbers are hashed through the hash code
         * of their string form; every other object through its own
         * {@code hashCode()}.
         */
        public long hash(Object o) {
            if (o instanceof Number) {
                return hash(String.valueOf(o).hashCode());
            }
            return hash(o.hashCode());
        }

        /** Applies the linear function to an already computed hash code. */
        public long hash(long hashCode) {
            return m + n * hashCode;
        }
    }
}