package org.streaminer.stream.frequency;
import java.util.Random;
import org.streaminer.util.ArrayUtils;
import org.streaminer.util.hash.HashUtils;
/**
* Count sketches from Charikar, Chen, Farach-Colton '02. They support: finding
* frequent items, returning point estimates.
*
* @author Maycon Viana Bordin <mayconbordin@gmail.com>
*/
public class CCFCSketch implements ISimpleFrequency<Integer> {
    /** Number of independent hash tests (rows of the sketch at each level). */
    private final int tests;
    /** Bit depth of the items; also the depth at which the prefix search starts. */
    private final int logn;
    /** Number of bits consumed per level of the prefix hierarchy. */
    private final int gran;
    /** Number of counters used by each test. */
    private final int buckets;
    /** Sum of all increments passed to {@link #add(Integer, long)}. */
    private long count;
    /**
     * Counters, indexed by {@code [level][test * buckets + bucket]}. Only the levels
     * {@code 0, gran, 2*gran, ...} below {@code logn} are ever written.
     */
    private final int[][] counts;
    /** Per-test hash parameters: (testa, testb) pick the bucket, (testc, testd) pick the sign. */
    private final int[] testa, testb, testc, testd;
    private final Random random = new Random();

    /**
     * Creates an empty sketch with randomly drawn hash parameters.
     *
     * @param buckets The number of buckets for each test
     * @param tests The number of tests
     * @param lgn The bit depth of the items which will arrive
     * @param gran The number of bits handled per level: gran = 1 means one bit at a
     *             time, gran = 8 means one byte at a time, etc. {@link #output(int)}
     *             additionally requires {@code lgn} to be an exact multiple of {@code gran}.
     * @throws IllegalArgumentException if {@code buckets}, {@code tests} or {@code gran}
     *         is not positive, or if {@code lgn} is negative
     */
    public CCFCSketch(int buckets, int tests, int lgn, int gran) {
        if (buckets < 1 || tests < 1 || gran < 1 || lgn < 0) {
            // gran <= 0 would make add() loop forever or index negatively;
            // buckets <= 0 would divide by zero when reducing the hash.
            throw new IllegalArgumentException("Invalid sketch parameters: buckets=" + buckets
                    + ", tests=" + tests + ", lgn=" + lgn + ", gran=" + gran);
        }

        this.tests = tests;
        this.logn = lgn;
        this.gran = gran;
        this.buckets = buckets;
        this.count = 0;

        // create space for the hash functions
        testa = new int[tests];
        testb = new int[tests];
        testc = new int[tests];
        testd = new int[tests];

        // one row of (buckets * tests) counters per possible level
        counts = new int[lgn + 1][buckets * tests];

        for (int i = 0; i < tests; i++) {
            // Clearing the sign bit always yields a non-negative value; negating a
            // negative draw does not, because -Integer.MIN_VALUE == Integer.MIN_VALUE.
            testa[i] = random.nextInt() & Integer.MAX_VALUE;
            testb[i] = random.nextInt() & Integer.MAX_VALUE;
            testc[i] = random.nextInt() & Integer.MAX_VALUE;
            testd[i] = random.nextInt() & Integer.MAX_VALUE;
        }
    }

    /**
     * Adds one occurrence of {@code item}.
     *
     * @param item the item to add
     * @return always {@code true}
     */
    public boolean add(Integer item) {
        return add(item, 1);
    }

    /**
     * Adds {@code incrementCount} occurrences of {@code item}, updating one counter
     * per test at every level of the prefix hierarchy.
     *
     * @param item the item to add
     * @param incrementCount the amount to add (applied with a per-test sign); the
     *        individual counters are {@code int}, so the increment is narrowed to
     *        {@code int} when applied to them
     * @return always {@code true}
     * @throws NullPointerException if {@code item} is {@code null}
     */
    public boolean add(Integer item, long incrementCount) {
        // Unbox once instead of re-boxing on every shift; a null item fails here,
        // before any state has been modified.
        int prefix = item;
        count += incrementCount;

        for (int level = 0; level < logn; level += gran) {
            int[] row = counts[level];
            int offset = 0;
            for (int j = 0; j < tests; j++) {
                // NOTE(review): assumes HashUtils.hash31 returns a non-negative value
                // that fits in an int, so the remainder is a valid bucket index.
                int hash = (int) HashUtils.hash31(testa[j], testb[j], prefix);
                hash = hash % buckets;
                int mult = (int) HashUtils.hash31(testc[j], testd[j], prefix);
                // the low bit of the second hash decides the sign of the update
                if ((mult & 1) == 1) {
                    row[offset + hash] += incrementCount;
                } else {
                    row[offset + hash] -= incrementCount;
                }
                offset += buckets;
            }
            // drop the lowest 'gran' bits to obtain the prefix for the next level
            prefix >>= gran;
        }
        return true;
    }

    /**
     * Estimates the count of {@code item} using the level-0 counters.
     *
     * @param item the item to look up
     * @return the estimated count
     */
    public long estimateCount(Integer item) {
        return estimateCount(item, 0);
    }

    /**
     * Estimates the count of a prefix at the given level of the hierarchy.
     *
     * @param item the item (at depth 0) or the item prefix (at higher depths)
     * @param depth the level to query; {@code logn} denotes the root, for which the
     *        total count is returned
     * @return the total count when {@code depth == logn}; otherwise the single
     *         estimate (one test), the average of the two estimates (two tests), or
     *         the value chosen by {@code ArrayUtils.medSelect} (three or more tests)
     */
    public long estimateCount(Integer item, int depth) {
        if (depth == logn) {
            return count;
        }

        int key = item;
        // index 0 is left unused: estimates are stored at 1..tests, as medSelect is called with
        int[] estimates = new int[tests + 1];
        int[] row = counts[depth];
        int offset = 0;

        for (int i = 1; i <= tests; i++) {
            int hash = (int) HashUtils.hash31(testa[i - 1], testb[i - 1], key);
            hash = hash % buckets;
            int mult = (int) HashUtils.hash31(testc[i - 1], testd[i - 1], key);
            // undo the sign that add() applied for this test
            if ((mult & 1) == 1) {
                estimates[i] = row[offset + hash];
            } else {
                estimates[i] = -row[offset + hash];
            }
            offset += buckets;
        }

        if (tests == 1) {
            return estimates[1];
        }
        if (tests == 2) {
            // average in long so that the sum cannot overflow int
            return ((long) estimates[1] + estimates[2]) / 2;
        }
        // NOTE(review): presumably selects the median of estimates[1..tests] - confirm in ArrayUtils
        return ArrayUtils.medSelect(1 + tests / 2, tests, estimates);
    }

    /**
     * Descends the prefix hierarchy, appending to {@code results} every depth-0 item
     * whose estimated count reaches {@code thresh}. Subtrees whose prefix estimate is
     * below the threshold are not explored.
     *
     * @param depth the current level ({@code logn} at the root, 0 at the leaves)
     * @param start the prefix being examined at this level
     * @param thresh the minimum estimated count to keep descending / report
     * @param results shared accumulator: {@code results[0]} is the number of items
     *        found so far, {@code results[1..results[0]]} are the items
     */
    private void recursive(int depth, int start, int thresh, int[] results) {
        if (estimateCount(start, depth) < thresh) {
            return;
        }

        if (depth == 0) {
            // report at most 'buckets' items
            if (results[0] < buckets) {
                results[0]++;
                results[results[0]] = start;
            }
            return;
        }

        int blocksize = 1 << gran;
        int itemshift = start << gran;
        // assumes that the bit depth is an exact multiple of gran,
        // so that the descent lands exactly on depth 0
        for (int i = 0; i < blocksize; i++) {
            recursive(depth - gran, itemshift + i, thresh, results);
        }
    }

    /**
     * Finds the items whose estimated count is at least {@code thresh}.
     * <p>
     * Requires the bit depth to be an exact multiple of the granularity; otherwise
     * the descent skips depth 0 and indexes a negative level.
     *
     * @param thresh the minimum estimated count for an item to be reported
     * @return an array of length {@code buckets + 1} whose element 0 holds the number
     *         of items found (at most {@code buckets}) and whose elements
     *         {@code 1..result[0]} hold the items; remaining elements are 0
     */
    public int[] output(int thresh) {
        // slot 0 is the counter, so 'buckets' items need buckets + 1 slots
        int[] results = new int[buckets + 1];
        recursive(logn, 0, thresh, results);
        return results;
    }

    /**
     * Estimates the second frequency moment from the level-0 counters: for each test
     * the squares of its counters are summed, and the per-test sums are combined the
     * same way as in {@link #estimateCount(Integer, int)} (single value, average of
     * two, or {@code ArrayUtils.longMedSelect}).
     *
     * @return the estimated second frequency moment
     */
    public long estimateF2() {
        // index 0 is left unused: estimates are stored at 1..tests
        long[] estimates = new long[tests + 1];
        int[] row = counts[0];
        int r = 0;

        for (int i = 1; i <= tests; i++) {
            long z = 0;
            // r keeps running across tests, walking the row test by test
            for (int j = 0; j < buckets; j++) {
                z += (long) row[r] * (long) row[r];
                r++;
            }
            estimates[i] = z;
        }

        if (tests == 1) {
            return estimates[1];
        }
        if (tests == 2) {
            return (estimates[1] + estimates[2]) / 2;
        }
        return ArrayUtils.longMedSelect(1 + tests / 2, tests, estimates);
    }

    /**
     * @return the sum of all increments added so far
     */
    public long size() {
        return count;
    }

    /**
     * @param item the item to look up
     * @return {@code true} if the estimated count of {@code item} is positive
     */
    public boolean contains(Integer item) {
        return estimateCount(item) > 0;
    }
}