package org.streaminer.stream.frequency;

import java.security.SecureRandom;
import org.streaminer.util.ArrayUtils;
import org.streaminer.util.hash.HashUtils;

/**
 * "Tug of War" sketches due to Alon, Matias and Szegedy 96, and Alon, Gibbons,
 * Matias and Szegedy, 99. Some hashing tricks used for faster implementation.
 * They support: returning point estimates, approximating inner-products,
 * estimating the L2 norm of a vector.
 *
 * Using pairwise hash functions to speed up updates, Graham Cormode 2003
 * Original implementation: http://www.cs.rutgers.edu/~muthu/massdal-code-index.html
 *
 * <p>The sketch is stored as {@code depth} consecutive rows of {@code buckets}
 * signed counters inside one flat array. Each row has its own six random seeds:
 * two are passed to {@code HashUtils.hash31} to pick the bucket and four are
 * passed to {@code HashUtils.fourwise} to pick the sign of the update.
 *
 * @author Maycon Viana Bordin <mayconbordin@gmail.com>
 */
public class AMSSketch implements ISimpleFrequency<Long>, Comparable<AMSSketch> {
    /** Number of random seeds kept per row (2 for the bucket hash, 4 for the sign hash). */
    private static final int SEEDS_PER_ROW = 6;

    private final int depth;
    private final int buckets;

    /** Running sum of all increments passed to {@link #add(Long, long)}. */
    private long count = 0;

    /** Row-major counters: row {@code r} occupies {@code [r*buckets, (r+1)*buckets)}. */
    private final int[] counts;

    /** Hash seeds, indexed as {@code test[seedIndex][row]}. */
    private final long[][] test;

    private final SecureRandom prng = new SecureRandom();

    /**
     * Creates an empty sketch and draws fresh random hash seeds for every row.
     *
     * @param depth   number of rows
     * @param buckets number of counters per row
     */
    public AMSSketch(int depth, int buckets) {
        this.depth = depth;
        this.buckets = buckets;

        counts = new int[buckets * depth];
        test = new long[SEEDS_PER_ROW][depth];

        for (int row = 0; row < depth; row++) {
            for (int seed = 0; seed < SEEDS_PER_ROW; seed++) {
                // Clear the sign bit instead of negating: -Long.MIN_VALUE is still
                // negative, so negation cannot guarantee a non-negative seed.
                test[seed][row] = prng.nextLong() & Long.MAX_VALUE;
            }
        }
    }

    /**
     * Adds a single occurrence of {@code item}.
     *
     * @param item the item to record
     * @return always {@code true}
     */
    public boolean add(Long item) {
        return add(item, 1);
    }

    /**
     * Adds {@code incrementCount} occurrences of {@code item} to every row.
     *
     * @param item           the item to record
     * @param incrementCount amount to add (the per-bucket counters are {@code int},
     *                       so the value is narrowed when applied to them)
     * @return always {@code true}
     */
    public boolean add(Long item, long incrementCount) {
        count += incrementCount;

        int offset = 0;
        for (int row = 0; row < depth; row++) {
            int bucket = bucketOf(row, item);

            // The low bit of the four-seed hash chooses the sign of the update.
            if (signIsPositive(row, item)) {
                counts[offset + bucket] += incrementCount;
            } else {
                counts[offset + bucket] -= incrementCount;
            }
            offset += buckets;
        }
        return true;
    }

    /**
     * Orders sketches by bucket count, then depth, then hash seeds. A result of
     * {@code 0} means both sketches have the same dimensions and the same seeds;
     * counter contents are not compared.
     *
     * @param o the sketch to compare with
     * @return {@code -1}, {@code 0} or {@code 1}
     */
    @Override
    public int compareTo(AMSSketch o) {
        if (buckets != o.buckets) {
            return (buckets > o.buckets) ? 1 : -1;
        }
        if (depth != o.depth) {
            return (depth > o.depth) ? 1 : -1;
        }
        for (int row = 0; row < depth; row++) {
            for (int seed = 0; seed < SEEDS_PER_ROW; seed++) {
                if (test[seed][row] != o.test[seed][row]) {
                    return (test[seed][row] > o.test[seed][row]) ? 1 : -1;
                }
            }
        }
        return 0;
    }

    /**
     * Returns a point estimate for {@code item}: the signed counter of the item's
     * bucket is read from every row and the per-row values are combined (single
     * value for depth 1, mean for depth 2, {@code ArrayUtils.medSelect} otherwise).
     *
     * @param item the item to look up
     * @return the estimated count
     */
    public long estimateCount(Long item) {
        // Index 0 is unused; the selection helper is called with 1-based positions.
        int[] estimates = new int[depth + 1];

        int offset = 0;
        for (int row = 1; row <= depth; row++) {
            int bucket = bucketOf(row - 1, item);
            int stored = counts[offset + bucket];
            estimates[row] = signIsPositive(row - 1, item) ? stored : -stored;
            offset += buckets;
        }

        if (depth == 1) {
            return estimates[1];
        }
        if (depth == 2) {
            // Average in long so the sum of two large ints cannot wrap around.
            return ((long) estimates[1] + estimates[2]) / 2;
        }
        return ArrayUtils.medSelect(1 + depth / 2, depth, estimates);
    }

    /**
     * Returns the running sum of all increments passed to {@link #add(Long, long)}.
     * Merging via {@link #add(AMSSketch)} or {@link #subtract(AMSSketch)} does not
     * change this value.
     *
     * @return the accumulated increment total
     */
    public long size() {
        return count;
    }

    /**
     * Estimates the F2 moment of the vector (sum of squares): each row's counters
     * are squared and summed, then the per-row sums are combined.
     *
     * @return the estimated second frequency moment
     */
    public long estimateF2() {
        long[] estimates = new long[depth + 1];

        int pos = 0;
        for (int row = 1; row <= depth; row++) {
            long sum = 0;
            for (int j = 0; j < buckets; j++) {
                // Widen before multiplying; int * int would overflow for large counters.
                long c = counts[pos];
                sum += c * c;
                pos++;
            }
            estimates[row] = sum;
        }
        return combineRows(estimates);
    }

    /**
     * Estimates the inner product of two vectors using their sketches.
     *
     * @param b the other sketch
     * @return the estimated inner product, or {@code 0} when {@code b} does not
     *         compare equal to this sketch (different dimensions or seeds)
     */
    public long innerProduct(AMSSketch b) {
        if (this.compareTo(b) != 0) {
            return 0;
        }

        long[] estimates = new long[depth + 1];

        int pos = 0;
        for (int row = 1; row <= depth; row++) {
            long sum = 0;
            for (int j = 0; j < buckets; j++) {
                // Widen before multiplying; int * int would overflow for large counters.
                sum += (long) counts[pos] * b.counts[pos];
                pos++;
            }
            estimates[row] = sum;
        }
        return combineRows(estimates);
    }

    /**
     * Adds the counters of {@code source} to this sketch, element by element.
     *
     * @param source the sketch to add
     * @return {@code false} (and no change) when {@code source} does not compare
     *         equal to this sketch, {@code true} otherwise
     */
    public boolean add(AMSSketch source) {
        if (this.compareTo(source) != 0) {
            return false;
        }

        int pos = 0;
        for (int row = 0; row < source.depth; row++) {
            for (int j = 0; j < source.buckets; j++) {
                counts[pos] += source.counts[pos];
                pos++;
            }
        }
        return true;
    }

    /**
     * Subtracts the counters of {@code source} from this sketch, element by element.
     *
     * @param source the sketch to subtract
     * @return {@code false} (and no change) when {@code source} does not compare
     *         equal to this sketch, {@code true} otherwise
     */
    public boolean subtract(AMSSketch source) {
        if (this.compareTo(source) != 0) {
            return false;
        }

        int pos = 0;
        for (int row = 0; row < source.depth; row++) {
            for (int j = 0; j < source.buckets; j++) {
                counts[pos] -= source.counts[pos];
                pos++;
            }
        }
        return true;
    }

    /**
     * Reports whether the estimated count of {@code item} is strictly positive.
     *
     * @param item the item to look up
     * @return {@code true} when {@link #estimateCount(Long)} is greater than zero
     */
    public boolean contains(Long item) {
        return estimateCount(item) > 0;
    }

    /** Returns the bucket index of {@code item} within the given zero-based row. */
    private int bucketOf(int row, Long item) {
        int hash = (int) HashUtils.hash31(test[0][row], test[1][row], item);
        return hash % buckets;
    }

    /** Returns whether the sign hash of {@code item} in the given zero-based row has its low bit set. */
    private boolean signIsPositive(int row, Long item) {
        int mult = (int) HashUtils.fourwise(test[2][row], test[3][row], test[4][row], test[5][row], item);
        return (mult & 1) == 1;
    }

    /**
     * Combines 1-based per-row estimates into one value: the single value for
     * depth 1, the mean for depth 2, {@code ArrayUtils.longMedSelect} otherwise.
     */
    private long combineRows(long[] estimates) {
        if (depth == 1) {
            return estimates[1];
        }
        if (depth == 2) {
            return (estimates[1] + estimates[2]) / 2;
        }
        return ArrayUtils.longMedSelect(1 + depth / 2, depth, estimates);
    }
}