package org.streaminer.stream.cardinality;

import java.io.Serializable;
import java.util.*;
import org.streaminer.util.hash.function.HashFunction;
import org.streaminer.util.hash.function.MurmurHashFunction;

/**
 * Implementation of the BJKST algorithm for distinct counting.
 *
 * <p>The sketch runs {@code numMedians} estimators side by side. Each estimator
 * keeps a level {@code z} and a bounded buffer holding only the items whose
 * h-hash has at least {@code z} trailing zero bits. When a buffer grows past
 * {@code bufferSize}, the level is raised and entries below the new level are
 * evicted. An estimator's guess is {@code buffer.size() * 2^z}; the reported
 * cardinality is the median of all guesses.
 *
 * Source code: https://github.com/ananthc/streamstats
 *
 * Reference:
 *   Bar-Yossef, Ziv, et al. "Counting distinct elements in a data stream."
 *   Randomization and Approximation Techniques in Computer Science. Springer
 *   Berlin Heidelberg, 2002. 1-10.
 *
 * @author ananthc
 */
public class BJKST implements Serializable {
    private static final long serialVersionUID = -2032575802259420762L;

    /** Number of estimators whose guesses are combined by {@link #cardinality()}. */
    private int numMedians = 25;
    /** Numerator used to derive {@link #bufferSize} (bufferSize = sizeOfMedianSet / error^2). */
    private int sizeOfMedianSet;
    /** Allowed error; only used to size the buffers. */
    private double error = 0.02f;
    /** Current level {@code z} of each estimator. */
    private List<Integer> limits;
    /** Maximum number of entries an estimator's buffer may hold before its level is raised. */
    private int bufferSize = 100;
    /** Per-estimator buffers; each entry is the formatted g-hash followed by the formatted zero count. */
    private List<HashSet<String>> buffers;
    /** Hashers whose trailing zero bits decide whether an item is sampled. */
    private List<HashFunction<Object>> hHashers;
    /** Hashers whose value identifies an item inside a buffer. */
    private List<HashFunction<Object>> gHashers;
    /** Number of decimal digits of {@code Integer.MAX_VALUE}; width of each formatted field. */
    private int intLength = Integer.toString(Integer.MAX_VALUE).length();
    /** Zero-padding format string of width {@link #intLength}. */
    private String lengthOfIntegerRepresentation = null;

    /**
     * Creates a sketch with the default allowed error.
     *
     * @param numberOfMedianAttempts number of estimators to run
     * @param sizeOfEachMedianSet numerator of the buffer size computation
     * @throws IllegalArgumentException if either argument is negative
     */
    public BJKST(int numberOfMedianAttempts, int sizeOfEachMedianSet) {
        checkSizes(numberOfMedianAttempts, sizeOfEachMedianSet);
        this.numMedians = numberOfMedianAttempts;
        this.sizeOfMedianSet = sizeOfEachMedianSet;
        init();
    }

    /**
     * Creates a sketch with an explicit allowed error.
     *
     * @param numberOfMedianAttempts number of estimators to run
     * @param sizeOfEachMedianSet numerator of the buffer size computation
     * @param allowedError allowed error, within [0, 1]
     * @throws IllegalArgumentException if {@code allowedError} is NaN or outside
     *         [0, 1], or if either size argument is negative
     */
    public BJKST(int numberOfMedianAttempts, int sizeOfEachMedianSet, double allowedError) {
        // NaN fails both range comparisons, so it has to be rejected explicitly.
        if (Double.isNaN(allowedError) || allowedError < 0 || allowedError > 1) {
            throw new IllegalArgumentException("Permitted error should be < 1 and in float format");
        }
        checkSizes(numberOfMedianAttempts, sizeOfEachMedianSet);
        this.numMedians = numberOfMedianAttempts;
        this.sizeOfMedianSet = sizeOfEachMedianSet;
        this.error = allowedError;
        init();
    }

    /**
     * Rejects negative sizes. A negative buffer size could never be satisfied
     * by the eviction loop in {@link #offer(Object)}.
     */
    private static void checkSizes(int numberOfMedianAttempts, int sizeOfEachMedianSet) {
        if (numberOfMedianAttempts < 0) {
            throw new IllegalArgumentException(
                    "Number of median attempts must not be negative: " + numberOfMedianAttempts);
        }
        if (sizeOfEachMedianSet < 0) {
            throw new IllegalArgumentException(
                    "Size of each median set must not be negative: " + sizeOfEachMedianSet);
        }
    }

    /** Derives the buffer size and allocates the per-estimator state. */
    private void init() {
        this.bufferSize = (int) ((this.sizeOfMedianSet) / Math.pow(this.error, 2.0));
        lengthOfIntegerRepresentation = ("%0" + intLength + "d");

        limits = new ArrayList<Integer>(numMedians);
        buffers = new ArrayList<HashSet<String>>(numMedians);
        hHashers = new ArrayList<HashFunction<Object>>(numMedians);
        gHashers = new ArrayList<HashFunction<Object>>(numMedians);

        // NOTE(review): every hasher is created with the no-arg constructor. If
        // MurmurHashFunction is not randomly seeded, all estimators (and h/g)
        // compute the same hash and the median adds nothing -- confirm.
        for (int i = 0; i < numMedians; i++) {
            limits.add(0);
            buffers.add(new HashSet<String>());
            hHashers.add(new MurmurHashFunction<Object>());
            gHashers.add(new MurmurHashFunction<Object>());
        }
    }

    /**
     * Feeds one item to every estimator.
     *
     * @param o the item to count
     */
    public void offer(Object o) {
        for (int i = 0; i < numMedians; i++) {
            long hHash = hHashers.get(i).hash(o);
            // Exact number of trailing zero bits (the former string arithmetic was off by one).
            int zeros = Long.numberOfTrailingZeros(hHash);
            int currentZ = limits.get(i);
            if (zeros < currentZ) {
                continue; // not sampled at the current level
            }

            HashSet<String> currentBuffer = buffers.get(i);
            currentBuffer.add(String.format(lengthOfIntegerRepresentation, gHashers.get(i).hash(o))
                    + String.format(lengthOfIntegerRepresentation, zeros));

            // Raise the level until the buffer fits again. Zero counts never
            // exceed 64, so the buffer is guaranteed to drain eventually.
            while (currentBuffer.size() > bufferSize) {
                currentZ = currentZ + 1;
                for (Iterator<String> itr = currentBuffer.iterator(); itr.hasNext();) {
                    String element = itr.next();
                    // The zero count is always the last intLength characters. The
                    // g-hash in front may print wider than intLength (e.g.
                    // Integer.MIN_VALUE), so a fixed offset from the start is unsafe.
                    long zeroesOld = Long.parseLong(element.substring(element.length() - intLength));
                    if (zeroesOld < currentZ) {
                        itr.remove();
                    }
                }
            }

            // Persist the level: it drives both the sampling test above and the estimate.
            limits.set(i, currentZ);
        }
    }

    /**
     * Returns the estimated number of distinct items offered so far.
     *
     * @return the median of the per-estimator guesses {@code buffer.size() * 2^z}
     *         (the upper median when the number of estimators is even), or 0
     *         when there are no estimators
     */
    public long cardinality() {
        if (numMedians <= 0) {
            return 0;
        }

        long[] estimates = new long[numMedians];
        for (int i = 0; i < numMedians; i++) {
            // Computed as long so large estimates are not clipped to Integer.MAX_VALUE.
            estimates[i] = (long) (buffers.get(i).size() * Math.pow(2, limits.get(i)));
        }

        Arrays.sort(estimates);
        return estimates[numMedians / 2];
    }
}