package org.streaminer.stream.frequency;
import org.streaminer.stream.frequency.util.CountEntry;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
* Implementation of the "Sticky Sampling" algorithm as described in the paper
* "Approximate Frequency Counts over Data Streams" written by Gurmeet Singh Manku
* and Rajeev Motwani
*
* @author Benedikt Kulmann
*/
public class StickySampling<T> extends BaseFrequency<T> {
private double support;
private double error;
/**
* <p>Elements which are not yet existing in the data structure will be added to it
* with probablity <code>1/r</code>.</p>
*
* <p><code>(samplingRate * t)</code> is the number of items
* until the {@link #adaptNewSamplingRate()} method will be invoked since its last invocation.</p>
*/
private long samplingRate;
/**
* <p>Calculated at object creation time<br />
* have a look at the {@link #samplingRate} for further documentation</p>
*/
private final double t;
/**
* <p>Counter for the current sampling interval so that it is possible to determine whether
* the {@link #adaptNewSamplingRate()} method has to be invoked.</p>
*/
private long windowCount;
/**
* <p>The length of the current "sampling window", determined with <code>(samplingRate * t)</code></p>
*/
private long windowLength;
/**
* <p>The data structure which holds all counting information.</p>
*/
private final Map<T, CountEntry<T>> dataStructure;
/**
* The total count of all counted elements
* in the stream so far.
*/
private long elementsCounted;
/**
* <p>Creates a new instance of StickySampling.</p>
*
* @param support The threshold whether an element is frequent or not. Has to be out of (0,1).
* @param error An epsilon for the threshold. Has to be out of (0,1).
* @param probabilityOfFailure Probability for an item to fail to fulfill the three quality characteristics of this algorithm. Has to be out of (0,1).
*/
public StickySampling(double support, double error, double probabilityOfFailure) {
super(support);
if (support <= 0 || support >= 1) {
throw new IllegalArgumentException("Support has to be > 0 and < 1.");
}
if (error <= 0 || error >= 1) {
throw new IllegalArgumentException("Error has to be > 0 and < 1.");
}
if (probabilityOfFailure <= 0 || probabilityOfFailure >= 1) {
throw new IllegalArgumentException("Probability of failure has to be > 0 and < 1.");
}
this.support = support;
this.error = error;
this.samplingRate = 1;
this.t = (1 / error) * Math.log(1 / (support * probabilityOfFailure));
this.windowCount = 0;
this.windowLength = (long)(2 * t);//only on initialization. Later this is calculated by (samplingRate * t)
this.elementsCounted = 0;
this.dataStructure = new ConcurrentHashMap<T, CountEntry<T>>();
}
@Override
public boolean add(T item, long incrementCount) {
boolean newItem = true;
if (containsItem(item)) {
incrementCount(item, incrementCount);
newItem = false;
} else {
if(sample()) {
insertItem(item, incrementCount);
}
}
windowCount++;
if (changeOfSamplingRateNeeded()) {
changeSamplingRate();
adaptNewSamplingRate();
}
return newItem;
}
@Override
public long estimateCount(T item) {
if (dataStructure.containsKey(item)) {
return dataStructure.get(item).frequency;
} else {
return 0L;
}
}
public boolean contains(T item) {
return dataStructure.containsKey(item);
}
@Override
public long size() {
return elementsCounted;
}
@Override
public Set<T> keySet() {
return dataStructure.keySet();
}
public List<CountEntry<T>> getFrequentItems(double minSupport) {
List<CountEntry<T>> frequentItems = new ArrayList<CountEntry<T>>();
for (CountEntry<T> entry : dataStructure.values()) {
if (isFrequent(entry.frequency)) {
frequentItems.add(entry);
}
}
return frequentItems;
}
/**
* <p>Returns whether the provided frequency is a frequent one in terms of sticky sampling.</p>
*
* @param frequency The frequency which shall be tested
* @return True if the frequency would classify an item as frequent in terms of sticky sampling, false otherwise
*/
public boolean isFrequent(long frequency) {
return frequency >= (support - error) * elementsCounted;
}
/**
* <p>Decision whether a new item should be put into the data structure.</p>
*
* @return Whether an item should be put into the data structure
*/
private boolean sample() {
return Math.random() <= 1 / (double)samplingRate;
}
/**
* <p>If the end of the current "sampling window" is reached, reset the current sampling window counter
* and increment the sampling rate.</p>
*
* @return Whether a change of the current sampling rate is needed
*/
private boolean changeOfSamplingRateNeeded() {
return windowCount == windowLength;
}
/**
* <p>Makes changes to the sampling rate.</p>
*
* <p>Changes to the data structure will be performed by {@link #adaptNewSamplingRate()}</p>
*/
private void changeSamplingRate() {
windowCount = 0;
samplingRate *= 2;
windowLength = (long)(samplingRate * t);
}
/**
* <p>Diminish counts and remove elements which reach a count of 0.</p>
*
* <p>This transforms the data structure such that it contains elements
* which would also have been sampled with the new sampling rate, only.</p>
*
* <p>The modification of the sampling rate itself is performed by {@link #changeSamplingRate()}.</p>
*/
private void adaptNewSamplingRate() {
for (T item : dataStructure.keySet()) {
while(tossCoin()) {
decrementCount(item);
if(frequencyIsZero(item)) {
removeItem(item);
break;
}
}
}
}
/**
* <p>50:50 random event</p>
*
* @return Result of 50:50 random event
*/
private boolean tossCoin() {
return Math.random() < 0.5;
}
/**
* <p>Removes the {@link CountEntry} associated with the provided item from the internal
* data structure.</p>
*
* @param itemToRemove The item whose {@link CountEntry} shall be removed
*/
private void removeItem(T itemToRemove) {
dataStructure.remove(itemToRemove);
}
/**
* <p>Returns whether the internal data structure contains a counter for the provided item.</p>
*
* @param item The item in question
* @return True if the internal data structure contains a counter for the provided item, false otherwise.
*/
private boolean containsItem(T item) {
return dataStructure.containsKey(item);
}
/**
* <p>Increment the count frequency of the provided item by 1.</p>
*
* @param item The item whose frequency shall be incremented by 1.
*/
private void incrementCount(T item, long incrementCount) {
dataStructure.get(item).frequency += incrementCount;
elementsCounted++;
}
/**
* <p>Decrements the count frequency of the provided item by 1. Used within the
* {@link StickySampling#adaptNewSamplingRate()} method of the algorithm.</p>
*
* @param item The item whose count frequency shall be decremented by 1.
*/
private void decrementCount(T item) {
dataStructure.get(item).frequency--;
}
/**
* <p>Returns whether the count frequency of the provided item corresponds to 0
* (i.e. frequency == 0 or item doesn't exist within the internal data structure).</p>
*
* @param item The item in question.
* @return true if the count frequency of the provided item corresponds to 0
*/
private boolean frequencyIsZero(T item) {
return !dataStructure.containsKey(item) || dataStructure.get(item).frequency == 0;
}
/**
* Inserts the provided item into the internal data structure with an initial count of 1.
*
* @param item The item which shall be inserted into the internal data structure
*/
private void insertItem(T item, long incrementCount) {
dataStructure.put(item, new CountEntry<T>(item, incrementCount));
elementsCounted++;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("StickySamplingModel[");
for (T key : dataStructure.keySet()) {
sb.append(dataStructure.get(key)).append(";");
}
sb.append("]");
return sb.toString();
}
}