package org.streaminer.stream.frequency;
import org.streaminer.stream.frequency.util.CountEntry;
import org.streaminer.stream.frequency.util.CountEntryWithMaxError;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
 * <p>
 * Implementation of the Lossy Counting algorithm described in the paper
 * "Approximate Frequency Counts over Data Streams" written by 'Rajeev Motwani' and
 * 'Gurmeet Singh Manku'.
 * </p>
 *
 * <p>
 * The stream is conceptually divided into buckets of {@code ceil(1 / maxError)}
 * elements. Every tracked item stores its counted frequency together with the
 * maximum number of occurrences that may have been missed before the entry was
 * (re-)inserted. Whenever a bucket boundary is passed, entries whose frequency
 * plus maximum error does not exceed the number of completed buckets are dropped.
 * </p>
 *
 * <p>
 * NOTE: the entries are kept in a {@link ConcurrentHashMap}, but the element
 * counter is a plain field that is updated without any synchronization, so this
 * class does not itself coordinate concurrent calls to {@code add}.
 * </p>
 *
 * @author Marcin Skirzynski (main work), Benedikt Kulmann (modifications)
 * @param <T> the type of the items being counted
 */
public class LossyCounting<T> extends BaseFrequency<T> {
    /**
     * The bucket width, {@code ceil(1 / error)}. It is set in the
     * constructor and never changes.
     */
    private final long windowSize;

    /**
     * The maximum error set by the user in the constructor.
     */
    private final double error;

    /**
     * The data structure which holds all counting information,
     * keyed by the counted item.
     */
    private final Map<T, CountEntryWithMaxError<T>> dataStructure;

    /**
     * The total count (sum of all increments) of all elements
     * added to the stream so far.
     */
    private long elementsCounted;

    /**
     * <p>
     * Constructs an instance of the LossyCounting algorithm
     * with the specified maximum error bound, which can not
     * be changed.
     * </p>
     *
     * <p>
     * An error of {@code 0} results in a bucket width of {@link Long#MAX_VALUE},
     * which means that a bucket boundary is practically never reached and
     * nothing is pruned.
     * </p>
     *
     * @param maxError the maximum error bound, a value between 0 and 1 (inclusive)
     * @throws IllegalArgumentException if {@code maxError} is NaN, negative or greater than 1
     */
    public LossyCounting(double maxError) {
        // NaN fails every ordered comparison, so it has to be rejected explicitly;
        // otherwise it would end up as a bucket width of 0.
        if (Double.isNaN(maxError) || maxError < 0 || maxError > 1) {
            throw new IllegalArgumentException("Maximal error needs to be a double between 0 and 1");
        }
        // "== 0" also matches -0.0, for which 1 / maxError would be negative infinity.
        // For tiny positive errors the cast to long saturates at Long.MAX_VALUE.
        this.windowSize = (maxError == 0) ? Long.MAX_VALUE : (long) Math.ceil(1 / maxError);
        this.elementsCounted = 0;
        this.error = maxError;
        this.dataStructure = new ConcurrentHashMap<T, CountEntryWithMaxError<T>>();
    }

    /**
     * <p>
     * Adds {@code incrementCount} occurrences of the given item to the stream.
     * </p>
     *
     * <p>
     * If the item is already tracked, its frequency is increased. Otherwise a new
     * entry is created whose maximum error is the number of buckets completed
     * before this call. If the call passes at least one bucket boundary, the data
     * structure is compressed afterwards; the entry of the item that was just
     * added may be removed by that compression.
     * </p>
     *
     * @param item the item to count
     * @param incrementCount the number of occurrences to add, must not be negative
     * @return {@code true} if the item was not tracked before this call,
     *         {@code false} if an existing entry was updated
     * @throws IllegalArgumentException if {@code incrementCount} is negative
     */
    @Override
    public boolean add(T item, long incrementCount) {
        if (incrementCount < 0) {
            throw new IllegalArgumentException("Increment count must not be negative: " + incrementCount);
        }

        // Number of completed buckets before this element arrives. This equals
        // "current bucket id - 1" of the arriving element and therefore bounds
        // how often an untracked item can have occurred so far.
        long bucketsBefore = elementsCounted / windowSize;

        CountEntryWithMaxError<T> entry = dataStructure.get(item);
        boolean newItem = (entry == null);
        if (newItem) {
            dataStructure.put(item, new CountEntryWithMaxError<T>(item, incrementCount, bucketsBefore));
        } else {
            entry.frequency += incrementCount;
        }

        // The stream length grows by the full weight, not by one per call, so that
        // frequencies and the stream length stay comparable.
        elementsCounted += incrementCount;

        // A weighted add can jump over a multiple of the bucket width, so compare
        // the bucket counts instead of testing for an exact multiple.
        long bucketsAfter = elementsCounted / windowSize;
        if (bucketsAfter > bucketsBefore) {
            compress(bucketsAfter);
        }
        return newItem;
    }

    /**
     * <p>
     * Returns the estimated frequency of the given element.
     * </p>
     *
     * <p>
     * The LossyCounting algorithm compresses the internal data structure which means
     * that an element will be deleted if it doesn't emerge frequently enough.
     * That means that even when the element appeared in the stream
     * the estimated frequency can be 0.
     * </p>
     *
     * @param item the item for which the estimated frequency will be returned
     * @return the estimated frequency of the given item, or 0 if it is not tracked
     */
    @Override
    public long estimateCount(T item) {
        // Single lookup: the entry cannot vanish between a containsKey and a get.
        CountEntryWithMaxError<T> entry = dataStructure.get(item);
        return (entry == null) ? 0L : entry.frequency;
    }

    /**
     * Tells whether the given item is currently tracked in the data structure.
     *
     * @param item the item to look up
     * @return {@code true} if an entry for the item exists
     */
    public boolean contains(T item) {
        return dataStructure.containsKey(item);
    }

    /**
     * Returns the total count of all elements added to the stream so far.
     *
     * @return the sum of all increments passed to {@code add}
     */
    @Override
    public long size() {
        return elementsCounted;
    }

    /**
     * Returns the items that are currently tracked.
     *
     * @return the key set of the internal map (not a copy)
     */
    @Override
    public Set<T> keySet() {
        return dataStructure.keySet();
    }

    /**
     * Returns all tracked entries whose estimated frequency is at least
     * {@code (minSupport - error) * size()}.
     *
     * @param minSupport the minimum support, as a fraction of the stream length
     * @return a new list with the matching entries
     */
    public List<CountEntry<T>> getFrequentItems(double minSupport) {
        List<CountEntry<T>> result = new ArrayList<CountEntry<T>>();
        double threshold = (minSupport - error) * elementsCounted;
        for (CountEntryWithMaxError<T> entry : dataStructure.values()) {
            if (entry.frequency >= threshold) {
                result.add(entry);
            }
        }
        return result;
    }

    /**
     * <p>
     * Compresses the data structure. Will be called automatically
     * by the add method, when a bucket boundary has been passed.
     * </p>
     *
     * <p>
     * Removes every entry whose frequency plus maximum error is not greater
     * than the given number of completed buckets.
     * </p>
     *
     * @param completedBuckets the number of buckets completed so far
     */
    private void compress(long completedBuckets) {
        Iterator<CountEntryWithMaxError<T>> entries = dataStructure.values().iterator();
        while (entries.hasNext()) {
            CountEntryWithMaxError<T> entry = entries.next();
            if (entry.frequency + entry.maxError <= completedBuckets) {
                entries.remove();
            }
        }
    }
}