package org.streaminer.stream.frequency; import org.streaminer.stream.frequency.util.CountEntryWithMaxError; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.streaminer.stream.frequency.util.CountEntry; /** * Implementation of the Space-Saving algorithm described in the paper * "Efficient Computation of Frequent and Top-k Elements in Data Streams" * written by 'Ahmed Metwally', 'Divyakant Agrawal' and 'Amr El Abbadi' * * @author Lukas Kalabis * @param <T> The type of object that will be stored */ public class SpaceSaving<T> extends BaseFrequency<T> { private double support = 0.1d; private double error = 0.1d; private int counter = 1000; /** * The data structure which holds all counting information. */ private final List<Bucket<List<T>>> dataStructure; /** * The total count of all counted elements in the stream so far. */ protected long elementsCounted; private boolean guaranteed = true; /** * Creates a new instance of SpaceSaving * @param counters number of available counters * @param support min-support for the frequency * @param maxError maximum error */ public SpaceSaving(int counters, double support, double maxError) { super(support); if (support <= 0 || support >= 1) { throw new IllegalArgumentException("Support has to be > 0 and < 1."); } if (counters <= 0 ) { throw new IllegalArgumentException("Counters has to be > 0"); } this.counter = counters; this.support = support; this.error = maxError; elementsCounted = 0L; dataStructure = new LinkedList<Bucket<List<T>>>(); for (int i = 0; i < this.counter; i++) { dataStructure.add(new Bucket<List<T>>(new LinkedList<T>(), 0, 0)); } } @Override public boolean add(T item, long incrementCount) { if (containsItem(item)) { incrementCount(item, incrementCount); return false; } else { // Replace the item with the lowest count with the new one. insertItem(item, incrementCount); return true; } } @Override public long estimateCount(T item) { if (dataStructure.contains(item)) { return getBucketForItem(item).frequency; } return 0L; } public boolean contains(T item) { return dataStructure.contains(item); } @Override public long size() { return elementsCounted; } @Override public Set<T> keySet() { return null; } public List<CountEntry<T>> getFrequentItems(double minSupport) { List<CountEntry<T>> result = new ArrayList<CountEntry<T>>(); int j = 1; for (Bucket<List<T>> b : dataStructure) { if (((b.frequency - error) > (minSupport * elementsCounted)) && (j <= dataStructure.size())) { for(int i = 0; i < b.item.size(); i++) { result.add(new CountEntry<T>(b.item.get(i), b.frequency)); } if ((b.frequency - error) < (minSupport * elementsCounted)) { guaranteed = false; } } j++; } return result; } /** * Shows if the frequent Items are still in the guaranteed * bounds of the algorithm. * @return */ public boolean getGuaranteed() { return guaranteed; } private boolean containsItem(T item){ Bucket<List<T>> bucket = getBucketForItem(item); return bucket != null; } /** * <p>Increment the count frequency of the provided item by 1.</p> * <p> * First it checks two thinks. * <li>Is the Bucket which count has to be updated the last bucket?</li> * <li>Is the new frequency of the Bucket the same as the neighbor bucket frequency?</li> * If one of this is true the bucket with the least hits will be deleted and the new item will * get this bucket.<br/> * Otherwise: The item will be added into the neighbor bucket and removed from the original bucket. * </p> * <p> * In the end the whole data structure is sorted * </p> * * @param item The item whose frequency shall be incremented. * @param incrementCount The number that will be added to the item count. */ private void incrementCount(T item, long incrementCount) { Bucket<List<T>> firstBucket = getBucketForItem(item); long bucketCount = firstBucket.frequency + incrementCount; boolean replaceOldBucket = false; if (dataStructure.indexOf(firstBucket) == dataStructure.size()-1) { replaceOldBucket = true; } else if (dataStructure.get(dataStructure.indexOf(firstBucket)+1).frequency != bucketCount) { replaceOldBucket = true; } else { Bucket<List<T>> neighborBucket = dataStructure.get(dataStructure.indexOf(firstBucket)+1); neighborBucket.item.add(item); firstBucket.item.remove(item); } if (replaceOldBucket) { Bucket<List<T>> bucket = dataStructure.get(0); long oldMaxError = bucket.getMaxError(); bucket.item.clear(); bucket.item.add(item); bucket.frequency += incrementCount; bucket.setMaxError(oldMaxError); } else { firstBucket.frequency = bucketCount; } elementsCounted++; sortDataStructure(); } /** * <p> * This method insert a new, not yet seen item, into the data structure. * <br /> * The bucket with the least hits will be cleared and the new item will get this * bucket. Also the frequency will be increment. * <br /> * In the last step the whole data structure will be sorted * </p> * @param item The item that is inserted into the model. */ private void insertItem(T item, long incrementCount) { Bucket<List<T>> bucket = dataStructure.get(0); bucket.item.clear(); bucket.item.add(item); bucket.frequency += incrementCount; elementsCounted++; sortDataStructure(); } /** * <p> * This method sort the data structure by the frequency. * </p> */ private void sortDataStructure() { Collections.sort(dataStructure, new Comparator<Bucket<List<T>>>() { @Override public int compare(Bucket<List<T>> o1, Bucket<List<T>> o2) { return Long.valueOf(o1.frequency).compareTo(o2.frequency); } }); } /** * <p> * This method returns the bucket which contains the item or null * if the items is in no bucket. * </p> * @param item The item which you search for in all buckets * @return Bucket with the item or null if Element is not enclosed. */ private Bucket<List<T>> getBucketForItem(T item) { for (Bucket<List<T>> b : dataStructure) { if (b.item.contains(item)){ return b; } } return null; } private class Bucket<T> extends CountEntryWithMaxError<T> { private static final long serialVersionUID = 1L; public Bucket(T item, long frequency, long maxError){ super(item, frequency, maxError); } public void setMaxError(long maxError){ this.maxError = maxError; } public long getMaxError(){ return this.maxError; } } }