/**
* Copyright 2013 ananthc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.streaminer.stream.frequency;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.streaminer.stream.frequency.util.CountEntry;
/**
* Implementation of the MisraGries frequency count algorithm.
*
* Reference:
* Jayadev Misra and David Gries. Finding repeated elements. Sci. Comput.
* Program., 2(2):143–152, 1982.
*
* Source: https://github.com/ananthc/streamstats
*
* @author ananthc
* @param <T>
*/
public class MisraGries<T> extends BaseFrequency<T> {
private int k = 1;
private Map<T, Long> dataStructure = new HashMap<T, Long>();
private long elementsCounted;
public MisraGries(int k) {
this.k = k;
}
@Override
public boolean add(T item, long incrementCount) throws FrequencyException {
boolean newItem = true;
long count = 0;
if (dataStructure.containsKey(item) ) {
count = dataStructure.get(item) + incrementCount;
dataStructure.put(item, count);
newItem = false;
} else {
if (dataStructure.size() < k - 1) {
dataStructure.put(item, incrementCount);
} else {
Iterator<Map.Entry<T, Long>> itr = dataStructure.entrySet().iterator();
while (itr.hasNext()) {
Map.Entry<T, Long> entry = itr.next();
count = dataStructure.get(entry.getKey()) - 1;
if (count <= 0 ) {
itr.remove();
} else {
dataStructure.put(entry.getKey(), count);
}
}
}
}
return newItem;
}
@Override
public long estimateCount(T item) {
if (dataStructure.containsKey(item)) {
return dataStructure.get(item).longValue();
}
return 0L;
}
public boolean contains(T item) {
return dataStructure.containsKey(item);
}
@Override
public long size() {
return dataStructure.size();
}
@Override
public Set<T> keySet() {
return dataStructure.keySet();
}
public List<CountEntry<T>> getFrequentItems(double minSupport) {
List<CountEntry<T>> frequentItems = new ArrayList<CountEntry<T>>();
for (Map.Entry<T, Long> entry : dataStructure.entrySet()) {
if (isFrequent(entry.getValue(), minSupport)) {
frequentItems.add(new CountEntry<T>(entry.getKey(), entry.getValue()));
}
}
return frequentItems;
}
/**
* Determines whether a frequency is currently (i.e. in relation to the current total number
* of elements) said to be frequent given a specific threshold.
*
* @param frequency The frequency in question
* @param minSupport The threshold for determining whether a frequency is deemed to be frequent
* @return
*/
private boolean isFrequent(long frequency, double minSupport) {
return frequency >= minSupport * elementsCounted;
}
}