package sizzle.aggregators;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import sizzle.io.EmitKey;
/**
 * A Sizzle aggregator to estimate the top <i>n</i> values in a dataset by
 * cardinality.
 *
 * <p>
 * Values are accumulated with their weights in a {@link CountingSet}. When
 * the aggregator is finished it either forwards every accumulated
 * (value, count) pair unchanged (combining mode) or ranks the values and emits
 * at most <i>n</i> of them, highest count first. Values with equal counts are
 * ordered by ascending {@link String#compareTo(String)}.
 *
 * @author anthonyu
 *
 */
@AggregatorSpec(name = "top", formalParameters = { "int" }, weightType = "int")
public class TopAggregator extends Aggregator {
    /** Running total of the weight seen for each distinct value. */
    private CountingSet<String> set;
    /** The ranked result slots, best first; filled with placeholders by {@link #start(EmitKey)}. */
    private final CountedString[] list;
    /** Index of the final slot in {@link #list}, or -1 when there are no slots. */
    private final int last;

    /**
     * Construct a TopAggregator.
     *
     * @param n
     *            A long representing the number of values to return
     *
     * @throws IllegalArgumentException
     *             if n is negative or too large to be used as an array
     *             length
     */
    public TopAggregator(final long n) {
        super(n);

        // the narrowing cast below would otherwise truncate silently
        if (n < 0 || n > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("top aggregator size must be between 0 and " + Integer.MAX_VALUE + ", got " + n);
        }

        // an array of weighted strings of length n
        this.list = new CountedString[(int) n];
        // the index of the last entry in the list
        this.last = this.list.length - 1;
    }

    /** {@inheritDoc} */
    @Override
    public void start(final EmitKey key) {
        super.start(key);

        this.set = new CountingSet<String>();

        // clear out the list; the placeholder carries the smallest possible
        // count so that finish() can tell unused slots from real entries
        final CountedString placeholder = new CountedString("", Long.MIN_VALUE);
        for (int i = 0; i < this.list.length; i++) {
            this.list[i] = placeholder;
        }
    }

    /** {@inheritDoc} */
    @Override
    public void aggregate(final String data, final String metadata) {
        if (metadata == null) {
            // no weight supplied: count a single occurrence
            this.set.add(data, 1);
        } else {
            // the weight is parsed as a double and truncated toward zero
            this.set.add(data, (long) Double.parseDouble(metadata));
        }
    }

    /** {@inheritDoc} */
    @Override
    public void finish() throws IOException, InterruptedException {
        if (this.isCombining()) {
            // pass every value on together with its accumulated count; no
            // ranking or truncation happens in this mode
            for (final Entry<String, Long> e : this.set.getEntries()) {
                this.collect(e.getKey(), e.getValue().toString());
            }
            return;
        }

        // TODO: replace this with the algorithm described in M. Charikar,
        // K. Chen, and M. Farach-Colton, Finding frequent items in data
        // streams, Proc 29th Intl. Colloq. on Automata, Languages and
        // Programming, 2002.
        for (final Entry<String, Long> e : this.set.getEntries()) {
            this.offer(e.getKey(), e.getValue().longValue());
        }

        // emit the filled slots in rank order, skipping untouched placeholders
        for (final CountedString c : this.list) {
            if (c.getCount() > Long.MIN_VALUE) {
                this.collect(c.toString());
            }
        }
    }

    /** {@inheritDoc} */
    @Override
    public boolean isAssociative() {
        return true;
    }

    /** {@inheritDoc} */
    @Override
    public boolean isCommutative() {
        return true;
    }

    /**
     * Insert a value into the ranked list if it belongs there, pushing the
     * current last entry out.
     *
     * @param key
     *            A {@link String} containing the candidate value
     *
     * @param count
     *            A long representing the candidate's accumulated count
     */
    private void offer(final String key, final long count) {
        // nothing can be kept when there are no slots, and a candidate that
        // does not beat the last slot cannot beat any earlier one
        if (this.list.length == 0 || !outranks(key, count, this.list[this.last])) {
            return;
        }

        // find this new item's position within the list
        for (int i = 0; i <= this.last; i++) {
            if (outranks(key, count, this.list[i])) {
                // here it is. move all subsequent items down one, dropping
                // the former last item
                System.arraycopy(this.list, i, this.list, i + 1, this.last - i);
                // insert the item where it belongs
                this.list[i] = new CountedString(key, count);
                return;
            }
        }
    }

    /**
     * Decide whether a candidate ranks strictly ahead of an existing slot.
     *
     * @param key
     *            A {@link String} containing the candidate value
     *
     * @param count
     *            A long representing the candidate's accumulated count
     *
     * @param slot
     *            The {@link CountedString} currently occupying the slot
     *
     * @return true if the candidate has a larger count, or an equal count and
     *         a string that sorts before the slot's string
     */
    private static boolean outranks(final String key, final long count, final CountedString slot) {
        final long current = slot.getCount();

        if (count != current) {
            return count > current;
        }

        return slot.getString().compareTo(key) > 0;
    }
}
/**
 * A (string, count) tuple. Both parts are set once in the constructor and
 * can only be read afterwards.
 *
 * @author anthonyu
 *
 */
class CountedString {
    private final String string;
    private final long count;

    /**
     * Construct a CountedString.
     *
     * @param string
     *            A {@link String} containing the string part of the tuple
     *
     * @param count
     *            A long representing the count part of the tuple
     */
    public CountedString(final String string, final long count) {
        this.string = string;
        this.count = count;
    }

    /**
     * Get the string part of the tuple.
     *
     * @return A {@link String} containing the string part of the tuple
     */
    public String getString() {
        return string;
    }

    /**
     * Get the count part of the tuple.
     *
     * @return A long representing the count part of the tuple
     */
    public long getCount() {
        return count;
    }

    /**
     * Render the tuple as the string, the count and a literal zero,
     * separated by a comma and a space.
     *
     * @return A {@link String} of the form <code>string, count, 0</code>
     */
    @Override
    public String toString() {
        final StringBuilder rendered = new StringBuilder();
        rendered.append(string);
        rendered.append(", ");
        rendered.append(count);
        rendered.append(", 0");
        return rendered.toString();
    }
}
/**
 * A counting set. Like a {@link Set}, but each member carries a running
 * total of the cardinalities it has been added with.
 *
 * @author anthonyu
 *
 * @param <T>
 *            The type of value that will be inserted into the set
 */
class CountingSet<T> {
    /** Maps each member to its accumulated cardinality. */
    private Map<T, Long> map = new HashMap<T, Long>();

    /**
     * Construct an empty CountingSet.
     */
    public CountingSet() {
    }

    /**
     * Add a value to the set with a cardinality of one.
     *
     * @param t
     *            The value to be added
     */
    public void add(final T t) {
        this.add(t, 1L);
    }

    /**
     * Add a value and its cardinality to the set. If the value is already a
     * member, the cardinality is added to its existing total.
     *
     * @param t
     *            The value to be added
     * @param n
     *            The cardinality of the value
     */
    public void add(final T t, final long n) {
        long total = n;

        // fold in whatever has been recorded for this member so far
        if (this.map.containsKey(t)) {
            total += this.map.get(t).longValue();
        }

        this.map.put(t, Long.valueOf(total));
    }

    /**
     * Empty the set by switching to a fresh backing map. Entry sets obtained
     * from {@link #getEntries()} before this call keep referring to the old
     * map.
     */
    public void clear() {
        this.map = new HashMap<T, Long>();
    }

    /**
     * Get the entries in this set.
     *
     * @return A {@link Set} of entries pairing each member with its
     *         accumulated cardinality; this is the backing map's own entry
     *         set, not a copy
     */
    public Set<Map.Entry<T, Long>> getEntries() {
        final Set<Map.Entry<T, Long>> entries = this.map.entrySet();
        return entries;
    }
}
/**
 * A holder for two values of independent types. Both values are assigned in
 * the constructor and can only be read afterwards.
 *
 * @author anthonyu
 *
 * @param <F>
 *            The type of the first value
 *
 * @param <S>
 *            The type of the second value
 */
class Pair<F, S> {
    private final F first;
    private final S second;

    /**
     * Construct a Pair from its two values.
     *
     * @param first
     *            The first value
     *
     * @param second
     *            The second value
     */
    public Pair(final F first, final S second) {
        this.second = second;
        this.first = first;
    }

    /**
     * Get the first value.
     *
     * @return The first value, exactly as passed to the constructor
     */
    public F getFirst() {
        return first;
    }

    /**
     * Get the second value.
     *
     * @return The second value, exactly as passed to the constructor
     */
    public S getSecond() {
        return second;
    }
}