package sizzle.aggregators; import java.io.IOException; import java.util.Arrays; import java.util.List; import sizzle.io.EmitKey; /** * A Sizzle aggregator to calculate a histogram for the values in a dataset. * * @author anthonyu * */ abstract class HistogramAggregator extends Aggregator { private final long min; private final long max; private final int buckets; /** * Construct a HistogramAggregator. * * @param min * A long representing the minimum value to be considered in the * histogram * * @param max * A long representing the maximum value to be considered in the * histogram * * @param buckets * A long representing the number of buckets in the histogram */ public HistogramAggregator(long min, long max, long buckets) { this.min = min; this.max = max; this.buckets = (int) buckets; } public long count(String metadata) { // if the metadata is null, it counts as a single if (metadata == null) return 1; // otherwise, parse the metadata and count it as that else return Long.parseLong(metadata); } /** {@inheritDoc} */ @Override public void start(EmitKey key) { super.start(key); } /** {@inheritDoc} */ @Override public abstract void aggregate(String data, String metadata) throws NumberFormatException, IOException, InterruptedException; /** {@inheritDoc} */ @Override public void finish() throws IOException, InterruptedException { if (this.isCombining()) { // if we're in the combiner, just output the compressed data for (Pair<Number, Long> p : this.getTuples()) this.collect(p.getFirst().toString(), p.getSecond().toString()); } else { // otherwise, set up the histogram int[] buckets = new int[this.buckets]; // calculate the step or the space between the buckets double step = (this.max - this.min) / (double) this.buckets; // for each of the compressed data points, increment the bucket it // belongs to by its cardinality for (Pair<Number, Long> p : this.getTuples()) buckets[(int) ((p.getFirst().longValue() - this.min) / step)] += p.getSecond(); this.collect(Arrays.toString(buckets)); } } /** * Return the data points from the dataset in pairs. * * @return A {@link List} of {@link Pair}<{@link Number}, {@link Long}> * containing the data points from the dataset */ public abstract List<Pair<Number, Long>> getTuples(); /** {@inheritDoc} */ @Override public boolean isAssociative() { return true; } /** {@inheritDoc} */ @Override public boolean isCommutative() { return true; } }