package com.twitter.common.stats;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;
import com.google.common.base.Preconditions;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Data;
/**
* Implements Histogram structure for computing approximate quantiles.
* The implementation is based on the following paper:
*
* [MP80] Munro & Paterson, "Selection and Sorting with Limited Storage",
* Theoretical Computer Science, Vol 12, p 315-323, 1980.
*
* You could read a detailed description of the same algorithm here:
*
* [MRL98] Manku, Rajagopalan & Lindsay, "Approximate Medians and other
* Quantiles in One Pass and with Limited Memory", Proc. 1998 ACM
* SIGMOD, Vol 27, No 2, p 426-435, June 1998.
*
* There's a good explanation of the algorithm in the Sawzall source code
* See: http://szl.googlecode.com/svn-history/r36/trunk/src/emitters/szlquantile.cc
*
* Here's a schema of the tree:
*
* [4] level 3, weight=rootWeight=8
* |
* [3] level 2, weight=4
* |
* [2] level 1, weight=2
* / \
* [0] [1] level 0, weight=1
*
* [i] represent buffer[i]
* The depth of the tree is limited to a maximum value
* Every buffer has the same size
*
* We add element in [0] or [1].
* When [0] and [1] are full, we collapse them, it generates a temporary buffer of weight 2,
* if [2] is empty, we put the collapsed buffer into [2] otherwise we collapse [2] with
* the temporary buffer and put it in [3] if it's empty and so on...
*/
public final class ApproximateHistogram implements Histogram {
  // FIX: was Logger.getLogger(Histogram.class.getName()), which registered this
  // class's logger under the interface's name.
  private static final Logger LOG = Logger.getLogger(ApproximateHistogram.class.getName());
  private static final Precision DEFAULT_PRECISION = new Precision(0.0001, 1000 * 1000);
  private static final Amount<Long, Data> DEFAULT_MAX_MEMORY = Amount.of(4L, Data.KB);
  private static final int ELEM_SIZE = 8; // sizeof long, in bytes

  // buffer.get(i) is the buffer at level i of the tree (see the class comment for the layout).
  private List<List<Long>> buffer;
  // Capacity, in elements, of every buffer.
  private int bufferSize;
  // Maximum number of levels the tree is allowed to grow to.
  private int maxDepth;
  // Weight of the root buffer; accumulates when collapses happen at the maximum depth.
  private int rootWeight = 1;
  // Number of elements added since construction or the last clear().
  private long count = 0L;

  /**
   * Private init method that is called only by constructors.
   *
   * @param bufSize size of each buffer
   * @param maxDepth maximum depth of the tree of buffers
   */
  private void init(int bufSize, int maxDepth) {
    this.bufferSize = bufSize;
    this.maxDepth = maxDepth;
    buffer = new ArrayList<List<Long>>(maxDepth);
    clear();
  }

  /**
   * Constructor without memory constraint.
   *
   * @param precision the requested precision
   */
  public ApproximateHistogram(Precision precision) {
    Preconditions.checkNotNull(precision);
    int b = computeB(precision.getEpsilon(), precision.getN());
    int bufSize = computeBufferSize(b, precision.getN());
    init(bufSize, b);
  }

  /**
   * Constructor without precision constraint.
   *
   * @param maxMemory the maximum memory that the instance will take
   */
  public ApproximateHistogram(Amount<Long, Data> maxMemory) {
    Preconditions.checkNotNull(maxMemory);
    int b = computeB(DEFAULT_PRECISION.getEpsilon(), DEFAULT_PRECISION.getN());
    int bufSize = computeBufferSize(b, DEFAULT_PRECISION.getN());
    int depth = computeMaxDepth(maxMemory, bufSize);
    init(bufSize, depth);
  }

  /**
   * Constructor with default arguments.
   *
   * @see #ApproximateHistogram(Amount)
   */
  public ApproximateHistogram() {
    this(DEFAULT_MAX_MEMORY);
  }

  @Override
  public synchronized void add(long x) {
    count += 1;
    // If both leaves of the tree are full, recursively "collapse" the tree upward.
    if (buffer.get(0).size() == bufferSize && buffer.get(1).size() == bufferSize) {
      Collections.sort(buffer.get(0));
      Collections.sort(buffer.get(1));
      recCollapse(buffer.get(0), 1);
      buffer.get(0).clear();
    }
    // Now we're sure there is space for adding x.
    int i = (buffer.get(0).size() < bufferSize) ? 0 : 1;
    buffer.get(i).add(x);
  }

  /**
   * Returns one approximate value per requested quantile (all zeros when empty).
   * NOTE(review): assumes {@code qs} is sorted in ascending order with every value in
   * [0.0, 1.0] -- the merge below relies on it; confirm against callers.
   *
   * @param qs the quantile fractions to compute (e.g. {0.5, 0.99})
   */
  @Override
  public synchronized long[] getQuantiles(double[] qs) {
    // Java arrays are zero-initialized, so the empty case needs no explicit fill.
    long[] output = new long[qs.length];
    if (count == 0) {
      return output;
    }
    // Walk all buffered elements in ascending order, accumulating their weights; each
    // time the accumulated weight crosses a requested rank, emit the current element.
    long[] qss = quantilesSums(qs);
    int io = 0;
    int iq = 0;
    long qsum = 0;
    // The two leaves are the only unsorted buffers; every collapsed buffer stays sorted.
    Collections.sort(buffer.get(0));
    Collections.sort(buffer.get(1));
    int[] indices = new int[buffer.size()];
    Arrays.fill(indices, 0);
    while (io < output.length || qsum < count) {
      int i = smallest(indices);
      // smallest() already advanced indices[i] past the element it chose.
      long x = buffer.get(i).get(indices[i] - 1);
      qsum += weight(i);
      while (iq < qss.length && qss[iq] <= qsum) {
        output[io] = x;
        io += 1;
        iq += 1;
      }
    }
    return output;
  }

  @Override
  public synchronized void clear() {
    count = 0L;
    // FIX: the root weight must be reset too; a stale accumulated value would corrupt
    // weight(maxDepth) and every subsequent collapse after the histogram is reused.
    rootWeight = 1;
    buffer.clear();
    buffer.add(new ArrayList<Long>(bufferSize));
    buffer.add(new ArrayList<Long>(bufferSize));
  }

  /**
   * We compute the "smallest possible b" satisfying two inequalities:
   *    1)   (b - 2) * (2 ^ (b - 2)) + 0.5 <= epsilon * N
   *    2)   k * (2 ^ (b - 1)) >= N
   *
   * For an explanation of these inequalities, please read the Munro-Paterson or
   * the Manku-Rajagopalan-Lindsay papers.
   */
  private static int computeB(double epsilon, long n) {
    int b = 2;
    while ((b - 2) * (1L << (b - 2)) + 0.5 <= epsilon * n) {
      b += 1;
    }
    return b;
  }

  /** Buffer size k = N / 2^(b - 1), the smallest k satisfying inequality 2 above. */
  private static int computeBufferSize(int b, long n) {
    return (int) (n / (0x1L << (b - 1)));
  }

  /**
   * Returns the maximum depth of the tree to comply with the memory constraint.
   *
   * @param maxMemory the memory budget for the whole instance
   * @param bufferSize the size of each buffer
   */
  private static int computeMaxDepth(Amount<Long, Data> maxMemory, int bufferSize) {
    // NOTE(review): 100 bytes of fixed overhead plus 16 bytes of bookkeeping per level
    // are rough estimates of the JVM object layout -- confirm if the budget must be tight.
    long n = maxMemory.as(Data.BYTES) - 100 - (ELEM_SIZE * bufferSize);
    int bm;
    if (n < 0) {
      bm = 2;
    } else {
      bm = (int) (n / (16 + ELEM_SIZE * bufferSize));
    }
    // We always need at least the two leaf buffers.
    return Math.max(bm, 2);
  }

  /**
   * Returns the weight of a level, i.e. 2^(level - 1), except for the two tree leaves
   * (weight 1) and for the root (whose weight accumulates across collapses).
   */
  private int weight(int level) {
    if (level < 2) {
      return 1;
    }
    if (level == maxDepth) {
      return rootWeight;
    }
    return 1 << (level - 1);
  }

  /** Translates each requested quantile fraction into an absolute rank in [0, count]. */
  private long[] quantilesSums(double[] qs) {
    long[] qss = new long[qs.length];
    for (int i = 0; i < qss.length; i++) {
      qss[i] = (long) (qs[i] * count);
    }
    return qss;
  }

  /**
   * Returns the level holding the smallest unread element and advances that level's
   * cursor. indices[i] is the index of the next value to read in the buffer at level i.
   */
  private int smallest(int[] indices) {
    int iSmallest = 0;
    long smallest = Long.MAX_VALUE;
    for (int i = 0; i < buffer.size(); i++) {
      // A level is a candidate only while its cursor hasn't exhausted its buffer
      // (an empty buffer is covered by the same bound check).
      if (indices[i] < buffer.get(i).size()) {
        long head = buffer.get(i).get(indices[i]);
        if (head < smallest) {
          smallest = head;
          iSmallest = i;
        }
      }
    }
    indices[iSmallest] += 1;
    return iSmallest;
  }

  /**
   * Collapses the sorted buffer {@code buf} into the buffer at {@code level}, propagating
   * upward until an empty slot is found, a new top level is created, or the maximum
   * depth is reached (in which case the result is merged into the root in place).
   */
  private void recCollapse(List<Long> buf, int level) {
    assert isSorted(buf);
    if (level == maxDepth) {
      // We reached the root, we can't add more buffers.
      // weight() returns the weight of the root; here we need the weight of the merge input.
      int mergeWeight = 1 << (level - 1);
      List<Long> merged = collapse(buf, mergeWeight, buffer.get(level), rootWeight);
      buffer.set(level, merged);
      rootWeight += mergeWeight;
    } else {
      int currentTop = buffer.size() - 1;
      List<Long> merged = collapse(buf, 1, buffer.get(level), 1);
      if (level == currentTop) {
        // If we reach the top, add a new buffer; the root's weight doubles.
        buffer.add(merged);
        rootWeight *= 2;
      } else if (buffer.get(level + 1).isEmpty()) {
        // If the upper buffer is empty, use it.
        buffer.set(level + 1, merged);
      } else {
        // If the upper buffer isn't empty, collapse with it.
        recCollapse(merged, level + 1);
      }
      // Now that the values have been propagated, clean this level's buffer.
      buffer.get(level).clear();
    }
  }

  /**
   * Collapses two sorted arrays of different weights into one sorted array of weight
   * (leftWeight + rightWeight) holding half as many elements.
   * Example: [2,5,7] weight 2 and [3,8,9] weight 3:
   *   weight x array + concat = [2,2,5,5,7,7,3,3,3,8,8,8,9,9,9]
   *   sort                    = [2,2,3,3,3,5,5,7,7,8,8,8,9,9,9]
   *   select every nth elems  = [3,7,9]   (n = sum weight / 2)
   */
  private List<Long> collapse(
      List<Long> left,
      int leftWeight,
      List<Long> right,
      int rightWeight) {
    assert left.size() == right.size();
    assert isSorted(left);
    assert isSorted(right);
    // Loop-invariant, hoisted out of the merge.
    int totalWeight = leftWeight + rightWeight;
    int i = 0;
    int j = 0;
    int cnt = 0;
    List<Long> output = new ArrayList<Long>(left.size());
    // Standard two-way merge: instead of materializing the weighted expansion we advance
    // a virtual cursor (cnt) by the chosen element's weight and keep the median picks.
    while (i < left.size() || j < right.size()) {
      long smallest;
      int weight;
      if (i < left.size() && (j == right.size() || left.get(i) < right.get(j))) {
        smallest = left.get(i);
        weight = leftWeight;
        i += 1;
      } else {
        smallest = right.get(j);
        weight = rightWeight;
        j += 1;
      }
      for (int t = 0; t < weight; t++) {
        if (cnt % totalWeight == totalWeight / 2) {
          output.add(smallest);
        }
        cnt += 1;
      }
    }
    assert isSorted(output);
    return output;
  }

  /** Only used by assert during development of the algorithm. */
  private static boolean isSorted(List<Long> list) {
    for (int i = 0; i < list.size() - 1; i++) {
      if (list.get(i) > list.get(i + 1)) {
        return false;
      }
    }
    return true;
  }
}