package io.airlift.stats; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.base.Ticker; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterators; import com.google.common.collect.Multimap; import com.google.common.collect.Multimaps; import com.google.common.collect.Ordering; import com.google.common.collect.PeekingIterator; import com.google.common.util.concurrent.AtomicDouble; import io.airlift.slice.BasicSliceInput; import io.airlift.slice.DynamicSliceOutput; import io.airlift.slice.SizeOf; import io.airlift.slice.Slice; import io.airlift.slice.SliceInput; import io.airlift.slice.SliceOutput; import org.openjdk.jol.info.ClassLayout; import javax.annotation.concurrent.NotThreadSafe; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static java.lang.String.format; /** * <p>Implements http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.132.7343, a data structure * for approximating quantiles by trading off error with memory requirements.</p> * * <p>The size of the digest is adjusted dynamically to achieve the error bound and requires * O(log2(U) / maxError) space, where <em>U</em> is the number of bits needed to represent the * domain of the values added to the digest. The error is defined as the discrepancy between the * real rank of the value returned in a quantile query and the rank corresponding to the queried * quantile.</p> * * <p>Thus, for a query for quantile <em>q</em> that returns value <em>v</em>, the error is * |rank(v) - q * N| / N, where N is the number of elements added to the digest and rank(v) is the * real rank of <em>v</em></p> * * <p>This class also supports exponential decay. The implementation is based on the ideas laid out * in http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.159.3978</p> */ @NotThreadSafe public class QuantileDigest { private static final int MAX_BITS = 64; private static final int QUANTILE_DIGEST_SIZE = ClassLayout.parseClass(QuantileDigest.class).instanceSize(); // needs to be such that Math.exp(alpha * seconds) does not grow too big static final long RESCALE_THRESHOLD_SECONDS = 50; static final double ZERO_WEIGHT_THRESHOLD = 1e-5; private static final int INITIAL_CAPACITY = 1; private final double maxError; private final Ticker ticker; private final double alpha; private long landmarkInSeconds; private double weightedCount; private long max = Long.MIN_VALUE; private long min = Long.MAX_VALUE; private int root = -1; private int nextNode = 0; private double[] counts; private byte[] levels; private long[] values; private int[] lefts; private int[] rights; // We use lefts[] to store a linked list of free slots. // freeIndex points to the first available slot private int freeCount; private int firstFree = -1; private enum TraversalOrder { FORWARD, REVERSE } /** * <p>Create a QuantileDigest with a maximum error guarantee of "maxError" and no decay. * * @param maxError the max error tolerance */ public QuantileDigest(double maxError) { this(maxError, 0.0); } /** * <p>Create a QuantileDigest with a maximum error guarantee of "maxError" and exponential decay * with factor "alpha".</p> * * @param maxError the max error tolerance * @param alpha the exponential decay factor */ public QuantileDigest(double maxError, double alpha) { this(maxError, alpha, alpha == 0.0 ? noOpTicker() : Ticker.systemTicker()); } @VisibleForTesting QuantileDigest(double maxError, double alpha, Ticker ticker) { checkArgument(maxError >= 0 && maxError <= 1, "maxError must be in range [0, 1]"); checkArgument(alpha >= 0 && alpha < 1, "alpha must be in range [0, 1)"); this.maxError = maxError; this.alpha = alpha; this.ticker = ticker; landmarkInSeconds = TimeUnit.NANOSECONDS.toSeconds(ticker.read()); counts = new double[INITIAL_CAPACITY]; levels = new byte[INITIAL_CAPACITY]; values = new long[INITIAL_CAPACITY]; lefts = new int[INITIAL_CAPACITY]; rights = new int[INITIAL_CAPACITY]; Arrays.fill(lefts, -1); Arrays.fill(rights, -1); } public QuantileDigest(QuantileDigest other) { this.maxError = other.maxError; this.alpha = other.alpha; this.ticker = alpha == 0.0 ? noOpTicker() : Ticker.systemTicker(); this.landmarkInSeconds = other.landmarkInSeconds; this.weightedCount = other.weightedCount; this.max = other.max; this.min = other.min; this.root = other.root; this.nextNode = other.nextNode; this.counts = other.counts.clone(); this.levels = other.levels.clone(); this.values = other.values.clone(); this.lefts = other.lefts.clone(); this.rights = other.rights.clone(); this.freeCount = other.freeCount; this.firstFree = other.firstFree; } public QuantileDigest(Slice serialized) { SliceInput input = new BasicSliceInput(serialized); maxError = input.readDouble(); alpha = input.readDouble(); if (alpha == 0.0) { ticker = noOpTicker(); } else { ticker = Ticker.systemTicker(); } min = input.readLong(); max = input.readLong(); int nodeCount = input.readInt(); // non-zero-nodes < 3 * k, and k <= log2(domain-size) / max-error // To be conservative, assume all non-zero-nodes can be leaves and we need a complete tree => total-nodes <= 2 * non-zero-nodes // => total-nodes <= 2 * 3 * log2(domain-size) / max-error int numberOfLevels = MAX_BITS - Long.numberOfLeadingZeros(min ^ max) + 1; double k = 3 * numberOfLevels / maxError; checkArgument(nodeCount <= 2 * k, "Too many nodes in deserialized tree. Possible corruption"); counts = new double[nodeCount]; for (int i = 0; i < nodeCount; i++) { double count = input.readDouble(); weightedCount += count; counts[i] = count; } levels = new byte[nodeCount]; for (int i = 0; i < nodeCount; i++) { levels[i] = input.readByte(); } values = new long[nodeCount]; for (int i = 0; i < nodeCount; i++) { values[i] = input.readLong(); } int[] stack = new int[(Integer.highestOneBit(nodeCount - 1) << 1) + 1]; // log2 ceiling int top = -1; // The nodes are organized in a left-to-right post-order sequence, so // we rebuild the tree (left/right links) bottom up lefts = new int[nodeCount]; rights = new int[nodeCount]; for (int node = 0; node < nodeCount; node++) { byte flags = input.readByte(); if ((flags & Flags.HAS_RIGHT) != 0) { rights[node] = stack[top--]; } else { rights[node] = -1; } if ((flags & Flags.HAS_LEFT) != 0) { lefts[node] = stack[top--]; } else { lefts[node] = -1; } stack[++top] = node; } checkArgument(nodeCount == 0 || top == 0, "Tree is corrupted. Expected a single root node"); root = nodeCount - 1; // last node in post-order nextNode = nodeCount; } public double getMaxError() { return maxError; } public double getAlpha() { return alpha; } public void add(long value) { add(value, 1); } /** * Adds a value to this digest. The value must be {@code >= 0} */ public void add(long value, long count) { checkArgument(count > 0, "count must be > 0"); boolean needsCompression = false; double weight = count; if (alpha > 0.0) { long nowInSeconds = TimeUnit.NANOSECONDS.toSeconds(ticker.read()); if (nowInSeconds - landmarkInSeconds >= RESCALE_THRESHOLD_SECONDS) { rescale(nowInSeconds); needsCompression = true; // rescale affects weights globally, so force compression } weight = weight(nowInSeconds) * count; } max = Math.max(max, value); min = Math.min(min, value); double previousCount = weightedCount; insert(longToBits(value), weight); // When totalCount crosses the next multiple of k (compression factor), the compression // equation changes for every node in the tree, so we need to compress globally. // Otherwise, only node along the insertion path are affected -- TODO: implement this. int compressionFactor = calculateCompressionFactor(); if (needsCompression || ((long) previousCount) / compressionFactor != ((long) weightedCount) / compressionFactor) { compress(); } } public void merge(QuantileDigest other) { rescaleToCommonLandmark(this, other); // 1. merge other into this (don't modify other) root = merge(root, other, other.root); max = Math.max(max, other.max); min = Math.min(min, other.min); // 2. compress to remove unnecessary nodes compress(); } /** * Get a lower bound on the quantiles for the given proportions. A returned q quantile is guaranteed to be within * the q - maxError and q quantiles. * <p> * The input list of quantile proportions must be sorted in increasing order, and each value must be in the range [0, 1] */ public List<Long> getQuantilesLowerBound(List<Double> quantiles) { checkArgument(Ordering.natural().isOrdered(quantiles), "quantiles must be sorted in increasing order"); for (double quantile : quantiles) { checkArgument(quantile >= 0 && quantile <= 1, "quantile must be between [0,1]"); } List<Double> reversedQuantiles = ImmutableList.copyOf(quantiles).reverse(); ImmutableList.Builder<Long> builder = ImmutableList.builder(); PeekingIterator<Double> iterator = Iterators.peekingIterator(reversedQuantiles.iterator()); postOrderTraversal(root, new Callback() { private double sum; @Override public boolean process(int node) { sum += counts[node]; while (iterator.hasNext() && sum > (1.0 - iterator.peek()) * weightedCount) { iterator.next(); // we know the min value ever seen, so cap the percentile to provide better error // bounds in this case long value = Math.max(lowerBound(node), min); builder.add(value); } return iterator.hasNext(); } }, TraversalOrder.REVERSE); // we finished the traversal without consuming all quantiles. This means the remaining quantiles // correspond to the max known value while (iterator.hasNext()) { builder.add(min); iterator.next(); } return builder.build().reverse(); } /** * Get an upper bound on the quantiles for the given proportions. A returned q quantile is guaranteed to be within * the q and q + maxError quantiles. * <p> * The input list of quantile proportions must be sorted in increasing order, and each value must be in the range [0, 1] */ public List<Long> getQuantilesUpperBound(List<Double> quantiles) { checkArgument(Ordering.natural().isOrdered(quantiles), "quantiles must be sorted in increasing order"); for (double quantile : quantiles) { checkArgument(quantile >= 0 && quantile <= 1, "quantile must be between [0,1]"); } final ImmutableList.Builder<Long> builder = ImmutableList.builder(); final PeekingIterator<Double> iterator = Iterators.peekingIterator(quantiles.iterator()); postOrderTraversal(root, new Callback() { private double sum = 0; public boolean process(int node) { sum += counts[node]; while (iterator.hasNext() && sum > iterator.peek() * weightedCount) { iterator.next(); // we know the max value ever seen, so cap the percentile to provide better error // bounds in this case long value = Math.min(upperBound(node), max); builder.add(value); } return iterator.hasNext(); } }); // we finished the traversal without consuming all quantiles. This means the remaining quantiles // correspond to the max known value while (iterator.hasNext()) { builder.add(max); iterator.next(); } return builder.build(); } public List<Long> getQuantiles(List<Double> quantiles) { return getQuantilesUpperBound(quantiles); } /** * Gets the value at the specified quantile +/- maxError. The quantile must be in the range [0, 1] */ public long getQuantile(double quantile) { return getQuantiles(ImmutableList.of(quantile)).get(0); } public long getQuantileLowerBound(double quantile) { return getQuantilesLowerBound(ImmutableList.of(quantile)).get(0); } public long getQuantileUpperBound(double quantile) { return getQuantilesUpperBound(ImmutableList.of(quantile)).get(0); } /** * Number (decayed) of elements added to this quantile digest */ public double getCount() { return weightedCount / weight(TimeUnit.NANOSECONDS.toSeconds(ticker.read())); } /* * Get the exponentially-decayed approximate counts of values in multiple buckets. The elements in * the provided list denote the upper bound each of the buckets and must be sorted in ascending * order. * * The approximate count in each bucket is guaranteed to be within 2 * totalCount * maxError of * the real count. */ public List<Bucket> getHistogram(List<Long> bucketUpperBounds) { checkArgument(Ordering.natural().isOrdered(bucketUpperBounds), "buckets must be sorted in increasing order"); ImmutableList.Builder<Bucket> builder = ImmutableList.builder(); PeekingIterator<Long> iterator = Iterators.peekingIterator(bucketUpperBounds.iterator()); AtomicDouble sum = new AtomicDouble(); AtomicDouble lastSum = new AtomicDouble(); // for computing weighed average of values in bucket AtomicDouble bucketWeightedSum = new AtomicDouble(); double normalizationFactor = weight(TimeUnit.NANOSECONDS.toSeconds(ticker.read())); postOrderTraversal(root, node -> { while (iterator.hasNext() && iterator.peek() <= upperBound(node)) { double bucketCount = sum.get() - lastSum.get(); Bucket bucket = new Bucket(bucketCount / normalizationFactor, bucketWeightedSum.get() / bucketCount); builder.add(bucket); lastSum.set(sum.get()); bucketWeightedSum.set(0); iterator.next(); } bucketWeightedSum.addAndGet(middle(node) * counts[node]); sum.addAndGet(counts[node]); return iterator.hasNext(); }); while (iterator.hasNext()) { double bucketCount = sum.get() - lastSum.get(); Bucket bucket = new Bucket(bucketCount / normalizationFactor, bucketWeightedSum.get() / bucketCount); builder.add(bucket); iterator.next(); } return builder.build(); } public long getMin() { final AtomicLong chosen = new AtomicLong(min); postOrderTraversal(root, node -> { if (counts[node] >= ZERO_WEIGHT_THRESHOLD) { chosen.set(lowerBound(node)); return false; } return true; }, TraversalOrder.FORWARD); return Math.max(min, chosen.get()); } public long getMax() { final AtomicLong chosen = new AtomicLong(max); postOrderTraversal(root, node -> { if (counts[node] >= ZERO_WEIGHT_THRESHOLD) { chosen.set(upperBound(node)); return false; } return true; }, TraversalOrder.REVERSE); return Math.min(max, chosen.get()); } public int estimatedInMemorySizeInBytes() { return (int) (QUANTILE_DIGEST_SIZE + SizeOf.sizeOf(counts) + SizeOf.sizeOf(levels) + SizeOf.sizeOf(values) + SizeOf.sizeOf(lefts) + SizeOf.sizeOf(rights)); } public int estimatedSerializedSizeInBytes() { int nodeSize = SizeOf.SIZE_OF_LONG + // counts SizeOf.SIZE_OF_BYTE + // levels SizeOf.SIZE_OF_LONG + // values SizeOf.SIZE_OF_BYTE; // left/right flags return SizeOf.SIZE_OF_DOUBLE + // maxError SizeOf.SIZE_OF_DOUBLE + // alpha SizeOf.SIZE_OF_LONG + // min SizeOf.SIZE_OF_LONG + // max SizeOf.SIZE_OF_INT + // node count getNodeCount() * nodeSize; } public Slice serialize() { compress(); SliceOutput output = new DynamicSliceOutput(estimatedSerializedSizeInBytes()); output.writeDouble(maxError); output.writeDouble(alpha); output.writeLong(min); output.writeLong(max); output.writeInt(getNodeCount()); int[] nodes = new int[getNodeCount()]; postOrderTraversal(root, new Callback() { int index = 0; @Override public boolean process(int node) { nodes[index++] = node; return true; } }); for (int node : nodes) { output.writeDouble(counts[node]); } for (int node : nodes) { // TODO: levels can only go to 64, so we should be able to pack them better output.writeByte(levels[node]); } for (int node : nodes) { output.writeLong(values[node]); } for (int node : nodes) { // TODO: pack 4 nodes per byte (2 bits each) byte flags = 0; if (lefts[node] != -1) { flags |= Flags.HAS_LEFT; } if (rights[node] != -1) { flags |= Flags.HAS_RIGHT; } output.writeByte(flags); } return output.slice(); } @VisibleForTesting int getNodeCount() { return nextNode - freeCount; } @VisibleForTesting void compress() { double bound = Math.floor(weightedCount / calculateCompressionFactor()); postOrderTraversal(root, node -> { // if children's weights are 0 remove them and shift the weight to their parent int left = lefts[node]; int right = rights[node]; if (left == -1 && right == -1) { // leaf, nothing to do return true; } double leftCount = (left == -1) ? 0.0 : counts[left]; double rightCount = (right == -1) ? 0.0 : counts[right]; boolean shouldCompress = (counts[node] + leftCount + rightCount) < bound; if (left != -1 && (shouldCompress || leftCount < ZERO_WEIGHT_THRESHOLD)) { lefts[node] = tryRemove(left); counts[node] += leftCount; } if (right != -1 && (shouldCompress || rightCount < ZERO_WEIGHT_THRESHOLD)) { rights[node] = tryRemove(right); counts[node] += rightCount; } return true; }); // root's count may have decayed to ~0 if (root != -1 && counts[root] < ZERO_WEIGHT_THRESHOLD) { root = tryRemove(root); } } private double weight(long timestamp) { return Math.exp(alpha * (timestamp - landmarkInSeconds)); } private void rescale(long newLandmarkInSeconds) { // rescale the weights based on a new landmark to avoid numerical overflow issues double factor = Math.exp(-alpha * (newLandmarkInSeconds - landmarkInSeconds)); weightedCount *= factor; for (int i = 0; i < nextNode; i++) { counts[i] *= factor; } landmarkInSeconds = newLandmarkInSeconds; } private int calculateCompressionFactor() { if (root == -1) { return 1; } return Math.max((int) ((levels[root] + 1) / maxError), 1); } private void insert(long value, double count) { if (count < ZERO_WEIGHT_THRESHOLD) { return; } long lastBranch = 0; int parent = -1; int current = root; while (true) { if (current == -1) { setChild(parent, lastBranch, createLeaf(value, count)); return; } long currentValue = values[current]; byte currentLevel = levels[current]; if (!inSameSubtree(value, currentValue, currentLevel)) { // if value and node.value are not in the same branch given node's level, // insert a parent above them at the point at which branches diverge setChild(parent, lastBranch, makeSiblings(current, createLeaf(value, count))); return; } if (currentLevel == 0 && currentValue == value) { // found the node counts[current] += count; weightedCount += count; return; } // we're on the correct branch of the tree and we haven't reached a leaf, so keep going down long branch = value & getBranchMask(currentLevel); parent = current; lastBranch = branch; if (branch == 0) { current = lefts[current]; } else { current = rights[current]; } } } private void setChild(int parent, long branch, int child) { if (parent == -1) { root = child; } else if (branch == 0) { lefts[parent] = child; } else { rights[parent] = child; } } private int makeSiblings(int first, int second) { long firstValue = values[first]; long secondValue = values[second]; int parentLevel = MAX_BITS - Long.numberOfLeadingZeros(firstValue ^ secondValue); int parent = createNode(firstValue, parentLevel, 0); // the branch is given by the bit at the level one below parent long branch = firstValue & getBranchMask(levels[parent]); if (branch == 0) { lefts[parent] = first; rights[parent] = second; } else { lefts[parent] = second; rights[parent] = first; } return parent; } private int createLeaf(long value, double count) { return createNode(value, 0, count); } private int createNode(long value, int level, double count) { int node = popFree(); if (node == -1) { if (nextNode == counts.length) { // try to double the array, but don't allocate too much to avoid going over the upper bound of nodes // by a large margin (hence, the heuristic to not allocate more than k / 5 nodes) int newSize = counts.length + Math.min(counts.length, calculateCompressionFactor() / 5 + 1); counts = Arrays.copyOf(counts, newSize); levels = Arrays.copyOf(levels, newSize); values = Arrays.copyOf(values, newSize); lefts = Arrays.copyOf(lefts, newSize); rights = Arrays.copyOf(rights, newSize); } node = nextNode; nextNode++; } weightedCount += count; values[node] = value; levels[node] = (byte) level; counts[node] = count; lefts[node] = -1; rights[node] = -1; return node; } private int merge(int node, QuantileDigest other, int otherNode) { if (otherNode == -1) { return node; } else if (node == -1) { return copyRecursive(other, otherNode); } else if (!inSameSubtree(values[node], other.values[otherNode], Math.max(levels[node], other.levels[otherNode]))) { return makeSiblings(node, copyRecursive(other, otherNode)); } else if (levels[node] > other.levels[otherNode]) { long branch = other.values[otherNode] & getBranchMask(levels[node]); if (branch == 0) { // variable needed because the array may be re-allocated during merge() int left = merge(lefts[node], other, otherNode); lefts[node] = left; } else { // variable needed because the array may be re-allocated during merge() int right = merge(rights[node], other, otherNode); rights[node] = right; } return node; } else if (levels[node] < other.levels[otherNode]) { long branch = values[node] & getBranchMask(other.levels[otherNode]); // variables needed because the arrays may be re-allocated during merge() int left; int right; if (branch == 0) { left = merge(node, other, other.lefts[otherNode]); right = copyRecursive(other, other.rights[otherNode]); } else { left = copyRecursive(other, other.lefts[otherNode]); right = merge(node, other, other.rights[otherNode]); } int result = createNode(other.values[otherNode], other.levels[otherNode], other.counts[otherNode]); lefts[result] = left; rights[result] = right; return result; } // else, they must be at the same level and on the same path, so just bump the counts weightedCount += other.counts[otherNode]; counts[node] += other.counts[otherNode]; // variables needed because the arrays may be re-allocated during merge() int left = merge(lefts[node], other, other.lefts[otherNode]); int right = merge(rights[node], other, other.rights[otherNode]); lefts[node] = left; rights[node] = right; return node; } private static boolean inSameSubtree(long bitsA, long bitsB, int level) { return level == MAX_BITS || (bitsA >>> level) == (bitsB >>> level); } private int copyRecursive(QuantileDigest other, int otherNode) { if (otherNode == -1) { return otherNode; } int node = createNode(other.values[otherNode], other.levels[otherNode], other.counts[otherNode]); if (other.lefts[otherNode] != -1) { // variable needed because the array may be re-allocated during merge() int left = copyRecursive(other, other.lefts[otherNode]); lefts[node] = left; } if (other.rights[otherNode] != -1) { // variable needed because the array may be re-allocated during merge() int right = copyRecursive(other, other.rights[otherNode]); rights[node] = right; } return node; } /** * Remove the node if possible or set its count to 0 if it has children and * it needs to be kept around */ private int tryRemove(int node) { checkArgument(node != -1, "node is -1"); int left = lefts[node]; int right = rights[node]; if (left == -1 && right == -1) { // leaf, just remove it remove(node); return -1; } if (left != -1 && right != -1) { // node has both children so we can't physically remove it counts[node] = 0; return node; } // node has a single child, so remove it and return the child remove(node); if (left != -1) { return left; } else { return right; } } private void remove(int node) { if (node == nextNode - 1) { // if we're removing the last node, no need to add it to the free list nextNode--; } else { pushFree(node); } if (node == root) { root = -1; } } private void pushFree(int node) { lefts[node] = firstFree; firstFree = node; freeCount++; } private int popFree() { int node = firstFree; if (node == -1) { return node; } firstFree = lefts[firstFree]; freeCount--; return node; } private void postOrderTraversal(int node, Callback callback) { postOrderTraversal(node, callback, TraversalOrder.FORWARD); } private void postOrderTraversal(int node, Callback callback, TraversalOrder order) { if (order == TraversalOrder.FORWARD) { postOrderTraversal(node, callback, lefts, rights); } else { postOrderTraversal(node, callback, rights, lefts); } } private boolean postOrderTraversal(int node, Callback callback, int[] lefts, int[] rights) { if (node == -1) { return false; } int first = lefts[node]; int second = rights[node]; if (first != -1 && !postOrderTraversal(first, callback, lefts, rights)) { return false; } if (second != -1 && !postOrderTraversal(second, callback, lefts, rights)) { return false; } return callback.process(node); } /** * Computes the maximum error of the current digest */ public double getConfidenceFactor() { return computeMaxPathWeight(root) * 1.0 / weightedCount; } @VisibleForTesting boolean equivalent(QuantileDigest other) { return (getNodeCount() == other.getNodeCount() && min == other.min && max == other.max && weightedCount == other.weightedCount && alpha == other.alpha); } private void rescaleToCommonLandmark(QuantileDigest one, QuantileDigest two) { long nowInSeconds = TimeUnit.NANOSECONDS.toSeconds(ticker.read()); // 1. rescale this and other to common landmark long targetLandmark = Math.max(one.landmarkInSeconds, two.landmarkInSeconds); if (nowInSeconds - targetLandmark >= RESCALE_THRESHOLD_SECONDS) { targetLandmark = nowInSeconds; } if (targetLandmark != one.landmarkInSeconds) { one.rescale(targetLandmark); } if (targetLandmark != two.landmarkInSeconds) { two.rescale(targetLandmark); } } /** * Computes the max "weight" of any path starting at node and ending at a leaf in the * hypothetical complete tree. The weight is the sum of counts in the ancestors of a given node */ private double computeMaxPathWeight(int node) { if (node == -1 || levels[node] == 0) { return 0; } double leftMaxWeight = computeMaxPathWeight(lefts[node]); double rightMaxWeight = computeMaxPathWeight(rights[node]); return Math.max(leftMaxWeight, rightMaxWeight) + counts[node]; } @VisibleForTesting void validate() { AtomicDouble sum = new AtomicDouble(); AtomicInteger nodeCount = new AtomicInteger(); Set<Integer> freeSlots = computeFreeList(); checkState(freeSlots.size() == freeCount, "Free count (%s) doesn't match actual free slots: %s", freeCount, freeSlots.size()); if (root != -1) { validateStructure(root, freeSlots); postOrderTraversal(root, node -> { sum.addAndGet(counts[node]); nodeCount.incrementAndGet(); return true; }); } checkState(Math.abs(sum.get() - weightedCount) < ZERO_WEIGHT_THRESHOLD, "Computed weight (%s) doesn't match summary (%s)", sum.get(), weightedCount); checkState(nodeCount.get() == getNodeCount(), "Actual node count (%s) doesn't match summary (%s)", nodeCount.get(), getNodeCount()); } private void validateStructure(int node, Set<Integer> freeNodes) { checkState(levels[node] >= 0); checkState(!freeNodes.contains(node), "Node is in list of free slots: %s", node); if (lefts[node] != -1) { validateBranchStructure(node, lefts[node], rights[node], true); validateStructure(lefts[node], freeNodes); } if (rights[node] != -1) { validateBranchStructure(node, rights[node], lefts[node], false); validateStructure(rights[node], freeNodes); } } private void validateBranchStructure(int parent, int child, int otherChild, boolean isLeft) { checkState(levels[child] < levels[parent], "Child level (%s) should be smaller than parent level (%s)", levels[child], levels[parent]); long branch = values[child] & (1L << (levels[parent] - 1)); checkState(branch == 0 && isLeft || branch != 0 && !isLeft, "Value of child node is inconsistent with its branch"); Preconditions.checkState(counts[parent] > 0 || counts[child] > 0 || otherChild != -1, "Found a linear chain of zero-weight nodes"); } private Set<Integer> computeFreeList() { Set<Integer> freeSlots = new HashSet<>(); int index = firstFree; while (index != -1) { freeSlots.add(index); index = lefts[index]; } return freeSlots; } public String toGraphviz() { StringBuilder builder = new StringBuilder(); builder.append("digraph QuantileDigest {\n") .append("\tgraph [ordering=\"out\"];"); final List<Integer> nodes = new ArrayList<>(); postOrderTraversal(root, node -> { nodes.add(node); return true; }); Multimap<Byte, Integer> nodesByLevel = Multimaps.index(nodes, input -> levels[input]); for (Map.Entry<Byte, Collection<Integer>> entry : nodesByLevel.asMap().entrySet()) { builder.append("\tsubgraph level_" + entry.getKey() + " {\n") .append("\t\trank = same;\n"); for (int node : entry.getValue()) { if (levels[node] == 0) { builder.append(String.format("\t\t%s [label=\"%s:[%s]@%s\\n%s\", shape=rect, style=filled,color=%s];\n", idFor(node), node, lowerBound(node), levels[node], counts[node], counts[node] > 0 ? "salmon2" : "white")); } else { builder.append(String.format("\t\t%s [label=\"%s:[%s..%s]@%s\\n%s\", shape=rect, style=filled,color=%s];\n", idFor(node), node, lowerBound(node), upperBound(node), levels[node], counts[node], counts[node] > 0 ? "salmon2" : "white")); } } builder.append("\t}\n"); } for (int node : nodes) { if (lefts[node] != -1) { builder.append(format("\t%s -> %s [style=\"%s\"];\n", idFor(node), idFor(lefts[node]), levels[node] - levels[lefts[node]] == 1 ? "solid" : "dotted")); } if (rights[node] != -1) { builder.append(format("\t%s -> %s [style=\"%s\"];\n", idFor(node), idFor(rights[node]), levels[node] - levels[rights[node]] == 1 ? "solid" : "dotted")); } } builder.append("}\n"); return builder.toString(); } private static String idFor(int node) { return String.format("node_%x", node); } /** * Convert a java long (two's complement representation) to a 64-bit lexicographically-sortable binary */ private static long longToBits(long value) { return value ^ 0x8000_0000_0000_0000L; } /** * Convert a 64-bit lexicographically-sortable binary to a java long (two's complement representation) */ private static long bitsToLong(long bits) { return bits ^ 0x8000_0000_0000_0000L; } private long getBranchMask(byte level) { return (1L << (level - 1)); } private long upperBound(int node) { // set all lsb below level to 1 (we're looking for the highest value of the range covered by this node) long mask = 0; if (levels[node] > 0) { // need to special case when level == 0 because (value >> 64 really means value >> (64 % 64)) mask = 0xFFFF_FFFF_FFFF_FFFFL >>> (MAX_BITS - levels[node]); } return bitsToLong(values[node] | mask); } private long lowerBound(int node) { // set all lsb below level to 0 (we're looking for the lowest value of the range covered by this node) long mask = 0; if (levels[node] > 0) { // need to special case when level == 0 because (value >> 64 really means value >> (64 % 64)) mask = 0xFFFF_FFFF_FFFF_FFFFL >>> (MAX_BITS - levels[node]); } return bitsToLong(values[node] & (~mask)); } private long middle(int node) { long lower = lowerBound(node); long upper = upperBound(node); return lower + (upper - lower) / 2; } private static Ticker noOpTicker() { return new Ticker() { @Override public long read() { return 0; } }; } public static class Bucket { private double count; private double mean; public Bucket(double count, double mean) { this.count = count; this.mean = mean; } public double getCount() { return count; } public double getMean() { return mean; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } final Bucket bucket = (Bucket) o; if (Double.compare(bucket.count, count) != 0) { return false; } if (Double.compare(bucket.mean, mean) != 0) { return false; } return true; } @Override public int hashCode() { int result; long temp; temp = count != +0.0d ? Double.doubleToLongBits(count) : 0L; result = (int) (temp ^ (temp >>> 32)); temp = mean != +0.0d ? Double.doubleToLongBits(mean) : 0L; result = 31 * result + (int) (temp ^ (temp >>> 32)); return result; } public String toString() { return String.format("[count: %f, mean: %f]", count, mean); } } private interface Callback { /** * @param node the node to process * @return true if processing should continue */ boolean process(int node); } private static class Flags { public static final int HAS_LEFT = 1 << 0; public static final int HAS_RIGHT = 1 << 1; } }