// This file is part of OpenTSDB. // Copyright (C) 2010-2012 The OpenTSDB Authors. // // This program is free software: you can redistribute it and/or modify it // under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 2.1 of the License, or (at your // option) any later version. This program is distributed in the hope that it // will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser // General Public License for more details. You should have received a copy // of the GNU Lesser General Public License along with this program. If not, // see <http://www.gnu.org/licenses/>. package net.opentsdb.stats; import java.util.Arrays; /** * A histogram to keep track of the approximation of a distribution of values. * <p> * This is not a general purpose implementation of histogram. It's * specifically designed for "small" values (close to 0) as the primary * use case is latency histograms. * <p> * All values must be positive ({@code >= 0}). * <p> * The histogram is linear (fixed size buckets) up to a given cutoff * point. Beyond that point, the histogram becomes exponential (each * bucket is twice as large as the previous one). This gives good * granularity for lower values while still allowing a rough * classification for the "long tail" of larger values. * <p> * Note that this implementation doesn't allow you to directly control * the number of buckets in the histogram. The number will depend on * the arguments given to the constructor. * <p> * This class is not synchronized. */ public final class Histogram { /** Interval between each bucket for the linear part of the histogram. */ private final short interval; /** Inclusive value beyond which we switch to exponential buckets. */ private final int cutoff; /** * How many linear buckets we have. * Technically we don't need to store this value but we do in order to * avoid having to re-compute it in the fast path each time we add a * new value. */ private final short num_linear_buckets; /** * The power of 2 used by the first exponential bucket. * Technically we don't need to store this value but we do in order to * avoid having to re-compute it in the fast path each time we add a * new value. */ private final short exp_bucket_shift; /** Buckets where we actually store the values. */ private final int[] buckets; /** * Constructor. * @param max The maximum value of the histogram. Any value greater * than this will be considered to be "infinity". * @param interval The interval (size) of each linear bucket. * @param cutoff The value beyond which to switch to exponential * buckets. The histogram may actually use this value or a value up * to {@code interval} greater. * @throws IllegalArgumentException if any of following conditions are * not met: * <pre> * 0 < interval <= max * 0 <= cutoff <= max * </pre> */ public Histogram(final int max, final short interval, final int cutoff) { if (interval > max) { throw new IllegalArgumentException("interval > max! interval=" + interval + ", max=" + max); } else if (cutoff > max) { throw new IllegalArgumentException("cutoff > max! cutoff=" + cutoff + ", max=" + max); } else if (interval < 1) { throw new IllegalArgumentException("interval < 1! interval=" + interval); } else if (cutoff < 0) { throw new IllegalArgumentException("cutoff < 0! interval=" + cutoff); } this.interval = interval; // One linear bucket every `interval' up to `cutoff'. num_linear_buckets = (short) (cutoff / interval); this.cutoff = num_linear_buckets * interval; this.exp_bucket_shift = (short) log2rounddown(interval); this.buckets = new int[num_linear_buckets // Find how many exponential buckets we need, starting from the // first power of 2 that's less than or equal to `interval'. + log2roundup((max - cutoff) >> exp_bucket_shift) // Add an extra overflow bucket at the end. + 1]; } /** * Computes the logarithm base 2 (rounded up) of an integer. * <p> * This is essentially equivalent to * {@code Math.ceil(Math.log(n) / Math.log(2))} * except it's 3 times faster. * @param n A strictly positive integer. * @return The logarithm base 2. As a special case, if the integer * given in argument is 0, this function returns 0. If the integer * given in argument is negative, the return value is undefined. * @see #log2rounddown */ static final int log2roundup(final int n) { int log2 = 0; while (n > 1 << log2) { log2++; } return log2; } /** * Computes the logarithm base 2 (rounded down) of an integer. * <p> * This is essentially equivalent to * {@code Math.floor(Math.log(n) / Math.log(2))} * except it's 4.5 times faster. This function is also almost 70% * faster than {@link #log2roundup}. * @param n A strictly positive integer. * @return The logarithm base 2. As a special case, if the integer * given in argument is 0, this function returns 0. If the integer * given in argument is negative, the return value is undefined. * @see #log2roundup */ static final int log2rounddown(int n) { int log2 = 0; while (n > 1) { n >>>= 1; log2++; } return log2; } /** Returns the number of buckets in this histogram. */ public int buckets() { return buckets.length; } /** * Adds a value to the histogram. * <p> * This method works in {@code O(1)}. * @param value The value to add. * @throws IllegalArgumentException if the value given is negative. */ public void add(final int value) { if (value < 0) { throw new IllegalArgumentException("negative value: " + value); } buckets[bucketIndexFor(value)]++; } /** * Returns the value of the <i>p</i>th percentile in this histogram. * <p> * This method works in {@code O(N)} where {@code N} is the number of * {@link #buckets buckets}. * @param p A strictly positive integer in the range {@code [1; 100]} * @throws IllegalArgumentException if {@code p} is not valid. */ public int percentile(int p) { if (p < 1 || p > 100) { throw new IllegalArgumentException("invalid percentile: " + p); } int count = 0; // Count of values in the histogram. for (int i = 0; i < buckets.length; i++) { count += buckets[i]; } if (count == 0) { // Empty histogram. Need to special-case it, otherwise return 0; // the `if (count <= p)' below will be erroneously true. } // Find the number of elements at or below which the pth percentile is. p = count * p / 100; // Now walk the array backwards and decrement the count until it reaches p. for (int i = buckets.length - 1; i >= 0; i--) { count -= buckets[i]; if (count <= p) { return bucketHighInterval(i); } } return 0; } /** * Prints this histogram in a human readable ASCII format. * <p> * This is equivalent to calling {@link #printAsciiBucket} on every * bucket. * @param out The buffer to which to write the output. */ public void printAscii(final StringBuilder out) { for (int i = 0; i < buckets.length; i++) { printAsciiBucket(out, i); } } /** * Prints a bucket of this histogram in a human readable ASCII format. * @param out The buffer to which to write the output. * @see #printAscii */ final void printAsciiBucket(final StringBuilder out, final int i) { out.append('[') .append(bucketLowInterval(i)) .append('-') .append(i == buckets.length - 1 ? "Inf" : bucketHighInterval(i)) .append("): ") .append(buckets[i]) .append('\n'); } /** Helper for unit tests that returns the value in the given bucket. */ final int valueInBucket(final int index) { return buckets[index]; } /** Finds the index of the bucket in which the given value should be. */ private int bucketIndexFor(final int value) { if (value < cutoff) { return value / interval; } int bucket = num_linear_buckets // Skip all linear buckets. // And find which bucket the rest (after `cutoff') should be in. // Reminder: the first exponential bucket ends at 2^exp_bucket_shift. + log2rounddown((value - cutoff) >> exp_bucket_shift); if (bucket >= buckets.length) { return buckets.length - 1; } return bucket; } /** Returns the low interval (inclusive) of the given bucket. */ private int bucketLowInterval(final int index) { if (index <= num_linear_buckets) { return index * interval; } else { return cutoff + (1 << (index - num_linear_buckets + exp_bucket_shift)); } } /** Returns the high interval (exclusive) of the given bucket. */ private int bucketHighInterval(final int index) { if (index == buckets.length - 1) { return Integer.MAX_VALUE; } else { return bucketLowInterval(index + 1); } } public String toString() { return "Histogram(interval=" + interval + ", cutoff=" + cutoff + ", num_linear_buckets=" + num_linear_buckets + ", exp_bucket_shift=" + exp_bucket_shift + ", buckets=" + Arrays.toString(buckets) + ')'; } }