/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.math.stat.descriptive.rank;

import java.io.Serializable;
import java.util.Arrays;

import org.apache.commons.math.exception.OutOfRangeException;
import org.apache.commons.math.exception.util.LocalizedFormats;
import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic;
import org.apache.commons.math.util.FastMath;

/**
 * Provides percentile computation.
 * <p>
 * There are several commonly used methods for estimating percentiles (a.k.a.
 * quantiles) based on sample data. For large samples, the different methods
 * agree closely, but when sample sizes are small, different methods will give
 * significantly different results. The algorithm implemented here works as follows:
 * <ol>
 * <li>Let <code>n</code> be the length of the (sorted) array and
 * <code>0 < p <= 100</code> be the desired percentile.</li>
 * <li>If <code>n = 1</code> return the unique array element (regardless of
 * the value of <code>p</code>); otherwise</li>
 * <li>Compute the estimated percentile position
 * <code>pos = p * (n + 1) / 100</code> and the difference, <code>d</code>,
 * between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
 * part of <code>pos</code>). If <code>pos >= n</code> return the largest
 * element in the array; otherwise</li>
 * <li>Let <code>lower</code> be the element in position
 * <code>floor(pos)</code> in the array and let <code>upper</code> be the
 * next element in the array. Return <code>lower + d * (upper - lower)</code>
 * </li>
 * </ol></p>
 * <p>
 * To compute percentiles, the data must be at least partially ordered. Input
 * arrays are copied and recursively partitioned using an ordering definition.
 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
 * by {@link java.lang.Double#compareTo(Double)}. This ordering makes
 * <code>Double.NaN</code> larger than any other value (including
 * <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median
 * (50th percentile) of
 * <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5</code>.</p>
 * <p>
 * Since percentile estimation usually involves interpolation between array
 * elements, arrays containing <code>NaN</code> or infinite values will often
 * result in <code>NaN</code> or infinite values being returned.</p>
 * <p>
 * Since 2.2, the Percentile implementation uses only selection instead of complete
 * sorting and caches selection algorithm state between calls to the various
 * {@code evaluate} methods when several percentiles are to be computed on the same data.
 * This greatly improves efficiency, both for single percentile and multiple
 * percentile computations.
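 * For example (an illustrative usage sketch relying only on methods of this
 * class, not on any additional API), several percentiles can be computed on
 * the same data set while reusing the cached partitioning state:
 * <pre>
 *   double[] data = ...; // sample values
 *   Percentile percentile = new Percentile();
 *   percentile.setData(data);                  // store the data and reset the pivot cache
 *   double q1     = percentile.evaluate(25.0); // partitions the stored data as needed
 *   double median = percentile.evaluate(50.0); // reuses pivots cached by the previous call
 *   double q3     = percentile.evaluate(75.0);
 * </pre>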
 * However, it also induces a need to be sure the data used at one call to
 * {@code evaluate} is the same as the data for which the algorithm state was
 * cached by the previous calls. Percentile does this by checking the array
 * reference itself: cached pivots are reused only when {@code evaluate} operates
 * on the very array previously stored through {@link #setData(double[])}. To
 * benefit from the cache when computing several percentiles on the same data,
 * set the data once and then call {@link #evaluate(double)} for each percentile.
 * </p>
 * <p>
 * <strong>Note that this implementation is not synchronized.</strong> If
 * multiple threads access an instance of this class concurrently, and at least
 * one of the threads invokes the <code>setData</code> or
 * <code>evaluate</code> methods, it must be synchronized externally.</p>
 *
 * @version $Id: Percentile.java 1131229 2011-06-03 20:49:25Z luc $
 */
public class Percentile extends AbstractUnivariateStatistic implements Serializable {

    /** Serializable version identifier */
    private static final long serialVersionUID = -8091216485095130416L;

    /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
    private static final int MIN_SELECT_SIZE = 15;

    /** Maximum number of partitioning pivots cached (each level doubles the number of pivots). */
    private static final int MAX_CACHED_LEVELS = 10;

    /** Determines what percentile is computed when evaluate() is activated
     * with no quantile argument */
    private double quantile = 0.0;

    /** Cached pivots. */
    private int[] cachedPivots;

    /**
     * Constructs a Percentile with a default quantile
     * value of 50.0.
     */
    public Percentile() {
        this(50.0);
    }

    /**
     * Constructs a Percentile with the specific quantile value.
     * @param p the quantile
     * @throws IllegalArgumentException if p is not greater than 0 and less
     * than or equal to 100
     */
    public Percentile(final double p) {
        setQuantile(p);
        cachedPivots = null;
    }

    /**
     * Copy constructor, creates a new {@code Percentile} identical
     * to the {@code original}.
     *
     * @param original the {@code Percentile} instance to copy
     */
    public Percentile(Percentile original) {
        copy(original, this);
    }

    /** {@inheritDoc} */
    @Override
    public void setData(final double[] values) {
        if (values == null) {
            cachedPivots = null;
        } else {
            cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
            Arrays.fill(cachedPivots, -1);
        }
        super.setData(values);
    }

    /** {@inheritDoc} */
    @Override
    public void setData(final double[] values, final int begin, final int length) {
        if (values == null) {
            cachedPivots = null;
        } else {
            cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
            Arrays.fill(cachedPivots, -1);
        }
        super.setData(values, begin, length);
    }

    /**
     * Returns the result of evaluating the statistic over the stored data.
     * <p>
     * The stored array is the one which was set by previous calls to
     * {@link #setData(double[])}.
     * </p>
     * @param p the percentile value to compute
     * @return the value of the statistic applied to the stored data
     */
    public double evaluate(final double p) {
        return evaluate(getDataRef(), p);
    }

    /**
     * Returns an estimate of the <code>p</code>th percentile of the values
     * in the <code>values</code> array.
     * <p>
     * Calls to this method do not modify the internal <code>quantile</code>
     * state of this statistic.</p>
     * <p>
     * <ul>
     * <li>Returns <code>Double.NaN</code> if <code>values</code> has length
     * <code>0</code></li>
     * <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
     * if <code>values</code> has length <code>1</code></li>
     * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
     * is null or p is not a valid quantile value (p must be greater than 0
     * and less than or equal to 100)</li>
     * </ul></p>
     * <p>
     * See {@link Percentile} for a description of the percentile estimation
     * algorithm used.</p>
     *
     * @param values input array of values
     * @param p the percentile value to compute
     * @return the percentile value or Double.NaN if the array is empty
     * @throws IllegalArgumentException if <code>values</code> is null
     * or p is invalid
     */
    public double evaluate(final double[] values, final double p) {
        test(values, 0, 0);
        return evaluate(values, 0, values.length, p);
    }

    /**
     * Returns an estimate of the <code>quantile</code>th percentile of the
     * designated values in the <code>values</code> array. The quantile
     * estimated is determined by the <code>quantile</code> property.
     * <p>
     * <ul>
     * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
     * <li>Returns (for any value of <code>quantile</code>)
     * <code>values[begin]</code> if <code>length = 1</code></li>
     * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
     * is null, or <code>start</code> or <code>length</code>
     * is invalid</li>
     * </ul></p>
     * <p>
     * See {@link Percentile} for a description of the percentile estimation
     * algorithm used.</p>
     *
     * @param values the input array
     * @param start index of the first array element to include
     * @param length the number of elements to include
     * @return the percentile value
     * @throws IllegalArgumentException if the parameters are not valid
     */
    @Override
    public double evaluate(final double[] values, final int start, final int length) {
        return evaluate(values, start, length, quantile);
    }

    /**
     * Returns an estimate of the <code>p</code>th percentile of the values
     * in the <code>values</code> array, starting with the element in (0-based)
     * position <code>begin</code> in the array and including <code>length</code>
     * values.
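     * <p>
     * For example (an illustrative hand computation following the estimation
     * steps described in the {@link Percentile} class javadoc),
     * {@code evaluate(new double[] {1, 2, 3, 4}, 0, 4, 50)} yields {@code 2.5}:
     * the estimated position is {@code 50 * (4 + 1) / 100 = 2.5}, so the result
     * interpolates halfway between the 2nd and 3rd smallest values.</p>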
     * <p>
     * Calls to this method do not modify the internal <code>quantile</code>
     * state of this statistic.</p>
     * <p>
     * <ul>
     * <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
     * <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
     * if <code>length = 1</code></li>
     * <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
     * is null, <code>begin</code> or <code>length</code> is invalid, or
     * <code>p</code> is not a valid quantile value (p must be greater than 0
     * and less than or equal to 100)</li>
     * </ul></p>
     * <p>
     * See {@link Percentile} for a description of the percentile estimation
     * algorithm used.</p>
     *
     * @param values array of input values
     * @param begin the first (0-based) element to include in the computation
     * @param length the number of array elements to include
     * @param p the percentile to compute
     * @return the percentile value
     * @throws IllegalArgumentException if the parameters are not valid or the
     * input array is null
     */
    public double evaluate(final double[] values, final int begin,
                           final int length, final double p) {

        test(values, begin, length);

        if ((p > 100) || (p <= 0)) {
            throw new OutOfRangeException(
                    LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
        }
        if (length == 0) {
            return Double.NaN;
        }
        if (length == 1) {
            return values[begin]; // always return single value for n = 1
        }

        double n = length;
        double pos = p * (n + 1) / 100;
        double fpos = FastMath.floor(pos);
        int intPos = (int) fpos;
        double dif = pos - fpos;

        double[] work;
        int[] pivotsHeap;
        if (values == getDataRef()) {
            work = getDataRef();
            pivotsHeap = cachedPivots;
        } else {
            work = new double[length];
            System.arraycopy(values, begin, work, 0, length);
            pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
            Arrays.fill(pivotsHeap, -1);
        }

        if (pos < 1) {
            return select(work, pivotsHeap, 0);
        }
        if (pos >= n) {
            return select(work, pivotsHeap, length - 1);
        }
        double lower = select(work, pivotsHeap, intPos - 1);
        double upper = select(work, pivotsHeap, intPos);
        return lower + dif * (upper - lower);
    }

    /**
     * Select the k<sup>th</sup> smallest element from work array
     * @param work work array (will be reorganized during the call)
     * @param pivotsHeap set of pivot indices corresponding to elements that
     * are already at their sorted location, stored as an implicit heap
     * (i.e. a sorted binary tree stored in a flat array, where the
     * children of a node at index n are at indices 2n+1 for the left
     * child and 2n+2 for the right child, with 0-based indices)
     * @param k index of the desired element
     * @return k<sup>th</sup> smallest element
     */
    private double select(final double[] work, final int[] pivotsHeap, final int k) {

        int begin = 0;
        int end   = work.length;
        int node  = 0;

        while (end - begin > MIN_SELECT_SIZE) {

            final int pivot;
            if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
                // the pivot has already been found in a previous call
                // and the array has already been partitioned around it
                pivot = pivotsHeap[node];
            } else {
                // select a pivot and partition work array around it
                pivot = partition(work, begin, end, medianOf3(work, begin, end));
                if (node < pivotsHeap.length) {
                    pivotsHeap[node] = pivot;
                }
            }

            if (k == pivot) {
                // the pivot was exactly the element we wanted
                return work[k];
            } else if (k < pivot) {
                // the element is in the left partition
                end  = pivot;
                node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
            } else {
                // the element is in the right partition
                begin = pivot + 1;
                node  = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
            }

        }

        // the element is somewhere in the small sub-array
        // sort the sub-array using insertion sort
        insertionSort(work, begin, end);
        return work[k];
    }

    /** Select a pivot index as the median of three
     * @param work data array
     * @param begin index of the first element of the slice
     * @param end index after the last element of the slice
     * @return the index of the median element chosen between the
     * first, the middle and the last element of the array slice
     */
    int medianOf3(final double[] work, final int begin, final int end) {

        final int inclusiveEnd = end - 1;
        final int middle       = begin + (inclusiveEnd - begin) / 2;
        final double wBegin    = work[begin];
        final double wMiddle   = work[middle];
        final double wEnd      = work[inclusiveEnd];

        if (wBegin < wMiddle) {
            if (wMiddle < wEnd) {
                return middle;
            } else {
                return (wBegin < wEnd) ? inclusiveEnd : begin;
            }
        } else {
            if (wBegin < wEnd) {
                return begin;
            } else {
                return (wMiddle < wEnd) ? inclusiveEnd : middle;
            }
        }
    }

    /**
     * Partition an array slice around a pivot.
     * <p>
     * Partitioning exchanges array elements such that all elements
     * smaller than pivot are before it and all elements larger than
     * pivot are after it.
     * </p>
     * @param work data array
     * @param begin index of the first element of the slice
     * @param end index after the last element of the slice
     * @param pivot initial index of the pivot
     * @return index of the pivot after partition
     */
    private int partition(final double[] work, final int begin, final int end, final int pivot) {

        final double value = work[pivot];
        work[pivot] = work[begin];

        int i = begin + 1;
        int j = end - 1;
        while (i < j) {
            while ((i < j) && (work[j] >= value)) {
                --j;
            }
            while ((i < j) && (work[i] <= value)) {
                ++i;
            }

            if (i < j) {
                final double tmp = work[i];
                work[i++] = work[j];
                work[j--] = tmp;
            }
        }

        if ((i >= end) || (work[i] > value)) {
            --i;
        }
        work[begin] = work[i];
        work[i]     = value;
        return i;
    }

    /**
     * Sort in place a (small) array slice using insertion sort
     * @param work array to sort
     * @param begin index of the first element of the slice to sort
     * @param end index after the last element of the slice to sort
     */
    private void insertionSort(final double[] work, final int begin, final int end) {
        for (int j = begin + 1; j < end; j++) {
            final double saved = work[j];
            int i = j - 1;
            while ((i >= begin) && (saved < work[i])) {
                work[i + 1] = work[i];
                i--;
            }
            work[i + 1] = saved;
        }
    }

    /**
     * Returns the value of the quantile field (determines what percentile is
     * computed when evaluate() is called with no quantile argument).
     *
     * @return quantile
     */
    public double getQuantile() {
        return quantile;
    }

    /**
     * Sets the value of the quantile field (determines what percentile is
     * computed when evaluate() is called with no quantile argument).
     *
     * @param p a value satisfying 0 < p <= 100
     * @throws IllegalArgumentException if p is not greater than 0 and less
     * than or equal to 100
     */
    public void setQuantile(final double p) {
        if (p <= 0 || p > 100) {
            throw new OutOfRangeException(
                    LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p, 0, 100);
        }
        quantile = p;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Percentile copy() {
        Percentile result = new Percentile();
        copy(this, result);
        return result;
    }

    /**
     * Copies source to dest.
     * <p>Neither source nor dest can be null.</p>
     *
     * @param source Percentile to copy
     * @param dest Percentile to copy to
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Percentile source, Percentile dest) {
        dest.setData(source.getDataRef());
        if (source.cachedPivots != null) {
            System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
        }
        dest.quantile = source.quantile;
    }

}