BasicStatistics.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.toolbox.statistics;

import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;

import org.fhcrc.cpl.toolbox.datastructure.Pair;

/**
 * A class with generic methods for basic stats.  Add methods as needed.
 * A lot of things are written twice, once for floats and once for doubles.  That's annoying.
 * But it would be wasteful of space and time to do conversions
 */
public class BasicStatistics
{
    /**
     * Calculate the standard deviation of the inputs
     * @param inputs
     * @return
     */
    public static float standardDeviation(float[] inputs)
    {
        if (inputs.length < 2)
        return 0;
        float variance = variance(inputs);

        return (float) Math.sqrt(variance);
    }

    /**
     * Calculate the standard deviation of the inputs
     * @param inputs
     * @return
     */
    public static double standardDeviation(double[] inputs)
    {
        if (inputs.length < 2)
        return 0;
        double variance = variance(inputs);

        return Math.sqrt(variance);
    }

    public static double standardDeviation(List<? extends Number> inputs)
    {
        double[] inputsAsArray = new double[inputs.size()];
        for (int i=0; i<inputs.size(); i++)
            inputsAsArray[i] = inputs.get(i).doubleValue();
        return standardDeviation(inputsAsArray);
    }

    public static float standardDeviationFloatList(List<? extends Number> inputs)
    {
        double[] inputsAsArray = new double[inputs.size()];
        for (int i=0; i<inputs.size(); i++)
            inputsAsArray[i] = inputs.get(i).doubleValue();
        return (float) standardDeviation(inputsAsArray);
    }
    

    /**
     * Calculate the variance of the inputs
     * @param inputs
     * @return
     */
    public static float variance(float[] inputs)
    {
        if (inputs.length < 2)
            return 0;

        float mean = mean(inputs);

        float varianceNumerator = 0;
        for (int i=0; i<inputs.length; i++)
        {
            float diffFromMean = inputs[i] - mean;
            varianceNumerator += (diffFromMean * diffFromMean);
        }
        return varianceNumerator / (inputs.length-1);
    }

    /**
     * Calculate the variance of the inputs
     * @param inputs
     * @return
     */
    public static double variance(double[] inputs)
    {
        if (inputs.length < 2)
            return 0;

        double mean = mean(inputs);

        double varianceNumerator = 0;
        for (int i=0; i<inputs.length; i++)
        {
            double diffFromMean = inputs[i] - mean;
            varianceNumerator += (diffFromMean * diffFromMean);
        }
        return varianceNumerator / (inputs.length-1);
    }

    /**
     * Calculate the mean of the inputs
     * @param inputs
     * @return
     */
    public static double mean(int[] inputs)
    {
        float sum = 0;
        for (int i=0; i<inputs.length; i++)
        {
            sum += inputs[i];
        }
        float mean = sum / inputs.length;
        return mean;
    }

    /**
     * Calculate the mean of the inputs
     * @param inputs
     * @return
     */
    public static float mean(float[] inputs)
    {
        float sum = 0;
        for (int i=0; i<inputs.length; i++)
        {
            sum += inputs[i];
        }
        float mean = sum / inputs.length;
        return mean;
    }



    /**
     * Calculate the mean of the inputs
     * @param inputs
     * @return
     */
    public static double mean(List<? extends Number> inputs)
    {
        return sum(inputs) / (double) inputs.size();
    }

    /**
     * Calculate the mean of the inputs
     * @param inputs
     * @return
     */
    public static double mean(double[] inputs)
    {
        double sum = 0;
        for (int i=0; i<inputs.length; i++)
        {
            sum += inputs[i];
        }
        double mean = sum / inputs.length;
        return mean;
    }

    public static double median(List<? extends Number> inputs)
    {
        double[] inputsArray = new double[inputs.size()];
        for (int i=0; i<inputs.size(); i++)
            inputsArray[i] = inputs.get(i).doubleValue();
        return median(inputsArray);
    }

    /**
     * Calculate the median of the inputs.
     * This is median as defined in R -- average of the middle two points.  NOT necessarily
     * one of the inputs.
     * Clones before sorting, so as not to disrupt input
     * @param inputs
     * @return
     */
    public static double median(double[] inputs)
    {
        //clone before sorting
        double[] inputsClone = inputs.clone();
        Arrays.sort(inputsClone);

        if (inputs.length == 0)
            return 0;
        //if odd, return the middle element
        if (inputsClone.length % 2 == 1)
            return inputsClone[(inputsClone.length-1)/2];
        //if got here, then even.  Average the middle two elements
        return (inputsClone[(inputsClone.length)/2 - 1] +
                inputsClone[(inputsClone.length)/2])  / 2;
    }

    /**
     * Calculate the interquartile range.  I think this is right.
     *
     * Calculate the median, then calculate the median of everything under the
     * median and everything over the median.
     * @param inputs
     * @return
     */
    public static Pair<Double,Double> interQuartileRange(double[] inputs)
    {
        double median = median(inputs);
        List<Double> underMedian = new ArrayList<Double>();
        List<Double> overMedian = new ArrayList<Double>();

        for (Double input : inputs)
        {
            if (input <= median)
                underMedian.add(input);
            else
                overMedian.add(input);
        }

        double q1 = median(underMedian);
        double q3 = median(overMedian);

        return new Pair<Double,Double>(q1,q3);
    }

    public static Pair<Double,Double> interQuartileRange(List<? extends Number> inputs)
    {
        double[] inputsArray = new double[inputs.size()];
        for (int i=0; i<inputs.size(); i++)
            inputsArray[i] = inputs.get(i).doubleValue();
        return interQuartileRange(inputsArray);
    }

    public static double min(double[] values)
    {
        double minValue = Double.MAX_VALUE;
        for (double value : values)
            if (value < minValue)
                minValue = value;
        return minValue;
    }

    public static float min(float[] values)
    {
        float minValue = Float.MAX_VALUE;
        for (float value : values)
            if (value < minValue)
                minValue = value;
        return minValue;
    }

    public static double min(List<? extends Number> values)
    {
        double minValue = Float.MAX_VALUE;
        for (Number value : values)
            if (value.doubleValue() < minValue)
                minValue = value.doubleValue();
        return minValue;
    }

    public static double max(double[] values)
    {
        double maxValue = Double.MIN_VALUE;
        for (double value : values)
            if (value > maxValue)
                maxValue = value;
        return maxValue;
    }

    public static float max(float[] values)
    {
        float maxValue = Float.MIN_VALUE;
        for (float value : values)
            if (value > maxValue)
                maxValue = value;
        return maxValue;
    }

    public static double max(List<? extends Number> values)
    {
        double maxValue = Float.MIN_VALUE;
        for (Number value : values)
            if (value.doubleValue() > maxValue)
                maxValue = value.doubleValue();
        return maxValue;        
    }
    

    /**
     * calculate percentile p of values in list
     * @param values
     * @param p
     * @return
     */
    public static double percentile(List<? extends Number> values, double p)
    {
        double[] inputsArray = new double[values.size()];
        for (int i=0; i<values.size(); i++)
            inputsArray[i] = values.get(i).doubleValue();
        return percentile(inputsArray, p);
    }

    /**
     * Returns an estimate of the <code>p</code>th percentile of the values
     * in the <code>values</code> array, starting with the element in (0-based)
     * position <code>begin</code> in the array and including <code>length</code>
     * values.
     * <p>
     * Calls to this method do not modify the array
     *
     * Copied from org.apache.commons.math.stat
     *
     * todo: figure out for sure whether having median() call this is OK
     *
     * @param values array of input values
     * @param p  the percentile to compute
     * @return  the percentile value
     * @throws IllegalArgumentException if the parameters are not valid or the
     * input array is null
     */
    public static double percentile(double[] values, double p)
    {
        int length = values.length;

        if ((p > 100) || (p <= 0))
        {
            throw new IllegalArgumentException("invalid quantile value: " + p);
        }
        double n = (double) length;
        if (n == 0)
        {
            return Double.NaN;
        }
        if (n == 1)
        {
            return values[0]; // always return single value for n = 1
        }
        double pos = p * (n + 1) / 100;
        double fpos = Math.floor(pos);
        int intPos = (int) fpos;
        double dif = pos - fpos;
        double[] sorted = new double[length];
        System.arraycopy(values, 0, sorted, 0, length);
        Arrays.sort(sorted);

        if (pos < 1)
        {
            return sorted[0];
        }
        if (pos >= n)
        {
            return sorted[length - 1];
        }
        double lower = sorted[intPos - 1];
        double upper = sorted[intPos];
        return lower + dif * (upper - lower);
    }


    /**
     * Calculate the median of the inputs.
     * This is median as defined in R -- average of the middle two points.  NOT necessarily
     * one of the inputs.
     * Clones before sorting, so as not to disrupt input
     * @param inputs
     * @return
     */
    public static float median(float[] inputs)
    {
        //clone before sorting
        float[] inputsClone = inputs.clone();
        Arrays.sort(inputsClone);

        if (inputsClone.length == 0)
            return 0;
        //if odd, return the middle element
        if (inputsClone.length % 2 == 1)
            return inputsClone[(inputsClone.length-1)/2];
        //if got here, then even.  Average the middle two elements
        return (inputsClone[(inputsClone.length)/2 - 1] +
                inputsClone[(inputsClone.length)/2])  / 2;
    }

    /**
     * Compute the leverage of each input:
     * (x-xbar)^2/((n-1)sigmax^2))
     * @param inputs
     * @return
     */
    public static double[] leverages(double[] inputs)
    {
        int n = inputs.length;

        double sigmaInputsSquared = variance(inputs);
        double meanInput = BasicStatistics.mean(inputs);

        double leverageDenomenator = (n-1) * sigmaInputsSquared;

        double[] leverages = new double[n];

        for (int i=0; i < n; i++)
        {
            leverages[i] = (1.0 / (double) n) +
                           (Math.pow(inputs[i] - meanInput,2) / leverageDenomenator);
        }

        return leverages;
    }

    /**
     * Compute the leverage of each input:
     * (x-xbar)^2/((n-1)sigmax^2))
     * @param inputs
     * @return
     */
    public static float[] leverages(float[] inputs)
    {
        int n = inputs.length;

        double sigmaInputsSquared = variance(inputs);
        double meanInput = BasicStatistics.mean(inputs);

        double leverageDenomenator = (n-1) * sigmaInputsSquared;

        float[] leverages = new float[n];

        for (int i=0; i < n; i++)
        {
            leverages[i] = (float) (Math.pow(inputs[i] - meanInput,2) /
                               leverageDenomenator);
        }

        return leverages;
    }

    /**
     * Convenience method
     * @param xValues
     * @param residuals
     * @return
     */
    public static double[] studentizedResiduals(double[] xValues,
                                                double[] residuals)
    {
        double[] leverages = leverages(xValues);
        return studentizedResiduals(xValues, residuals, leverages);                
    }

    /**
     * Calculate the studentized residuals of arrays of somethingorother
     * represented here by their X values, leverages (based on X values),
     * and residuals.
     * @param xValues
     * @param residuals
     * @param leverages
     * @return
     */
    public static double[] studentizedResiduals(double[] xValues,
                                                double[] residuals,
                                                double[] leverages)
    {
        int n = xValues.length;

        if (leverages == null)
            leverages = leverages(xValues);

        //sigmaError requires a denomenator of n-2, so can't use
        //BasicStatistics.standardDeviation()
        double sumSquaredResiduals=0;
        for (double residual : residuals)
            sumSquaredResiduals += Math.pow(residual, 2);

        double sigmaError = Math.sqrt(sumSquaredResiduals / (n - 2));
        //1 + 1/n.  Same for all inputs, compute once
        double onePlusOneOverN = 1 + 1/n;

        double[] studentizedResiduals = new double[n];

        //Find the "studentized residual" of every feature:
        for (int i=0; i < n; i++)
        {
            double scaledResidual = residuals[i] / sigmaError;
            studentizedResiduals[i] = scaledResidual /
                    Math.sqrt(onePlusOneOverN + leverages[i]);
        }

        return studentizedResiduals;
    }

    public static double sum(double[] values)
    {
        double result = 0;
        for (double value : values)
            result += value;
        return result;
    }

    public static double sum(int[] values)
    {
        double result = 0;
        for (double value : values)
            result += value;
        return result;
    }

    public static double sum(List<? extends Number> values)
    {
        double result = 0;
        for (Number value : values)
        {
            double doubleValue = value.doubleValue();
            if (doubleValue != Double.NaN)
                result += doubleValue;
        }
        return result;
    }

    public static float sum(float[] values)
    {
        float result = 0;
        for (float value : values)
            result += value;
        return result;
    }

    public static double coefficientOfVariation(double[] input)
    {
        return standardDeviation(input) / mean(input);
    }

    public static double coefficientOfVariation(List<? extends Number> input)
    {
        return standardDeviation(input) / mean(input);
    }

    /**
     *
     * Covariance = (sum(x*y) - ((sumx)(sumy)/n)) / (n-1)
     * @param xvalues
     * @param yvalues
     * @return
     */
    public static double covariance(double[] xvalues, double[] yvalues)
    {
        int n = xvalues.length;

        double[] xtimesy = new double[n];
        for (int i=0; i<n; i++)
            xtimesy[i] = xvalues[i] * yvalues[i];

        double numerator = sum(xtimesy) - (((sum(xvalues) * sum(yvalues))) / n);
        return numerator / (double) (n - 1);
    }

    /**
     *
     * Covariance = (sum(x*y) - ((sumx)(sumy)/n)) / (n-1)
     * @param xvalues
     * @param yvalues
     * @return
     */
    public static float covariance(float[] xvalues, float[] yvalues)
    {
        int n = xvalues.length;

        float[] xtimesy = new float[n];
        for (int i=0; i<n; i++)
            xtimesy[i] = xvalues[i] * yvalues[i];

        float numerator = sum(xtimesy) - (((sum(xvalues) * sum(yvalues))) / n);
        return numerator / (float) (n - 1);
    }

    public static double covariance(List<? extends Number> xvaluesList, List<? extends Number> yvaluesList)
    {
        int n = xvaluesList.size();
        double[] xtimesy = new double[n];
        for (int i=0; i<n; i++)
            xtimesy[i] = xvaluesList.get(i).doubleValue() * yvaluesList.get(i).doubleValue();

        double numerator = sum(xtimesy) - (((BasicStatistics.sum(xvaluesList) * BasicStatistics.sum(yvaluesList))) / n);
        return numerator / (float) (n - 1);
    }

    /**
     * covariance(xy) / (sx * sy)
     */    
    public static double correlationCoefficient(List<? extends Number> xvaluesList, List<? extends Number> yvaluesList)
    {
        return covariance(xvaluesList, yvaluesList) /
                (standardDeviation(xvaluesList) * standardDeviation(yvaluesList));
    }

    /**
     * covariance(xy) / (sx * sy)    
     */
    public static double correlationCoefficient(double[] xvalues, double[] yvalues)
    {
        return covariance(xvalues, yvalues) /
                (standardDeviation(xvalues) * standardDeviation(yvalues));
    }

    /**
     * covariance(xy) / (sx * sy)
     */
    public static float correlationCoefficient(float[] xvalues, float[] yvalues)
    {
        return covariance(xvalues, yvalues) /
                (standardDeviation(xvalues) * standardDeviation(yvalues));
    }


    /**
     * Geometric mean is defined as ((x1)(x2)(x3)...(xk)) ^ 1/k
     * @param values
     * @return
     */
    public static double geometricMean(double[] values)
    {
        double result = 1.0;
        for(double value : values)
            result *= value;
        return Math.pow(result, 1.0/(double)values.length);
    }

    public static double geometricMean(List<? extends Number> inputs)
    {
        double result = 1.0;
        for(Number value : inputs)
            result *= value.doubleValue();
        return Math.pow(result, 1.0/(double)inputs.size());
    }

    public static float geometricMean(float[] values)
    {
        float result = 1.0f;
        for(float value : values)
            result *= value;
        return (float) Math.pow(result, 1.0f/(float)values.length);
    }

    public static double geometricMean(double value1, double value2)
    {
        double[] values = new double[2];
        values[0] = value1;
        values[1] = value2;

        return geometricMean(values);
    }

    /**
     * Calculate the weighted geometric mean.  Exponentiated weighted mean of logs.
     * @param inputs
     * @param weights
     * @return
     */
    public static double weightedGeometricMean(List<? extends Number> inputs, List<? extends Number> weights)
    {
        List<Double> logInputs = new ArrayList<Double>();
        for (Number input : inputs)
            logInputs.add(Math.log(input.doubleValue()));
        return Math.exp(weightedMean(logInputs, weights));
    }

    /**
     * Calculate the weighted mean
     * @param inputs
     * @param weights
     * @return
     */
    public static double weightedMean(List<? extends Number> inputs, List<? extends Number> weights)
    {
        double weightedSum = 0;
        double sumOfWeights = 0;

        for (int i=0; i<inputs.size(); i++)
        {
            weightedSum += (inputs.get(i).doubleValue() * weights.get(i).doubleValue());
            sumOfWeights += weights.get(i).doubleValue();
        }

        return weightedSum / sumOfWeights;
    }

    /**
     * Calculate a standard Gaussian with mean 0, variance 1
     * @param x
     * @return
     */
    public static double standardGaussian(double x)
    {
        //sqrt(2pi) = 2.50662827463
        return (1.0 / (2.50662827463)) * Math.exp(-Math.pow(x,2) / 2);
    }

    /**
     * Calculate the density of the normal distribution function with parameters mu and sigma,
     * at point x
     * @param mu
     * @param sigma
     * @param x
     * @return
     */
    public static double calcNormalDistDensity(double mu, double sigma, double x)
    {
        //sqrt(2pi) = 2.50662827463
        return Math.pow(Math.E, -.5 * Math.pow((x - mu) / sigma, 2)) / (sigma * 2.50662827463);
    }

    /**
     * Error function for standard normal distribution
     * @param z
     * @return
     */
    public static double erf(double z)
    {
        double t = 1.0 / (1.0 + 0.5 * Math.abs(z));

        // use Horner's method
        double ans = 1 - t * Math.exp( -z*z   -   1.26551223 +
                                            t * ( 1.00002368 +
                                            t * ( 0.37409196 +
                                            t * ( 0.09678418 +
                                            t * (-0.18628806 +
                                            t * ( 0.27886807 +
                                            t * (-1.13520398 +
                                            t * ( 1.48851587 +
                                            t * (-0.82215223 +
                                            t * ( 0.17087277))))))))));
        if (z >= 0)
            return  ans;
        else
            return -ans;
    }

    /**
     * Calculate the <i>cumulative</i> density of the standard normal distribution at point x
     * @param z
     * @return
     */
    public static double calcStandardNormalCumDensity(double z)
    {
        return 0.5 * (1.0 + erf(z / (Math.sqrt(2.0))));
    }


    /**
     * Calculate the <i>cumulative</i> density of the normal distribution function with parameters mu and sigma,
     * at point x
     * @param mu
     * @param sigma
     * @param x
     * @return
     */
    public static double calcNormalCumDensity(double mu, double sigma, double x)
    {
        return calcStandardNormalCumDensity((x - mu) / sigma);
    }
}