/*
 * File:                KolmogorovSmirnovConfidence.java
 * Authors:             Kevin R. Dixon
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright August 15, 2007, Sandia Corporation. Under the terms of Contract
 * DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
 * or on behalf of the U.S. Government. Export of this program may require a
 * license from the United States Government. See CopyrightHistory.txt for
 * complete details.
 */

package gov.sandia.cognition.statistics.method;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.collection.NumberComparator;
import gov.sandia.cognition.statistics.CumulativeDistributionFunction;
import gov.sandia.cognition.statistics.distribution.KolmogorovDistribution;
import gov.sandia.cognition.statistics.distribution.UnivariateGaussian;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;

/**
 * Performs a Kolmogorov-Smirnov Confidence Test. This is often simply called
 * the "K-S test". This is a powerful
 * nonparametric test that determines the probability that two distributions
 * were generated by the same distribution. There are minimal (no?)
 * assumptions on the underlying data or distributions. That is, the
 * distributions are NOT assumed to be Gaussian, etc.
 * @author Kevin R. Dixon
 * @since 2.0
 *
 */
@ConfidenceTestAssumptions(
    name="Kolmogorov-Smirnov test",
    alsoKnownAs="K-S test",
    description={
        "Determines if two datasets were drawn from the same univariate distribution.",
        "Robust, nonparameteric test that makes no assumptions on the underlying distribution (continuous, discrete, etc.)."
    },
    assumptions={
        "The data were sampled independently from each other."
    },
    nullHypothesis="The data were drawn from the same distribution.",
    dataPaired=false,
    dataSameSize=false,
    distribution=KolmogorovDistribution.CDF.class,
    reference=@PublicationReference(
        author="Wikipedia",
        title="Kolmogorov-Smirnov test",
        type=PublicationType.WebPage,
        year=2009,
        url="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"
    )
)
public class KolmogorovSmirnovConfidence
    extends AbstractCloneableSerializable
    implements NullHypothesisEvaluator<Collection<? extends Number>>
{

    /**
     * Default instance of the K-S test.
     */
    public static final KolmogorovSmirnovConfidence INSTANCE =
        new KolmogorovSmirnovConfidence();

    /**
     * Creates a new instance of KolmogorovSmirnovConfidence
     */
    public KolmogorovSmirnovConfidence()
    {
    }

    /**
     * Returns an array of ascending sorted values from the given Collection
     * @param data
     * Collection of doubles to sort into ascending order
     * @return
     * Array of ascending sorted values
     */
    protected static double[] computeAscendingArray(
        Collection<? extends Number> data)
    {
        double[] values = new double[data.size()];
        int index = 0;
        for (Number value : data)
        {
            values[index] = value.doubleValue();
            index++;
        }
        Arrays.sort(values);
        return values;
    }

    /**
     * This is the standard K-S test for two distributions of data. Determines
     * the probability that the two distributions of data were generated by
     * the same underlying distributions. This is a parameter-free test, so
     * the assumptions on the underlying data are minimal (inexistent?).
     * @param data1
     * First dataset to consider
     * @param data2
     * Second dataset to consider
     * @return
     * ConfidenceStatistic from the K-S test.
     */
    @PublicationReference(
        author={
            "William H. Press",
            "Saul A. Teukolsky",
            "William T. Vetterling",
            "Brian P. Flannery"
        },
        title="Numerical Recipes in C, Second Edition",
        type=PublicationType.Book,
        year=1992,
        pages={625,626},
        notes={
            "Section 14.3",
            "Function kstwo()"
        },
        url="http://www.nrbook.com/a/bookcpdf.php"
    )
    @Override
    public KolmogorovSmirnovConfidence.Statistic evaluateNullHypothesis(
        Collection<? extends Number> data1,
        Collection<? extends Number> data2)
    {
        double[] dataArray1 =
            KolmogorovSmirnovConfidence.computeAscendingArray(data1);
        double[] dataArray2 =
            KolmogorovSmirnovConfidence.computeAscendingArray(data2);

        int j1 = 0;
        int j2 = 0;
        double N1 = dataArray1.length;
        double N2 = dataArray2.length;
        double fn1 = 0.0;
        double fn2 = 0.0;

        // Walk the two sorted arrays in tandem, tracking each sample's
        // empirical CDF (fn1, fn2); D is the maximum absolute difference
        // between the two empirical CDFs. NaNs sort to the end of each
        // array, so the isNaN() checks advance past them without letting
        // a NaN comparison stall the merge.
        double D = 0.0;
        while ((j1 < N1) && (j2 < N2))
        {
            double d1 = dataArray1[j1];
            double d2 = dataArray2[j2];
            if (d1 <= d2 || Double.isNaN(d1))
            {
                j1++;
                fn1 = j1 / N1;
            }
            if (d2 <= d1 || Double.isNaN(d2))
            {
                j2++;
                fn2 = j2 / N2;
            }
            double dt = Math.abs(fn2 - fn1);
            if (dt > D)
            {
                D = dt;
            }
        }

        // Effective number of degrees of freedom for the two-sample test.
        double Ne = Math.sqrt((N1 * N2) / (N1 + N2));
        return new KolmogorovSmirnovConfidence.Statistic(Ne, D);
    }

    /**
     * This is the standard K-S test for determining if the given data were
     * generated by the given CDF. Computes the probability that the two
     * distributions of data are actually the same underlying distributions.
     * This is a parameter-free test, so the assumptions on the underlying data
     * are minimal (inexistent?). For example, to test if a dataset is normally
     * distribution, call
     * computeNullHypothesisProbability( data, new UnivariateGaussian.CumulativeDistribution() ).
     * @param <DomainType> Type of Number to consider
     * @param data1 Dataset to consider
     * @param function CDF to compare against the given data
     * @return
     * ConfidenceStatistic from the K-S test.
     */
    @PublicationReference(
        author={
            "William H. Press",
            "Saul A. Teukolsky",
            "William T. Vetterling",
            "Brian P. Flannery"
        },
        title="Numerical Recipes in C, Second Edition",
        type=PublicationType.Book,
        year=1992,
        pages=625,
        notes={
            "Section 14.3",
            "Function ksone()"
        }
    )
    public static <DomainType extends Number> KolmogorovSmirnovConfidence.Statistic evaluateNullHypothesis(
        Collection<? extends DomainType> data1,
        CumulativeDistributionFunction<DomainType> function)
    {
        // This code nulls out the early repeated values. This signals to
        // the subsequent loop that only the final repeated domain value
        // should be tested, making the discontinuities with discrete-valued
        // distributions manageable. -- krdixon 2010-03-24
        ArrayList<DomainType> sortedData1 = new ArrayList<DomainType>( data1 );
        Collections.sort( sortedData1, NumberComparator.INSTANCE );
        for( int n = 1; n < sortedData1.size(); n++ )
        {
            if( sortedData1.get(n-1).equals( sortedData1.get(n) ) )
            {
                sortedData1.set(n-1, null);
            }
        }

        // This method computes the two-tailed K-S statistic (written "D*")
        // This can be transformed into a one-sided statistic by changing the
        // "double dt..." line to select the D+ (fn-ff) or D- (fo-ff) terms.
        // Knuth prefers the one-sided statistics, but I haven't seen a huge
        // difference one way or the other.
        double fo = 0.0;
        double D = 0.0;
        final double Ne = sortedData1.size();
        for (int j = 0; j < Ne; j++)
        {
            // fn/fo bracket the empirical CDF step at this sample; the K-S
            // statistic is the largest gap between either edge of the step
            // and the theoretical CDF value ff.
            double fn = (j + 1) / Ne;
            if( sortedData1.get(j) != null )
            {
                double ff = function.evaluate( sortedData1.get(j) );
                double dt = Math.max(Math.abs(fo - ff), Math.abs(fn - ff));
                if (dt > D)
                {
                    D = dt;
                }
            }
            fo = fn;
        }
        return new KolmogorovSmirnovConfidence.Statistic(Ne, D);
    }

    /**
     * Evaluates the Hypothesis that the given data were generated according
     * to a UnivariateGaussian distribution. A high null-hypothesis
     * probability is not conclusive proof that the data were generated by
     * a Gaussian. However, a low null-hypothesis probability is conclusive
     * that the data were NOT likely generated by a Gaussian
     * @param data
     * Data to evaluate the possibility that they were generated according to
     * a Gaussian Distribution
     * @return Confidence statistic from the K-S test
     */
    public static KolmogorovSmirnovConfidence.Statistic evaluateGaussianHypothesis(
        Collection<Double> data)
    {
        // First, fit the ML Gaussian to the data
        UnivariateGaussian gaussian =
            UnivariateGaussian.MaximumLikelihoodEstimator.learn(data, 0.0);
        UnivariateGaussian.CDF cdf = new UnivariateGaussian.CDF(gaussian);

        // Now, run a standard K-S test against the data and the ML gaussian
        return evaluateNullHypothesis(data, cdf);
    }

    /**
     * Computes the ConfidenceStatistic associated with a K-S test
     */
    public static class Statistic
        extends AbstractConfidenceStatistic
    {

        /**
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         */
        private double D;

        /**
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         */
        private double Ne;

        /**
         * Creates a new instance of Statistic
         * @param Ne
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         * @param D
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         */
        public Statistic(
            double Ne,
            double D)
        {
            super(Statistic.KSsignificance(Ne, D));
            this.setNe(Ne);
            this.setD(D);
        }

        /**
         * Computes the significance of the K-S test from the given degrees of
         * freedom and D-statistic. This approximation is from Numerical
         * Recipes in C, p. 624
         * @param Ne
         * Number of degrees of freedom in the data
         * @param D
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         * @return
         * Probability of the null hypothesis
         */
        @PublicationReference(
            author={
                "William H. Press",
                "Saul A. Teukolsky",
                "William T. Vetterling",
                "Brian P. Flannery"
            },
            title="Numerical Recipes in C, Second Edition",
            type=PublicationType.Book,
            year=1992,
            pages=624,
            notes={
                "Section 14.3",
                "Equation 14.3.9"
            }
        )
        public static double KSsignificance(
            double Ne,
            double D)
        {
            // Asymptotic correction term from NR Equation 14.3.9 before
            // evaluating the Kolmogorov distribution.
            double Nesqrt = Math.sqrt(Ne);
            double x = (Nesqrt + 0.12 + 0.11 / Nesqrt) * D;
            return 1.0 - KolmogorovDistribution.CDF.evaluate(x);
        }

        /**
         * Getter for D
         * @return
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         */
        public double getD()
        {
            return this.D;
        }

        /**
         * Setter for D
         * @param D
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D. 0.0 <= D <= 1.0
         */
        protected void setD(
            double D)
        {
            if ((D < 0.0) || (D > 1.0))
            {
                throw new IllegalArgumentException("0.0 <= D <= 1.0");
            }
            this.D = D;
        }

        /**
         * Getter for Ne
         * @return
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         */
        public double getNe()
        {
            return this.Ne;
        }

        /**
         * Setter for Ne
         * @param Ne
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         */
        protected void setNe(
            double Ne)
        {
            if (Ne <= 0.0)
            {
                throw new IllegalArgumentException("Ne > 0.0");
            }
            this.Ne = Ne;
        }

        @Override
        public double getTestStatistic()
        {
            return this.getD();
        }

    }

}