/*
 * File:                KolmogorovSmirnovConfidence.java
 * Authors:             Kevin R. Dixon
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright August 15, 2007, Sandia Corporation. Under the terms of Contract
 * DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
 * or on behalf of the U.S. Government. Export of this program may require a
 * license from the United States Government. See CopyrightHistory.txt for
 * complete details.
 */

package gov.sandia.cognition.statistics.method;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.collection.NumberComparator;
import gov.sandia.cognition.statistics.CumulativeDistributionFunction;
import gov.sandia.cognition.statistics.distribution.KolmogorovDistribution;
import gov.sandia.cognition.statistics.distribution.UnivariateGaussian;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;

/**
 * Performs a Kolmogorov-Smirnov Confidence Test. This is often simply called
 * the "K-S test". This is a powerful
 * nonparametric test that determines the probability that two distributions
 * were generated by the same distribution. There are minimal (no?)
 * assumptions on the underlying data or distributions. That is, the
 * distributions are NOT assumed to be Gaussian, etc.
 * @author Kevin R. Dixon
 * @since 2.0
 *
 */
@ConfidenceTestAssumptions(
    name="Kolmogorov-Smirnov test",
    alsoKnownAs="K-S test",
    description={
        "Determines if two datasets were drawn from the same univariate distribution.",
        "Robust, nonparameteric test that makes no assumptions on the underlying distribution (continuous, discrete, etc.)."
    },
    assumptions={
        "The data were sampled independently from each other."
    },
    nullHypothesis="The data were drawn from the same distribution.",
    dataPaired=false,
    dataSameSize=false,
    distribution=KolmogorovDistribution.CDF.class,
    reference=@PublicationReference(
        author="Wikipedia",
        title="Kolmogorov-Smirnov test",
        type=PublicationType.WebPage,
        year=2009,
        url="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"
    )
)
public class KolmogorovSmirnovConfidence
    extends AbstractCloneableSerializable
    implements NullHypothesisEvaluator<Collection<? extends Number>>
{

    /**
     * Default instance of the K-S test.
     */
    public static final KolmogorovSmirnovConfidence INSTANCE =
        new KolmogorovSmirnovConfidence();

    /**
     * Creates a new instance of KolmogorovSmirnovConfidence
     */
    public KolmogorovSmirnovConfidence()
    {
    }

    /**
     * Returns an array of ascending sorted values from the given Collection
     * @param data
     * Collection of doubles to sort into ascending order
     * @return
     * Array of ascending sorted values
     */
    protected static double[] computeAscendingArray(
        Collection<? extends Number> data)
    {
        double[] values = new double[data.size()];
        int index = 0;
        for (Number value : data)
        {
            values[index] = value.doubleValue();
            index++;
        }
        Arrays.sort(values);
        return values;
    }

    /**
     * This is the standard K-S test for two distributions of data. Determines
     * the probability that the two distributions of data were generated by
     * the same underlying distributions. This is a parameter-free test, so
     * the assumptions on the underlying data are minimal (inexistent?).
     * @param data1
     * First dataset to consider
     * @param data2
     * Second dataset to consider
     * @return
     * ConfidenceStatistic from the K-S test.
     */
    @PublicationReference(
        author={
            "William H. Press",
            "Saul A. Teukolsky",
            "William T. Vetterling",
            "Brian P. Flannery"
        },
        title="Numerical Recipes in C, Second Edition",
        type=PublicationType.Book,
        year=1992,
        pages={625,626},
        notes={
            "Section 14.3",
            "Function kstwo()"
        },
        url="http://www.nrbook.com/a/bookcpdf.php"
    )
    @Override
    public KolmogorovSmirnovConfidence.Statistic evaluateNullHypothesis(
        Collection<? extends Number> data1,
        Collection<? extends Number> data2)
    {
        double[] dataArray1 =
            KolmogorovSmirnovConfidence.computeAscendingArray(data1);
        double[] dataArray2 =
            KolmogorovSmirnovConfidence.computeAscendingArray(data2);

        int j1 = 0;
        int j2 = 0;
        double N1 = dataArray1.length;
        double N2 = dataArray2.length;
        double fn1 = 0.0;
        double fn2 = 0.0;

        // Walk the two sorted arrays in tandem, tracking each sample's
        // empirical CDF (fn1, fn2); D is the maximum absolute difference
        // between the two empirical CDFs. NaNs sort to the end of each
        // array, so the isNaN() checks advance past them without letting
        // a NaN comparison stall the merge.
        double D = 0.0;
        while ((j1 < N1) && (j2 < N2))
        {
            double d1 = dataArray1[j1];
            double d2 = dataArray2[j2];
            if (d1 <= d2 || Double.isNaN(d1))
            {
                j1++;
                fn1 = j1 / N1;
            }
            if (d2 <= d1 || Double.isNaN(d2))
            {
                j2++;
                fn2 = j2 / N2;
            }
            double dt = Math.abs(fn2 - fn1);
            if (dt > D)
            {
                D = dt;
            }
        }

        // Effective number of degrees of freedom for the two-sample test.
        double Ne = Math.sqrt((N1 * N2) / (N1 + N2));
        return new KolmogorovSmirnovConfidence.Statistic(Ne, D);
    }

    /**
     * This is the standard K-S test for determining if the given data were
     * generated by the given CDF. Computes the probability that the two
     * distributions of data are actually the same underlying distributions.
     * This is a parameter-free test, so the assumptions on the underlying data
     * are minimal (inexistent?). For example, to test if a dataset is normally
     * distribution, call
     * computeNullHypothesisProbability( data, new UnivariateGaussian.CumulativeDistribution() ).
     * @param <DomainType> Type of Number to consider
     * @param data1 Dataset to consider
     * @param function CDF to compare against the given data
     * @return
     * ConfidenceStatistic from the K-S test.
     */
    @PublicationReference(
        author={
            "William H. Press",
            "Saul A. Teukolsky",
            "William T. Vetterling",
            "Brian P. Flannery"
        },
        title="Numerical Recipes in C, Second Edition",
        type=PublicationType.Book,
        year=1992,
        pages=625,
        notes={
            "Section 14.3",
            "Function ksone()"
        }
    )
    public static <DomainType extends Number> KolmogorovSmirnovConfidence.Statistic evaluateNullHypothesis(
        Collection<? extends DomainType> data1,
        CumulativeDistributionFunction<DomainType> function)
    {
        // This code nulls out the early repeated values. This signals to
        // the subsequent loop that only the final repeated domain value
        // should be tested, making the discontinuities with discrete-valued
        // distributions manageable. -- krdixon 2010-03-24
        ArrayList<DomainType> sortedData1 = new ArrayList<DomainType>( data1 );
        Collections.sort( sortedData1, NumberComparator.INSTANCE );
        for( int n = 1; n < sortedData1.size(); n++ )
        {
            if( sortedData1.get(n-1).equals( sortedData1.get(n) ) )
            {
                sortedData1.set(n-1, null);
            }
        }

        // This method computes the two-tailed K-S statistic (written "D*")
        // This can be transformed into a one-sided statistic by changing the
        // "double dt..." line to select the D+ (fn-ff) or D- (fo-ff) terms.
        // Knuth prefers the one-sided statistics, but I haven't seen a huge
        // difference one way or the other.
        double fo = 0.0;
        double D = 0.0;
        final double Ne = sortedData1.size();
        for (int j = 0; j < Ne; j++)
        {
            // fn/fo bracket the empirical CDF step at this sample; the K-S
            // statistic is the largest gap between either edge of the step
            // and the theoretical CDF value ff.
            double fn = (j + 1) / Ne;
            if( sortedData1.get(j) != null )
            {
                double ff = function.evaluate( sortedData1.get(j) );
                double dt = Math.max(Math.abs(fo - ff), Math.abs(fn - ff));
                if (dt > D)
                {
                    D = dt;
                }
            }
            fo = fn;
        }
        return new KolmogorovSmirnovConfidence.Statistic(Ne, D);
    }

    /**
     * Evaluates the Hypothesis that the given data were generated according
     * to a UnivariateGaussian distribution. A high null-hypothesis
     * probability is not conclusive proof that the data were generated by
     * a Gaussian. However, a low null-hypothesis probability is conclusive
     * that the data were NOT likely generated by a Gaussian
     * @param data
     * Data to evaluate the possibility that they were generated according to
     * a Gaussian Distribution
     * @return Confidence statistic from the K-S test
     */
    public static KolmogorovSmirnovConfidence.Statistic evaluateGaussianHypothesis(
        Collection<Double> data)
    {
        // First, fit the ML Gaussian to the data
        UnivariateGaussian gaussian =
            UnivariateGaussian.MaximumLikelihoodEstimator.learn(data, 0.0);
        UnivariateGaussian.CDF cdf = new UnivariateGaussian.CDF(gaussian);

        // Now, run a standard K-S test against the data and the ML gaussian
        return evaluateNullHypothesis(data, cdf);
    }

    /**
     * Computes the ConfidenceStatistic associated with a K-S test
     */
    public static class Statistic
        extends AbstractConfidenceStatistic
    {

        /**
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         */
        private double D;

        /**
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         */
        private double Ne;

        /**
         * Creates a new instance of Statistic
         * @param Ne
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         * @param D
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         */
        public Statistic(
            double Ne,
            double D)
        {
            super(Statistic.KSsignificance(Ne, D));
            this.setNe(Ne);
            this.setD(D);
        }

        /**
         * Computes the significance of the K-S test from the given degrees of
         * freedom and D-statistic. This approximation is from Numerical
         * Recipes in C, p. 624
         * @param Ne
         * Number of degrees of freedom in the data
         * @param D
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         * @return
         * Probability of the null hypothesis
         */
        @PublicationReference(
            author={
                "William H. Press",
                "Saul A. Teukolsky",
                "William T. Vetterling",
                "Brian P. Flannery"
            },
            title="Numerical Recipes in C, Second Edition",
            type=PublicationType.Book,
            year=1992,
            pages=624,
            notes={
                "Section 14.3",
                "Equation 14.3.9"
            }
        )
        public static double KSsignificance(
            double Ne,
            double D)
        {
            // Asymptotic correction term from NR Equation 14.3.9 before
            // evaluating the Kolmogorov distribution.
            double Nesqrt = Math.sqrt(Ne);
            double x = (Nesqrt + 0.12 + 0.11 / Nesqrt) * D;
            return 1.0 - KolmogorovDistribution.CDF.evaluate(x);
        }

        /**
         * Getter for D
         * @return
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D.
         */
        public double getD()
        {
            return this.D;
        }

        /**
         * Setter for D
         * @param D
         * This is the D-statistic used in the K-S CDF,
         * usually known as the D-statistic, which is the maximum
         * difference between the two distributions. I use the two-tail
         * version of D. 0.0 <= D <= 1.0
         */
        protected void setD(
            double D)
        {
            if ((D < 0.0) || (D > 1.0))
            {
                throw new IllegalArgumentException("0.0 <= D <= 1.0");
            }
            this.D = D;
        }

        /**
         * Getter for Ne
         * @return
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         */
        public double getNe()
        {
            return this.Ne;
        }

        /**
         * Setter for Ne
         * @param Ne
         * This is the degrees of freedom in the K-S distribution for the
         * CDF calculation.
         */
        protected void setNe(
            double Ne)
        {
            if (Ne <= 0.0)
            {
                throw new IllegalArgumentException("Ne > 0.0");
            }
            this.Ne = Ne;
        }

        @Override
        public double getTestStatistic()
        {
            return this.getD();
        }

    }

}