/* * Apache License * Version 2.0, January 2004 * http://www.apache.org/licenses/ * * Copyright 2013 Aurelian Tutuianu * Copyright 2014 Aurelian Tutuianu * Copyright 2015 Aurelian Tutuianu * Copyright 2016 Aurelian Tutuianu * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package rapaio.core.tests; import rapaio.data.filter.var.VFSort; import rapaio.printer.Printable; import rapaio.core.distributions.Distribution; import rapaio.data.Var; import static rapaio.sys.WS.formatFlex; /** * Creates a new statistical Kolmogorov-Smirnoff test. The 1 sample test, with <tt>v</tt> * being the 1 sample. The 1 sample test compare the data to a given densities, * and see if it does not belong to the given densities. The 2 sample test is * designed to tell if the data is not from the same population. * * @author <a href="mailto:padreati@yahoo.com>Aurelian Tutuianu</a> */ public class KSTest implements Printable { private final Distribution cdf; private final Var v1; private final Var v2; private double D; // maximum distance between ECDF1 and F, or ECDF1 and ECFD2 private final double pValue; /** * One-sample K-S test. * <p> * D is the maximum distance between ECDF(v) and given cdf. * pValue is the computed p-value for the KS test against the given densities * <p> * The null hypothesis of this test is that the given data set belongs to the given densities. * The altString hypothesis is that the data set does not belong to the given densities. * * @param cdf the densities to compare against */ public static KSTest oneSampleTest(Var sample, Distribution cdf) { return new KSTest(sample, cdf); } /** * Two-samples K-S test * <p> * D is the maximum distance between ECDF(v1) and ECDF(v2) * pValue is the p-value for the 2 sample KS test * The null hypothesis of this test is that both data sets comes from the same densities, * The altString hypothesis is that the two samples comes from different densities. * * @param sample1 first sample * @param sample2 second sample */ public static KSTest twoSamplesTest(Var sample1, Var sample2) { return new KSTest(sample1, sample2); } private KSTest(Var sample, Distribution cdf) { this.v1 = new VFSort().fitApply(sample); this.cdf = cdf; this.v2 = null; D = 0; double n = v1.rowCount(); double fo = 0.0; for (int i = 0; i < v1.rowCount(); i++) { //ECDF(x) - F(x) double ff = cdf.cdf(v1.value(i)); double fn = (i + 1) / n; D = Math.max(D, Math.abs(fo - ff)); D = Math.max(D, Math.abs(fn - ff)); fo = fn; } n = Math.sqrt(n); pValue = probks((n + 0.12 + 0.11 / n) * D); } private KSTest(Var sample1, Var sample2) { this.v1 = new VFSort().fitApply(sample1); this.v2 = new VFSort().fitApply(sample2); this.cdf = null; D = 0; double fn1 = 0.0; double fn2 = 0.0; int i1 = 0; int i2 = 0; double n1 = v1.rowCount(); double n2 = v2.rowCount(); while (i1 < n1 && i2 < n2) { double d1 = v1.value(i1); double d2 = v2.value(i2); if (d1 <= d2) fn1 = i1++ / n1; if (d2 <= d1) fn2 = i2++ / n2; D = Math.max(D, Math.abs(fn1 - fn2)); } double n = (n1 * n2) / (n1 + n2); n = Math.sqrt(n); pValue = probks((n + 0.12 + 0.11 / n) * D); } private double probks(double x) { final double EPS1 = 0.001; final double EPS2 = 1.0e-10; double a2 = -2.0 * x * x; double fac = 2.0; double sum = 0.0, term, bf = 0.0; for (int i = 1; i <= 100; i++) { term = fac * Math.exp(a2 * i * i); sum += term; if (Math.abs(term) <= EPS1 * bf || Math.abs(term) <= EPS2 * sum) return Math.min(1.0, sum); fac = -fac; bf = Math.abs(term); } return 1.0; } /** * Returns maximum distance between ECDF and given cdf, for 1-sample test, * and returns maximum distance between the two given ECDFs for 2-sample test * * @return maximum distance between densities */ public double d() { return D; } /** * Gets p-value for the given test * * @return p-value */ public double pValue() { return pValue; } @Override public String summary() { StringBuilder sb = new StringBuilder(); if (cdf != null) oneSampleSummary(sb); else twoSamplesSummary(sb); return sb.toString(); } protected String getPValueStars() { if (pValue > 0.1) return ""; if (pValue > 0.05) return "."; if (pValue > 0.01) return "*"; if (pValue > 0.001) return "**"; return "***"; } private void oneSampleSummary(StringBuilder sb) { sb.append("\n > Kolmogorov-Smirnoff 1-sample test\n"); int ties = (int) (v1.rowCount() - v1.stream().mapToDouble().distinct().count()); sb.append(String.format("sample size: %d, ties: %d\n", v1.rowCount(), ties)); if (ties > 0) sb.append(" (warning: p-values will not be exact because of ties)\n"); sb.append(String.format("densities: %s\n", cdf.name())); sb.append("D statistic: ").append(formatFlex(D)).append("\n"); sb.append("p-value: ").append(formatFlex(pValue)).append(" ").append(getPValueStars()).append("\n"); sb.append("\n"); } private void twoSamplesSummary(StringBuilder sb) { sb.append("\n > Kolmogorov-Smirnoff 2-sample test\n"); int ties1 = (int) (v1.rowCount() - v1.stream().mapToDouble().distinct().count()); int ties2 = (int) (v2.rowCount() - v2.stream().mapToDouble().distinct().count()); sb.append(String.format("first sample size: %d, ties: %d\n", v1.rowCount(), ties1)); sb.append(String.format("second sample size: %d, ties: %d\n", v2.rowCount(), ties2)); if (ties1 + ties2 > 0) sb.append(" (warning: p-values will not be exact because of ties)\n"); sb.append(String.format("D statistic: %.6f\n", D)); sb.append(String.format("p-value: %.16f %s\n", pValue, getPValueStars())); sb.append("\n"); } }