/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.math.jet.random; import org.apache.commons.math.ConvergenceException; import org.apache.commons.math.FunctionEvaluationException; import org.apache.commons.math.analysis.UnivariateRealFunction; import org.apache.commons.math.analysis.integration.RombergIntegrator; import org.apache.commons.math.analysis.integration.UnivariateRealIntegrator; import org.junit.Assert; import java.util.Arrays; /** * Provides a consistency check for continuous distributions that relates the pdf, cdf and * samples. The pdf is checked against the cdf by quadrature. The sampling is checked * against the cdf using a G^2 (similar to chi^2) test. */ public final class DistributionChecks { private DistributionChecks() { } public static void checkDistribution(final AbstractContinousDistribution dist, double[] x, double offset, double scale, int n) throws ConvergenceException, FunctionEvaluationException { double[] xs = Arrays.copyOf(x, x.length); for (int i = 0; i < xs.length; i++) { xs[i] = xs[i]*scale+ offset; } Arrays.sort(xs); // collect samples double[] y = new double[n]; for (int i = 0; i < n; i++) { y[i] = dist.nextDouble(); } Arrays.sort(y); // compute probabilities for bins double[] p = new double[xs.length + 1]; double lastP = 0; for (int i = 0; i < xs.length; i++) { double thisP = dist.cdf(xs[i]); p[i] = thisP - lastP; lastP = thisP; } p[p.length - 1] = 1 - lastP; // count samples in each bin int[] k = new int[xs.length + 1]; int lastJ = 0; for (int i = 0; i < k.length - 1; i++) { int j = 0; while (j < n && y[j] < xs[i]) { j++; } k[i] = j - lastJ; lastJ = j; } k[k.length - 1] = n - lastJ; // now verify probabilities by comparing to integral of pdf UnivariateRealIntegrator integrator = new RombergIntegrator(); for (int i = 0; i < xs.length - 1; i++) { double delta = integrator.integrate(new UnivariateRealFunction() { @Override public double value(double v) { return dist.pdf(v); } }, xs[i], xs[i + 1]); Assert.assertEquals(delta, p[i + 1], 1.0e-6); } // finally compute G^2 of observed versus predicted. See http://en.wikipedia.org/wiki/G-test double sum = 0; for (int i = 0; i < k.length; i++) { if (k[i] != 0) { sum += k[i] * Math.log(k[i] / p[i] / n); } } sum *= 2; // sum is chi^2 distributed with degrees of freedom equal to number of partitions - 1 int dof = k.length - 1; // fisher's approximation is that sqrt(2*x) is approximately unit normal with mean sqrt(2*dof-1) double z = Math.sqrt(2 * sum) - Math.sqrt(2 * dof - 1); Assert.assertTrue(String.format("offset=%.3f scale=%.3f Z = %.1f", offset, scale, z), Math.abs(z) < 3); } static void checkCdf(double offset, double scale, AbstractContinousDistribution dist, double[] breaks, double[] quantiles) { int i = 0; for (double x : breaks) { Assert.assertEquals(String.format("m=%.3f sd=%.3f x=%.3f", offset, scale, x), quantiles[i], dist.cdf(x * scale + offset), 1.0e-6); i++; } } }