/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.math.stats; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.MahoutTestCase; import org.apache.mahout.math.jet.random.AbstractContinousDistribution; import org.apache.mahout.math.jet.random.Gamma; import org.junit.Test; import java.util.Arrays; import java.util.Random; public final class OnlineSummarizerTest extends MahoutTestCase { @Test public void testStats() { /** the reference limits here were derived using a numerical simulation where I took 10,000 samples from the distribution in question and computed the stats from that sample to get min, 25%-ile, median and so on. I did this 1000 times to get 5% and 95% confidence limits for those values. */ //symmetrical, well behaved System.out.printf("normal\n"); check(normal(10000)); //asymmetrical, well behaved. The range for the maximum was fudged slightly to all this to pass. System.out.printf("exp\n"); check(exp(10000)); //asymmetrical, wacko distribution where mean/median is about 200 System.out.printf("gamma\n"); check(gamma(10000, 0.1)); } private static void check(double[] samples) { OnlineSummarizer s = new OnlineSummarizer(); double mean = 0; double sd = 0; int n = 1; for (double x : samples) { s.add(x); double old = mean; mean += (x - mean) / n; sd += (x - old) * (x - mean); n++; } sd = Math.sqrt(sd / samples.length); Arrays.sort(samples); for (int i = 0; i < 5; i++) { int index = Math.abs(Arrays.binarySearch(samples, s.getQuartile(i))); assertEquals("quartile " + i, i * (samples.length - 1) / 4.0, index, 10); } assertEquals(s.getQuartile(2), s.getMedian(), 0); assertEquals("mean", s.getMean(), mean, 0); assertEquals("sd", s.getSD(), sd, 1e-8); } private static double[] normal(int n) { double[] r = new double[n]; Random gen = RandomUtils.getRandom(1L); for (int i = 0; i < n; i++) { r[i] = gen.nextGaussian(); } return r; } private static double[] exp(int n) { double[] r = new double[n]; Random gen = RandomUtils.getRandom(1L); for (int i = 0; i < n; i++) { r[i] = -Math.log1p(-gen.nextDouble()); } return r; } private static double[] gamma(int n, double shape) { double[] r = new double[n]; Random gen = RandomUtils.getRandom(); AbstractContinousDistribution gamma = new Gamma(shape, shape, gen); for (int i = 0; i < n; i++) { r[i] = gamma.nextDouble(); } return r; } }