package com.ewjordan.util; /** * A holder for summary statistics for observations of a particular value * that stores mean and standard deviation. Unable to handle medians or * other distributional characteristics because the observations are not * stored. * <p> * This class is meant to be used for extremely large numbers of observations - * if you have less than a million, more functionality is available in * {@link com.ewjordan.PaddyPower.Analysis.StoredObservationSummary} * * @author eric */ public class ObservationSummary { /** Number of observations. */ protected long observationCount; /** Sum of observations. */ protected double sum; /** Product of observations. */ protected double product; /** Sum of squares of observations. */ protected double sumOfSquares; /** Min */ protected double min; /** Max */ protected double max; /** Create a new ObservationSummary. */ public ObservationSummary() { sum = 0.0; product = 1.0; sumOfSquares = 0.0; observationCount = 0; min = Double.MAX_VALUE; max = -Double.MAX_VALUE; } /** * Add a new observation to the summary. * @param num new observation to add */ public void add(double num) { sum += num; product *= num; sumOfSquares += num*num; ++observationCount; if (num > max) max = num; if (num < min) min = num; } /** Add a list of observations to the summary. */ public void add(double ... nums) { for (double num:nums) { add(num); } } /** Add a boolean value as an observation, with 1 <-> true and 0 <-> false. */ public void add(boolean bool) { add(bool?1.0:0.0); } /** Get the number of observations. */ public long getCount() { return observationCount; } /** Minimum observation. */ public double getMin() { return min; } /** Maximum observation. */ public double getMax() { return max; } private boolean zeroMeansIfNoData; /** * Set to true if you prefer zero observation count means to be * reported as 0.0 instead of Double.MAX_VALUE. * @param tf */ public void setZeroMeansIfNoData(boolean tf) { zeroMeansIfNoData = tf; } /** Return the estimated arithmetic mean of the sample. */ public double getArithmeticMean() { if (zeroMeansIfNoData && observationCount <= 0) return 0.0; if (observationCount <= 0) return Double.MAX_VALUE; return sum / observationCount; } /** Return the estimated geometric mean of the sample. */ public double getGeometricMean() { if (zeroMeansIfNoData && observationCount <= 0) return 1.0; if (observationCount <= 0) return Double.MAX_VALUE; return Math.pow(product, 1.0 / observationCount); } /** Return an unbiased estimate of the sample variance. */ public double getVariance() { if (observationCount <= 1) return Double.MAX_VALUE; return (sumOfSquares/(observationCount-1) - sum*sum/(observationCount*(observationCount-1))); } /** Return an unbiased estimate of the sample standard deviation. */ public double getStandardDeviation() { if (zeroMeansIfNoData && observationCount <= 0) return 0.0; if (observationCount <= 1) return Double.MAX_VALUE; return Math.sqrt(getVariance()); } /** Return the z score of the input relative to this set of observations. */ public double getZScore(double num) { return ((num - getArithmeticMean()) / getStandardDeviation()); } /** Lookup table for mean confidence intervals. */ private static final double[] confidenceProbs = { 0.05, 0.062706778, 0.1, 0.125661347, 0.15, 0.189118426, 0.2, 0.253347103, 0.25, 0.318639364, 0.3, 0.385320466, 0.35, 0.45376219, 0.4, 0.524400513, 0.45, 0.597760126, 0.5, 0.67448975, 0.55, 0.755415026, 0.6, 0.841621234, 0.65, 0.934589291, 0.7, 1.036433389, 0.75, 1.15034938, 0.8, 1.281551566, 0.85, 1.439531471, 0.9, 1.644853627, 0.91, 1.69539771, 0.92, 1.750686071, 0.93, 1.811910673, 0.94, 1.880793608, 0.95, 1.959963985, 0.96, 2.053748911, 0.97, 2.170090378, 0.98, 2.326347874, 0.99, 2.575829304, 0.991, 2.612054141, 0.992, 2.652069808, 0.993, 2.696844261, 0.994, 2.747781385, 0.995, 2.807033768, 0.996, 2.878161739, 0.997, 2.967737925, 0.998, 3.090232306, 0.999, 3.290526731 }; private double confidenceLookup(double probability) { if (probability < 0.05 || probability > 0.999) throw new RuntimeException("Requested confidence interval outside of stored range."); //Hardcode the most common cases...these three will cover _almost_ every use of this code if (probability == 0.95) return 1.959963985; else if (probability == 0.99) return 2.575829304; else if (probability == 0.9) return 1.644853627; //Just loop through the list to do the lookup - list is short, so this is probably not worth optimizing right now for (int i=0; i<confidenceProbs.length-3; i += 2) { if (confidenceProbs[i+2] > probability) { return MathUtil.map(probability, confidenceProbs[i], confidenceProbs[i+2], confidenceProbs[i+1], confidenceProbs[i+3]); } } assert(false); //should never get here return 0; } /** * Get the value err such that the mean should be reported * as mean = estimate (+/- err) at the given probability level. * @param probability * @return standard error of arithmetic mean */ public double getMeanConfidence(double probability) { return confidenceLookup(probability) * getStandardDeviation() / Math.sqrt(observationCount); } /** Tests */ static public void main(String[] args) { double[] nums = {1,2,3,4,5,6,7,8,9,8,7,6,5,4,3,2,1}; ObservationSummary summary = new ObservationSummary(); summary.add(nums); System.out.println("Count is "+summary.getCount()+": should be 17"); System.out.println("Mean is "+summary.getArithmeticMean()+": should be ~4.76"); System.out.println("Variance is "+summary.getVariance()+": should be ~6.44"); System.out.println("Stdev is "+summary.getStandardDeviation()+": should be ~2.54"); System.out.println("95% confidence interval around mean is +/-"+summary.getMeanConfidence(0.95)+": should be +/- ~1.206"); } }