/* * File: BinomialChebyshevConfidence.java * Authors: Kevin R. Dixon * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright October 4, 2007, Sandia Corporation. Under the terms of Contract * DE-AC04-94AL85000, there is a non-exclusive license for use of this work by * or on behalf of the U.S. Government. Export of this program may require a * license from the United States Government. See CopyrightHistory.txt for * complete details. * */ package gov.sandia.cognition.statistics.method; import gov.sandia.cognition.annotation.PublicationReference; import gov.sandia.cognition.annotation.PublicationType; import gov.sandia.cognition.math.ProbabilityUtil; import gov.sandia.cognition.util.AbstractCloneableSerializable; import java.util.Collection; /** * Computes the Bernoulli confidence interval. In other words, computes * the Bernoulli parameter based on * the given data and the desired level of confidence. This answers the * question, "What is true range of classification rates given a * collection of correct/incorrect guesses at a given level of confidence?" * For example, if my classifier gets * { Correct, Wrong, Correct, Correct, Correct, Wrong, Correct, Correct }, * the true classification rate of my classifier at 50% confidence is * Pr{ 0.5335 <= p <= 0.9665 } >= 0.5 * * * @author Kevin R. Dixon * @since 2.0 * */ public class BernoulliConfidence extends AbstractCloneableSerializable implements ConfidenceIntervalEvaluator<Collection<Boolean>> { /** * This class has no members, so here's a static instance. */ public static final BernoulliConfidence INSTANCE = new BernoulliConfidence(); /** Creates a new instance of BernoulliConfidence */ public BernoulliConfidence() { } /** * Computes the ConfidenceInterval for the Bernoulli parameter based on * the given data and the desired level of confidence. This answers the * question, "What is true range of classification rates given a * collection of correct/incorrect guesses at a given level of confidence?" * For example, if my classifier gets * { Correct, Wrong, Correct, Correct, Correct, Wrong, Correct, Correct }, * the true classification rate of my classifier at 50% confidence is * Pr{ 0.5335 <= p <= 0.9665 } >= 0.5 * @param data * Correct/Wrong data * @param confidence * Confidence level to place on the confidence interval, must be (0,1] * @return * Range of values for the accuracy of the classifier at the desired * confidence */ public ConfidenceInterval computeConfidenceInterval( Collection<Boolean> data, double confidence) { int n = 0; for( Boolean value : data ) { if( value == true ) { n++; } } double p = ((double) n) / data.size(); return BernoulliConfidence.computeConfidenceInterval( p, data.size(), confidence ); } /** * Computes the ConfidenceInterval for the Bernoulli parameter based on * the given data and the desired level of confidence. This answers the * question, "What is true range of classification rates given a * collection of correct/incorrect guesses at a given level of confidence?" * For example, if my classifier gets * { Correct, Wrong, Correct, Correct, Correct, Wrong, Correct, Correct }, * the true classification rate of my classifier at 50% confidence is * Pr{ 0.5335 <= p <= 0.9665 } >= 0.5 * * @param bernoulliParameter * Estimated Bernoulli parameter, classifier success rate, must be [0,1] * @param numSamples * Number of samples used in the determination * @param confidence * Confidence level to place on the confidence interval, must be (0,1] * @return * Range of values for the accuracy of the classifier at the desired * confidence */ @PublicationReference( author="Wikipedia", title="", type=PublicationType.WebPage, year=2009, url="http://en.wikipedia.org/wiki/Margin_of_error" ) public static ConfidenceInterval computeConfidenceInterval( double bernoulliParameter, int numSamples, double confidence ) { double p = bernoulliParameter; double pvar = p*(1-p) / numSamples; return INSTANCE.computeConfidenceInterval( p, pvar, numSamples,confidence); } @Override public ConfidenceInterval computeConfidenceInterval( double mean, double variance, int numSamples, double confidence) { ProbabilityUtil.assertIsProbability(mean); return ChebyshevInequality.INSTANCE.computeConfidenceInterval( mean, variance, numSamples, confidence ); } /** * Computes the number of samples needed to estimate the Bernoulli parameter * "p" (mean) within "accuracy" with probability at least "confidence". * Answers the question, "How many people do I need to survey to estimate * how many people would vote for Budweiser as the King of Beers within * a desired accuracy and a set confidence?" For example, to correctly * determine the accuracy within 0.01 with confidence=0.95, we need up to * 50000 samples. * @param accuracy * Desired accuracy to estimate, on the interval (0,1] * @param confidence * Desired confidence, on the interval (0,1] * @return * Maximum number of samples needed to achieve the accuracy with the level * of confidence */ @PublicationReference( author="Wikipedia", title="", type=PublicationType.WebPage, year=2009, url="http://en.wikipedia.org/wiki/Margin_of_error" ) public static int computeSampleSize( double accuracy, double confidence ) { if( (accuracy <= 0.0) || (accuracy > 1.0) ) { throw new IllegalArgumentException( "Accuracy must be (0,1]" ); } if( (confidence <= 0.0) || (confidence > 1.0) ) { throw new IllegalArgumentException( "Confidence must be (0,1]" ); } // We're using the Chebyshev Inequality with a Binomial assumption here: // Pr{ abs(X-mean) >= a } <= variance / a^2 // let a = k*sqrt(variance) // Pr{ abs(X-mean) >= k*sqrt(variance) } <= 1/k^2, // where k is the "number of standard deviations away from the mean" // // If we use a binomial assumption, then // mean = p, and variance=p(1-p)/n // Thus, confidence = Pr{ abs(X-p) < k*sqrt(p(1-p))/sqrt(n) } > 1-1/k^2 // We don't know what "p" is, but we do know that 0<=p<=1 and thus // sqrt(p(1-p)) <= 0.5 (equal when p=0.5). // So, confidence = Pr{ abs(X-p) < k*0.5/sqrt(n) } > 1-1/k^2 // However, we're interested in an "accuracy" value, when // accuracy = k*0.5/sqrt(n) // Number of standard deviations: confidence=1-1/k^2 double numStdDevs = Math.sqrt( 1.0 / (1-confidence) ); // accuracy = k*0.5/sqrt(n) double sqrtn = numStdDevs / (2*accuracy); int n = (int) Math.ceil( sqrtn*sqrtn ); return n; } }