/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.math.statistics.nonparametric.kernels; import xxl.core.math.Statistics; import xxl.core.math.functions.AbstractRealFunctionFunction; /** * One main problem of nonparametric statistics is the estimation of the <tt>probability density * function (pdf)</tt> of a random variable. The estimation typically relies on a given sample, in our case * data given by objects of type <tt>Number</tt>. For density estimation kernel based techniques are quite * robust and useful. * There, for each data point X_i a {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel} * is centered at X_i. Then, the sum is built over all kernels. * The kernel density estimator inherits the basic properties of the * kernel function. For this function different choices are possible * whereas a pdf is advisable (that is restricted to [-1,1] in our case). The mostly used kernels are positive * and symmetric with a compact support, e.g., the {@link xxl.core.math.statistics.nonparametric.kernels.EpanechnikowKernel * Epanechnikow kernel} or the {@link xxl.core.math.statistics.nonparametric.kernels.GaussianKernel Gaussian kernel}. * The choice of the bandwidth h, which controls the degree of * smoothness, is crucial for the quality of the estimation. There is * a trade-off in choosing the bandwidth. If h is too high, the * resulting estimate is smooth and may hide local features of the * density. For a small bandwidth, the estimate will be spiky and * artificial artifacts may be introduced. * <br> * The estimator itself extends {@link xxl.core.math.functions.AbstractRealFunctionFunction AbstractRealFunctionFunction}. * Thus, it is implemented as a one-dimensional * {@link xxl.core.functions.Function function} expecting * objects of type <tt>Number</tt> as well as a * {@link xxl.core.math.functions.RealFunction RealFunction} expecting * data of type <tt>double</tt>. * <br> * The estimator is updatable, so one can use it as an aggregate function * combined with a special aggregation iterator. * Since this class provides a preimplementation it is abstract and implementing classes * only need to implement the method {@link #evalKDE(double x)}. * * <b>Note</b>: A sample of any kind of data given by an iterator could * be easily obtained through the {@link xxl.core.cursors.mappers.ReservoirSampler reservoir sampler * cursor}. * <br> * Generally, a more detailed coverage of kernel based density estimation respectively cdf estimation * is given in [Sco92]: David W. Scott. Multivariate Density Estimation: * Theory, Practice, and Visualization. 1992. * * @see xxl.core.cursors.mappers.Aggregator * @see xxl.core.functions.Function * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction * @see xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths * @see xxl.core.math.functions.AbstractRealFunctionFunction * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator * @see xxl.core.math.statistics.nonparametric.kernels.ReflectionKernelDensityEstimator * */ public abstract class AbstractKernelDensityEstimator extends AbstractRealFunctionFunction { /** used kernel function for density estimation */ protected KernelFunction kf; /** used bandwidth for density estimation */ protected double h; /** used sample */ protected Object[] sample; /** indicates whether sample, variance or bounds have changed */ protected boolean hasChanged = true; /** confidence level*/ protected double alpha; /** last computed function value */ protected double y; /** last evaluated function value */ protected double xLast; /** * Gives a skeleton implementation for a kernel based density estimation function. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>Number</tt>. * @param h used bandwidth for computing the estimation * @param alpha confidence level used for computing an asymptotic pointwise confidence interval */ public AbstractKernelDensityEstimator(KernelFunction kf, Object[] sample, double h, double alpha) { this.kf = kf; this.sample = sample; this.h = h; this.alpha = alpha; } /** * Gives a skeleton implementation for a kernel based density estimation function. * * The thumb rule respectively normal scale rule is used * for computing the bandwidth. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>Number</tt>. * @param h used bandwidth for computing the estimation */ public AbstractKernelDensityEstimator(KernelFunction kf, Object[] sample, double h) { this(kf, sample, h, -1.0); } /** Evaluates the kernel based density estimator at a given point x. * * @param x argument where to evaluate the density estimation * @return value of the estimated density at x */ public double eval(double x) { if (hasChanged()) reinit(); y = evalKDE(x); // store the last computed function value for further using (i.e. confidence) xLast = x; return y; } /** Evaluates the density estimator at given point x. * * @param x argument where to evaluate the density estimation * @return value of the estimated density at x */ protected abstract double evalKDE(double x); /** Sets a new sample. If the sample has changed, i.e. old_sample.equals (new_sample) * returns false, <tt>changed</tt> will be set true. * *@param newSample new sample */ public void setSample(Object[] newSample) { if (!newSample.equals(sample)) { hasChanged = true; sample = newSample; } } /** Returns the last used bandwidth for estimation. * * @return the last used bandwidth */ public double getBandwidth() { return h; } /** Sets the new bandwidth used to compute the estimation. * * @param h new bandwidth */ public void setBandwidth(double h) { this.h = h; } /** Indicates whether something has changed. Thus, a recomputation may become necessary. * * @return true, if anything has changed. */ public boolean hasChanged() { return hasChanged; } /** * Returns the current confidence interval for the last computed function point. * <BR> * This method computes * a pointwise confidence interval for the kernel density estimator. * For further details see<br> * [Hae91]: W. Haerdle, Smoothing Techniques, With Implementations in S, * Springer, New York, 1991, p. 62.<br> * * @throws UnsupportedOperationException if no confidence level alpha has been given and * hence no confidence interval for the given value can be computed * @return current epsilon of the confidence interval given by [ f(x)-epsilon_low, f(x)+epsilon_up] */ public double[] epsilon() throws UnsupportedOperationException { if (alpha < 0) { throw new UnsupportedOperationException("computing of confidence intervals not supported by this estimator"); } else { double[] re = new double[2]; if (hasChanged) reinit(); double f2nd = Kernels.kernelDerivativeEstimator( xLast, sample, 2, KernelBandwidths.oversmoothingRule( sample.length, new GaussianKernel(), Math.sqrt(Statistics.sampleVariance(sample)))); double c = 1.0 / (h * Math.pow(sample.length, 0.2)); double r = (y * kf.r()) / c; double z = Statistics.normalQuantil(1.0 - (0.5 * alpha)); re[1] = 0.5 * c * c * Math.sqrt(kf.r()) * f2nd - z * Math.sqrt(r); // upper confidence interval re[0] = 0.5 * c * c * Math.sqrt(kf.r()) * f2nd + z * Math.sqrt(r); // lower confidence interval re[1] = (-1.0) * re[1] / Math.pow(sample.length, 0.4); re[0] = re[0] / Math.pow(sample.length, 0.4); return re; } } /** * Returns the current confidence interval for given value x. * <BR> * This method computes * a pointwise confidence interval for the kernel density estimator. * For further details see [Hae91]: W. Haerdle, Smoothing Techniques, With Implementations in S, * Springer, New Yor, 1991, p. 62.<br> * * @param x argument where to evaluate the confidence interval * @throws UnsupportedOperationException if no confidence level alpha has been given and * hence no confidence interval for the given value could be computed * @return current epsilon of the confidence interval given by [ f(x)-epsilon, f(x)+epsilon] */ public double[] epsilon(double x) throws UnsupportedOperationException { if (alpha < 0) throw new UnsupportedOperationException("computing of confidence intervals not supported by this estimator"); y = eval(x); return epsilon(); } /** Returns p=1-alpha. * * @return given confidence, i.e., p = 1-alpha * @throws UnsupportedOperationException if no confidence level has been given and * hence no confidence interval can be computed */ public double confidence() throws UnsupportedOperationException { if (alpha < 0) throw new UnsupportedOperationException("computing of confidence intervals not supported by this estimator"); else return 1 - alpha; } /** Reinitializes the estimator after changes.*/ protected void reinit() { hasChanged = false; } }