/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.math.statistics.nonparametric.kernels;

import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import xxl.core.collections.queues.Heap;
import xxl.core.util.Distance;

/**
 * One of the main problems of nonparametric statistics is the estimation of the
 * <tt>probability density function (pdf)</tt> of a random variable. The estimation typically
 * relies on a given sample, in our case data given by objects of type <tt>Number</tt>.
 * For density estimation, kernel based techniques are quite robust and useful.
 * There, a {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel}
 * is centered at each data point X_i, and the estimate is obtained as the normalized sum
 * over all kernels. The kernel density estimator inherits the basic properties of the
 * kernel function. Different choices for this function are possible, but a pdf is advisable
 * (in our case one restricted to [-1,1]). The most commonly used kernels are positive
 * and symmetric with a compact support, e.g., the
 * {@link xxl.core.math.statistics.nonparametric.kernels.EpanechnikowKernel Epanechnikow kernel}
 * or the {@link xxl.core.math.statistics.nonparametric.kernels.GaussianKernel Gaussian kernel}.
 * The choice of the bandwidth h, which controls the degree of smoothness, is crucial for the
 * quality of the estimation, and there is a trade-off in choosing it. If h is too large, the
 * resulting estimate is very smooth and may hide local features of the density. If h is too
 * small, the estimate becomes spiky and spurious artifacts may be introduced.
 * <br>
 * If the primitive of the kernel is known, this approach can be extended towards the estimation
 * of the <tt>cumulative distribution function (cdf)</tt> by simply integrating the kernel based
 * density estimator appropriately.
 * <br>
 * Besides these applications, estimators based on kernels may also be applicable in other
 * scenarios. Again, the choice of the bandwidth is crucial for the quality of these estimators.
 * <br>
 * This class provides some static methods for dealing with bandwidths for kernel estimators.
 * Since there are many choices of bandwidth, we refer for a more detailed coverage to
 * <br>[Sco92]: David W. Scott. Multivariate Density Estimation:
 * Theory, Practice, and Visualization. John Wiley & Sons. 1992
 * and <br>
 * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
 * Chapman & Hall. 1995. pages 60 ff.
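 * <br>
 * For reference, the standard definition reads as follows: given sample points X_1,...,X_n
 * and a bandwidth h &gt; 0, the kernel density estimator is
 * <pre>
 *   f_n(x) = 1/(n*h) * sum_{i=1,...,n} K( (x - X_i)/h )
 * </pre>
 * where K denotes the kernel function.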
 *
 * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelDensityEstimator
 * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelCDF
 * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction
 */
public class KernelBandwidths {

    /** indicates the use of {@link #oversmoothingRule(int,double,double,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int OVERSMOOTHING_RULE_1D = 0;

    /** indicates the use of {@link #normalScaleRule(int,double,double,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int THUMB_RULE_1D = 1;

    /** indicates the use of {@link #directNSPlugInRule2Stage(Object[],int,KernelFunction,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int DPI2_RULE_1D = 2;

    /** indicates the use of {@link #maximumLikelihoodCV(Object[],KernelFunction,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int MLCV_RULE = 3;

    /** indicates the use of {@link #adaBand(Object[],Iterator,Distance,int)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int ADABAND_RULE = 4;

    /** indicates the use of {@link #scottsRule(double[],int)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int SCOTTS_RULE = 5;

    /** Computes the bandwidth of a specified type for a given kernel estimator.
     * The kernel estimator relies on a sample and a kernel function with a known variance.
     * Note that {@link #ADABAND_RULE} is not handled by this method since it requires
     * additional arguments; use {@link #adaBand(Object[],Iterator,Distance,int)} directly instead.
     *
     * @param type bandwidth strategy
     * @param sample used sample to compute the estimation. Given as objects of type <tt>Number</tt>.
     * @param kf kernel function
     * @param var estimator for the spread of the data, e.g., the standard deviation
     * (the variance of the kernel function itself is obtained from <tt>kf</tt>)
     * @param min minimum of the data (currently not used by the supported strategies)
     * @param max maximum of the data (currently not used by the supported strategies)
     * @return bandwidth according to the chosen strategy
     * @throws IllegalArgumentException if no bandwidth rule of the given type is supported
     */
    public static double computeBandWidth1D(int type, Object[] sample, KernelFunction kf, double var, double min, double max) {
        switch (type) {
            case KernelBandwidths.OVERSMOOTHING_RULE_1D :
                return KernelBandwidths.oversmoothingRule(sample.length, kf, var);
            case KernelBandwidths.DPI2_RULE_1D :
                return KernelBandwidths.directNSPlugInRule2Stage(sample, sample.length, kf, var);
            case KernelBandwidths.MLCV_RULE :
                // uses the oversmoothed bandwidth as the candidate bandwidth for cross validation
                return KernelBandwidths.maximumLikelihoodCV(sample, kf, KernelBandwidths.oversmoothingRule(sample.length, kf, var));
            case KernelBandwidths.SCOTTS_RULE :
                return KernelBandwidths.scottsRule(new double[] { var }, sample.length)[0];
            case KernelBandwidths.THUMB_RULE_1D :
                return KernelBandwidths.normalScaleRule(sample.length, kf, var);
        }
        throw new IllegalArgumentException("No bandwidth rule of this type is supported!");
    }
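    /* Example (a minimal sketch; the sample values and the use of GaussianKernel
     * are illustrative, not prescribed by this class):
     *
     *   Object[] sample = new Object[] { new Double(0.7), new Double(1.2), new Double(2.1) };
     *   KernelFunction kf = new GaussianKernel();
     *   double stdDev = 0.7; // spread estimate of the data, e.g., the sample standard deviation
     *   double h = KernelBandwidths.computeBandWidth1D(
     *       KernelBandwidths.THUMB_RULE_1D, sample, kf, stdDev, 0.7, 2.1);
     */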
    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the oversmoothing (maximal smoothing) principle rule.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 60 ff.
     *
     * @param n size of the used sample
     * @param var variance of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param r R value (roughness) of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function},
     * i.e., R(f) = integral( f(x)^2 dx ). See [Sco92] page 281 (notation).
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return bandwidth computed with the oversmoothing rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#r()
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#variance()
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double oversmoothingRule(int n, double var, double r, double s) {
        // h_OS = s * ( 243 * R(K) / (35 * var(K) * n) )^(1/5)
        double re = 243.0 * r;
        re = re / (35.0 * var * n);
        re = Math.pow(re, 0.2);
        re = re * s;
        return re;
    }

    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the oversmoothing (maximal smoothing) principle rule.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 60 ff.
     *
     * @param n size of the used sample
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return a bandwidth computed with the oversmoothing rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double oversmoothingRule(int n, KernelFunction kf, double s) {
        return oversmoothingRule(n, kf.variance(), kf.r(), s);
    }

    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the normal scale rule, which is also known as the <tt>thumb rule</tt>.
     * For further information see
     * [Sco92]: David W. Scott. Multivariate Density Estimation:
     * Theory, Practice, and Visualization. John Wiley & Sons. 1992. pages 131 ff.
     *
     * @param n size of the used sample
     * @param var variance of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param r R value (roughness) of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function},
     * i.e., R(f) = integral( f(x)^2 dx ). See [Sco92] page 281 (notation).
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return a bandwidth computed with the normal scale rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#r()
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#variance()
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double normalScaleRule(int n, double var, double r, double s) {
        // h_NS = s * ( 8 * sqrt(pi) * R(K) / (3 * var(K) * n) )^(1/5)
        double re = 8.0 * Math.sqrt(Math.PI) * r;
        re = re / (3.0 * var * n);
        re = Math.pow(re, 0.2);
        re = re * s;
        return re;
    }
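    /* Worked example: for the Gaussian kernel, R(K) = 1/(2*sqrt(pi)) and var(K) = 1,
     * so the normal scale rule above reduces to the well-known rule of thumb
     *
     *   h = (4/(3*n))^(1/5) * s  ~  1.06 * s * n^(-1/5),
     *
     * while the oversmoothing rule yields h_OS ~ 1.144 * s * n^(-1/5).
     */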
    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the normal scale rule, which is also known as the <tt>thumb rule</tt>.
     * For further information see <br>
     * [Sco92]: David W. Scott. Multivariate Density Estimation:
     * Theory, Practice, and Visualization. John Wiley & Sons. 1992. pages 131 ff.
     *
     * @param n size of the used sample
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return a bandwidth computed with the normal scale rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double normalScaleRule(int n, KernelFunction kf, double s) {
        return normalScaleRule(n, kf.variance(), kf.r(), s);
    }

    /** Computes a bandwidth for each given sample value (used equally for each dimension)
     * as the distance to its quantil-th nearest neighbour within the given entirety of the data.
     * <br>
     * Based upon [DG01]: Carlotta Domeniconi, Dimitrios Gunopulos. An Efficient Approach for
     * Approximating Multi-dimensional Range Queries and Nearest Neighbor Classification in
     * Large Datasets. 2001.
     *
     * @param sample <tt>Object[]</tt> containing a sample with values of type <tt>double[]</tt>
     * @param data {@link java.util.Iterator iterator} delivering the entirety of the given sample
     * @param distance distance function
     * @param quantil the distance to the quantil-th nearest neighbour of a sample value is used as its bandwidth
     * @return the bandwidths (equal for each dimension) of the corresponding sample values
     */
    public static double[] adaBand(final Object[] sample, Iterator data, final Distance distance, int quantil) {
        double[] bandwidths = new double[sample.length];
        Heap[] heaps = new Heap[sample.length];
        double[] kmax = new double[sample.length];
        // one heap per sample value, holding the quantil nearest neighbours seen so far;
        // the comparator orders by descending distance to the sample value, so the
        // farthest stored neighbour is on top of the heap
        for (int i = 0; i < sample.length; i++) {
            final int j = i;
            (heaps[i] = new Heap(quantil, new Comparator() {
                Object reference = sample[j];
                public int compare(Object o1, Object o2) {
                    double dist = (distance.distance(reference, o1) - distance.distance(reference, o2));
                    return dist < 0 ? 1 : (dist > 0 ? -1 : 0);
                }
            })).open();
        }
        int count = -1;
        Object x = null;
        while (data.hasNext()) {
            x = data.next();
            count++;
            for (int i = 0; i < sample.length; i++) {
                if (count == 0) { // initialize kmax with the distance to the first data object
                    kmax[i] = distance.distance(sample[i], x);
                }
                if (count >= quantil) {
                    // heap is full: replace its top (the farthest stored neighbour) if x is closer
                    if (distance.distance(sample[i], x) < kmax[i])
                        heaps[i].replace(x);
                    kmax[i] = distance.distance(sample[i], heaps[i].peek());
                }
                else {
                    // heap not yet full: insert x and track the maximum distance seen so far
                    if (distance.distance(sample[i], x) > kmax[i])
                        kmax[i] = distance.distance(sample[i], x);
                    heaps[i].enqueue(x);
                }
            }
        }
        // the top of each heap is the quantil-th nearest neighbour; its distance gives the bandwidth
        for (int i = 0; i < sample.length; i++) {
            bandwidths[i] = distance.distance(sample[i], heaps[i].dequeue());
            heaps[i].close();
        }
        return bandwidths;
    }
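    /* Example (a sketch): per-sample bandwidths taken as the distance to the
     * 5th-nearest neighbour under an absolute-value distance on one-dimensional
     * data; the anonymous Distance implementation and the data cursor are
     * illustrative assumptions.
     *
     *   Distance distance = new Distance() {
     *       public double distance(Object o1, Object o2) {
     *           return Math.abs(((Number) o1).doubleValue() - ((Number) o2).doubleValue());
     *       }
     *   };
     *   double[] bandwidths = KernelBandwidths.adaBand(sample, data, distance, 5);
     */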
    /** Returns the bandwidths for a d-dimensional kernel estimator based upon
     * Scott's rule.
     * For further information see <br>
     * [Sco92]: David W. Scott. Multivariate Density Estimation:
     * Theory, Practice, and Visualization. John Wiley & Sons. 1992. pages 131 ff.
     *
     * @param stdDev standard deviations of the data in the corresponding dimensions
     * @param n sample size used for estimation
     * @return array of type <tt>double[]</tt> containing the bandwidths for the corresponding dimensions
     */
    public static double[] scottsRule(double[] stdDev, int n) {
        int d = stdDev.length;
        double[] r = new double[d];
        for (int i = 0; i < d; i++) {
            // h_i = stdDev_i * n^( -1/(d+4) )
            double p = (-1.0) / (d + 4);
            r[i] = stdDev[i] * Math.pow(n, p);
        }
        return r;
    }

    /** Computes the maximum likelihood cross validation score of a candidate bandwidth
     * for a one-dimensional kernel estimator; the bandwidth maximizing this score over
     * a set of candidates is the cross validation choice.
     * For further information see
     * [Har91]: W. Härdle. Smoothing Techniques: With Implementation in S. Springer. 1991.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param h candidate bandwidth for cross validation
     * @return the maximum likelihood cross validation score of <tt>h</tt>
     */
    public static double maximumLikelihoodCV(Object[] sample, KernelFunction kf, double h) {
        return maximumLikelihoodCV(sample, kf, new double[] { h })[0];
    }

    /** Computes the maximum likelihood cross validation scores of several candidate
     * bandwidths for a one-dimensional kernel estimator.
     * For further information see
     * [Har91]: W. Härdle. Smoothing Techniques: With Implementation in S. Springer. 1991.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param hs candidate bandwidths for cross validation
     * @return the maximum likelihood cross validation scores of the corresponding candidate bandwidths
     */
    public static double[] maximumLikelihoodCV(Object[] sample, KernelFunction kf, double[] hs) {
        double[] r = new double[hs.length];
        int n = sample.length;
        Arrays.fill(r, 0.0);
        for (int k = 0; k < r.length; k++) {
            for (int i = 0; i < n; i++) {
                double xi = ((Number) sample[i]).doubleValue();
                double score = 0.0;
                // leave-one-out sum over all kernels except the one centered at x_i
                for (int j = 0; j < n; j++) {
                    double xj = ((Number) sample[j]).doubleValue();
                    if (i != j)
                        score += kf.eval((xi - xj) / hs[k]);
                }
                // if a sample value x_i has no neighbours, i.e., K( (x_i-x_j)/h ) == 0 for all j,
                // treat it like an outlier (i.e., ignore it), otherwise log( score ) = -infinity occurs
                if (score > 0.0)
                    r[k] += Math.log(score);
                else
                    n--; // workaround if a sample value x_i has been treated as an outlier (negative bandwidths!)
            }
            r[k] = r[k] / n - Math.log((n - 1) * hs[k]);
        }
        return r;
    }
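    /* For reference, the score computed above for a candidate bandwidth h is the
     * leave-one-out log likelihood
     *
     *   CV(h) = 1/n * sum_{i=1,...,n} log( sum_{j != i} K( (x_i - x_j)/h ) ) - log( (n-1)*h ),
     *
     * i.e., the average log density of each sample value under the kernel estimator
     * built from the remaining sample values.
     */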
    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the two-stage direct plug-in rule, using a normal kernel function for
     * estimating the roughness of the plug-in values.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 71 ff.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param n size of the used sample
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return bandwidth computed with the two-stage direct plug-in rule
     */
    public static double directNSPlugInRule2Stage(Object[] sample, int n, KernelFunction kf, double s) {
        return directNSPlugInRule2Stage(sample, n, kf.variance(), kf.r(), s);
    }

    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the two-stage direct plug-in rule, using a normal kernel function for
     * estimating the roughness of the plug-in values.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 71 ff.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param n size of the used sample
     * @param var variance of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param r R value (roughness) of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function},
     * i.e., R(f) = integral( f(x)^2 dx ). See [Sco92] page 281 (notation).
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return bandwidth computed with the two-stage direct plug-in rule
     */
    public static double directNSPlugInRule2Stage(Object[] sample, int n, double var, double r, double s) {
        // stage 0: normal scale estimate of the functional psi_8 (phi_8_NS)
        double phi8 = 105.0 / (32.0 * Math.sqrt(Math.PI) * Math.pow(s, 9.0));
        // stage 1: bandwidth g1 for estimating psi_6
        double k6 = Kernels.normalDerivativeAt0(6);
        double g1 = (-2.0 * k6) / (Math.sqrt(var) * phi8 * n);
        g1 = Math.pow(g1, (1.0 / 9.0));
        // estimate psi_6 with the roughness estimator using g1
        double phi6 = Kernels.roughnessEstimator(g1, sample, 6);
        // stage 2: bandwidth g2 for estimating psi_4
        double k4 = Kernels.normalDerivativeAt0(4);
        double g2 = (-2.0 * k4) / (Math.sqrt(var) * phi6 * n);
        g2 = Math.pow(g2, (1.0 / 7.0));
        // estimate psi_4 with the roughness estimator using g2
        double phi4 = Kernels.roughnessEstimator(g2, sample, 4);
        // plug the estimate of psi_4 into the bandwidth formula
        double re = r / (var * n * phi4);
        re = Math.pow(re, 0.2);
        return re;
    }

    /**
     * The default constructor has private access in order to ensure
     * non-instantiability.
     */
    private KernelBandwidths() {}
}