/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.math.statistics.nonparametric.kernels;

import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import xxl.core.collections.queues.Heap;
import xxl.core.util.Distance;

/**
 * One of the main problems of nonparametric statistics is the estimation of the
 * <tt>probability density function (pdf)</tt> of a random variable. The estimation typically
 * relies on a given sample, in our case data given by objects of type <tt>Number</tt>.
 * For density estimation, kernel based techniques are quite robust and useful.
 * There, a {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel}
 * is centered at each data point X_i, and the estimate is obtained as the normalized sum
 * over all kernels. The kernel density estimator inherits the basic properties of the
 * kernel function. Different choices for this function are possible, but a pdf is advisable
 * (in our case one restricted to [-1,1]). The most commonly used kernels are positive
 * and symmetric with a compact support, e.g., the
 * {@link xxl.core.math.statistics.nonparametric.kernels.EpanechnikowKernel Epanechnikow kernel}
 * or the {@link xxl.core.math.statistics.nonparametric.kernels.GaussianKernel Gaussian kernel}.
 * The choice of the bandwidth h, which controls the degree of smoothness, is crucial for the
 * quality of the estimation, and there is a trade-off in choosing it. If h is too large, the
 * resulting estimate is very smooth and may hide local features of the density. If h is too
 * small, the estimate becomes spiky and spurious artifacts may be introduced.
 * <br>
 * If the primitive of the kernel is known, this approach can be extended towards the estimation
 * of the <tt>cumulative distribution function (cdf)</tt> by simply integrating the kernel based
 * density estimator appropriately.
 * <br>
 * Besides these applications, estimators based on kernels may also be applicable in other
 * scenarios. Again, the choice of the bandwidth is crucial for the quality of these estimators.
 * <br>
 * This class provides some static methods for dealing with bandwidths for kernel estimators.
 * Since there are many choices of bandwidth, we refer for a more detailed coverage to
 * <br>[Sco92]: David W. Scott. Multivariate Density Estimation:
 * Theory, Practice, and Visualization. John Wiley & Sons. 1992
 * and <br>
 * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
 * Chapman & Hall. 1995. pages 60 ff.
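 * <br>
 * For reference, the standard definition reads as follows: given sample points X_1,...,X_n
 * and a bandwidth h &gt; 0, the kernel density estimator is
 * <pre>
 *   f_n(x) = 1/(n*h) * sum_{i=1,...,n} K( (x - X_i)/h )
 * </pre>
 * where K denotes the kernel function.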
 *
 * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelDensityEstimator
 * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelCDF
 * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction
 */
public class KernelBandwidths {

    /** indicates the use of {@link #oversmoothingRule(int,double,double,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int OVERSMOOTHING_RULE_1D = 0;

    /** indicates the use of {@link #normalScaleRule(int,double,double,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int THUMB_RULE_1D = 1;

    /** indicates the use of {@link #directNSPlugInRule2Stage(Object[],int,KernelFunction,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int DPI2_RULE_1D = 2;

    /** indicates the use of {@link #maximumLikelihoodCV(Object[],KernelFunction,double)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int MLCV_RULE = 3;

    /** indicates the use of {@link #adaBand(Object[],Iterator,Distance,int)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int ADABAND_RULE = 4;

    /** indicates the use of {@link #scottsRule(double[],int)} as bandwidth strategy
     * @see #computeBandWidth1D(int type,Object[] sample,KernelFunction kf,double var,double min,double max)
     */
    public static final int SCOTTS_RULE = 5;

    /** Computes the bandwidth of a specified type for a given kernel estimator.
     * The kernel estimator relies on a sample and a kernel function with a known variance.
     * Note that {@link #ADABAND_RULE} is not handled by this method since it requires
     * additional arguments; use {@link #adaBand(Object[],Iterator,Distance,int)} directly instead.
     *
     * @param type bandwidth strategy
     * @param sample used sample to compute the estimation. Given as objects of type <tt>Number</tt>.
     * @param kf kernel function
     * @param var estimator for the spread of the data, e.g., the standard deviation
     * (the variance of the kernel function itself is obtained from <tt>kf</tt>)
     * @param min minimum of the data (currently not used by the supported strategies)
     * @param max maximum of the data (currently not used by the supported strategies)
     * @return bandwidth according to the chosen strategy
     * @throws IllegalArgumentException if no bandwidth rule of the given type is supported
     */
    public static double computeBandWidth1D(int type, Object[] sample, KernelFunction kf, double var, double min, double max) {
        switch (type) {
            case KernelBandwidths.OVERSMOOTHING_RULE_1D :
                return KernelBandwidths.oversmoothingRule(sample.length, kf, var);
            case KernelBandwidths.DPI2_RULE_1D :
                return KernelBandwidths.directNSPlugInRule2Stage(sample, sample.length, kf, var);
            case KernelBandwidths.MLCV_RULE :
                // uses the oversmoothed bandwidth as the candidate bandwidth for cross validation
                return KernelBandwidths.maximumLikelihoodCV(sample, kf, KernelBandwidths.oversmoothingRule(sample.length, kf, var));
            case KernelBandwidths.SCOTTS_RULE :
                return KernelBandwidths.scottsRule(new double[] { var }, sample.length)[0];
            case KernelBandwidths.THUMB_RULE_1D :
                return KernelBandwidths.normalScaleRule(sample.length, kf, var);
        }
        throw new IllegalArgumentException("No bandwidth rule of this type is supported!");
    }
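    /* Example (a minimal sketch; the sample values and the use of GaussianKernel
     * are illustrative, not prescribed by this class):
     *
     *   Object[] sample = new Object[] { new Double(0.7), new Double(1.2), new Double(2.1) };
     *   KernelFunction kf = new GaussianKernel();
     *   double stdDev = 0.7; // spread estimate of the data, e.g., the sample standard deviation
     *   double h = KernelBandwidths.computeBandWidth1D(
     *       KernelBandwidths.THUMB_RULE_1D, sample, kf, stdDev, 0.7, 2.1);
     */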
    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the oversmoothing (maximal smoothing) principle rule.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 60 ff.
     *
     * @param n size of the used sample
     * @param var variance of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param r R value (roughness) of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function},
     * i.e., R(f) = integral( f(x)^2 dx ). See [Sco92] page 281 (notation).
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return bandwidth computed with the oversmoothing rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#r()
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#variance()
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double oversmoothingRule(int n, double var, double r, double s) {
        // h_OS = s * ( 243 * R(K) / (35 * var(K) * n) )^(1/5)
        double re = 243.0 * r;
        re = re / (35.0 * var * n);
        re = Math.pow(re, 0.2);
        re = re * s;
        return re;
    }

    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the oversmoothing (maximal smoothing) principle rule.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 60 ff.
     *
     * @param n size of the used sample
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return a bandwidth computed with the oversmoothing rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double oversmoothingRule(int n, KernelFunction kf, double s) {
        return oversmoothingRule(n, kf.variance(), kf.r(), s);
    }

    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the normal scale rule, which is also known as the <tt>thumb rule</tt>.
     * For further information see
     * [Sco92]: David W. Scott. Multivariate Density Estimation:
     * Theory, Practice, and Visualization. John Wiley & Sons. 1992. pages 131 ff.
     *
     * @param n size of the used sample
     * @param var variance of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param r R value (roughness) of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function},
     * i.e., R(f) = integral( f(x)^2 dx ). See [Sco92] page 281 (notation).
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return a bandwidth computed with the normal scale rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#r()
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction#variance()
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double normalScaleRule(int n, double var, double r, double s) {
        // h_NS = s * ( 8 * sqrt(pi) * R(K) / (3 * var(K) * n) )^(1/5)
        double re = 8.0 * Math.sqrt(Math.PI) * r;
        re = re / (3.0 * var * n);
        re = Math.pow(re, 0.2);
        re = re * s;
        return re;
    }
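    /* Worked example: for the Gaussian kernel, R(K) = 1/(2*sqrt(pi)) and var(K) = 1,
     * so the normal scale rule above reduces to the well-known rule of thumb
     *
     *   h = (4/(3*n))^(1/5) * s  ~  1.06 * s * n^(-1/5),
     *
     * while the oversmoothing rule yields h_OS ~ 1.144 * s * n^(-1/5).
     */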
    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the normal scale rule, which is also known as the <tt>thumb rule</tt>.
     * For further information see <br>
     * [Sco92]: David W. Scott. Multivariate Density Estimation:
     * Theory, Practice, and Visualization. John Wiley & Sons. 1992. pages 131 ff.
     *
     * @param n size of the used sample
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return a bandwidth computed with the normal scale rule
     *
     * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction
     * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator
     */
    public static double normalScaleRule(int n, KernelFunction kf, double s) {
        return normalScaleRule(n, kf.variance(), kf.r(), s);
    }

    /** Computes a bandwidth for each given sample value (used equally for each dimension)
     * as the distance to its quantil-th nearest neighbour within the given entirety of the data.
     * <br>
     * Based upon [DG01]: Carlotta Domeniconi, Dimitrios Gunopulos. An Efficient Approach for
     * Approximating Multi-dimensional Range Queries and Nearest Neighbor Classification in
     * Large Datasets. 2001.
     *
     * @param sample <tt>Object[]</tt> containing a sample with values of type <tt>double[]</tt>
     * @param data {@link java.util.Iterator iterator} delivering the entirety of the given sample
     * @param distance distance function
     * @param quantil the distance to the quantil-th nearest neighbour of a sample value is used as its bandwidth
     * @return the bandwidths (equal for each dimension) of the corresponding sample values
     */
    public static double[] adaBand(final Object[] sample, Iterator data, final Distance distance, int quantil) {
        double[] bandwidths = new double[sample.length];
        Heap[] heaps = new Heap[sample.length];
        double[] kmax = new double[sample.length];
        // one heap per sample value, holding the quantil nearest neighbours seen so far;
        // the comparator orders by descending distance to the sample value, so the
        // farthest stored neighbour is on top of the heap
        for (int i = 0; i < sample.length; i++) {
            final int j = i;
            (heaps[i] = new Heap(quantil, new Comparator() {
                Object reference = sample[j];
                public int compare(Object o1, Object o2) {
                    double dist = (distance.distance(reference, o1) - distance.distance(reference, o2));
                    return dist < 0 ? 1 : (dist > 0 ? -1 : 0);
                }
            })).open();
        }
        int count = -1;
        Object x = null;
        while (data.hasNext()) {
            x = data.next();
            count++;
            for (int i = 0; i < sample.length; i++) {
                if (count == 0) { // initialize kmax with the distance to the first data object
                    kmax[i] = distance.distance(sample[i], x);
                }
                if (count >= quantil) {
                    // heap is full: replace its top (the farthest stored neighbour) if x is closer
                    if (distance.distance(sample[i], x) < kmax[i])
                        heaps[i].replace(x);
                    kmax[i] = distance.distance(sample[i], heaps[i].peek());
                }
                else {
                    // heap not yet full: insert x and track the maximum distance seen so far
                    if (distance.distance(sample[i], x) > kmax[i])
                        kmax[i] = distance.distance(sample[i], x);
                    heaps[i].enqueue(x);
                }
            }
        }
        // the top of each heap is the quantil-th nearest neighbour; its distance gives the bandwidth
        for (int i = 0; i < sample.length; i++) {
            bandwidths[i] = distance.distance(sample[i], heaps[i].dequeue());
            heaps[i].close();
        }
        return bandwidths;
    }
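    /* Example (a sketch): per-sample bandwidths taken as the distance to the
     * 5th-nearest neighbour under an absolute-value distance on one-dimensional
     * data; the anonymous Distance implementation and the data cursor are
     * illustrative assumptions.
     *
     *   Distance distance = new Distance() {
     *       public double distance(Object o1, Object o2) {
     *           return Math.abs(((Number) o1).doubleValue() - ((Number) o2).doubleValue());
     *       }
     *   };
     *   double[] bandwidths = KernelBandwidths.adaBand(sample, data, distance, 5);
     */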
    /** Returns the bandwidths for a d-dimensional kernel estimator based upon
     * Scott's rule.
     * For further information see <br>
     * [Sco92]: David W. Scott. Multivariate Density Estimation:
     * Theory, Practice, and Visualization. John Wiley & Sons. 1992. pages 131 ff.
     *
     * @param stdDev standard deviations of the data in the corresponding dimensions
     * @param n sample size used for estimation
     * @return array of type <tt>double[]</tt> containing the bandwidths for the corresponding dimensions
     */
    public static double[] scottsRule(double[] stdDev, int n) {
        int d = stdDev.length;
        double[] r = new double[d];
        for (int i = 0; i < d; i++) {
            // h_i = stdDev_i * n^( -1/(d+4) )
            double p = (-1.0) / (d + 4);
            r[i] = stdDev[i] * Math.pow(n, p);
        }
        return r;
    }

    /** Computes the maximum likelihood cross validation score of a candidate bandwidth
     * for a one-dimensional kernel estimator; the bandwidth maximizing this score over
     * a set of candidates is the cross validation choice.
     * For further information see
     * [Har91]: W. Härdle. Smoothing Techniques: With Implementation in S. Springer. 1991.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param h candidate bandwidth for cross validation
     * @return the maximum likelihood cross validation score of <tt>h</tt>
     */
    public static double maximumLikelihoodCV(Object[] sample, KernelFunction kf, double h) {
        return maximumLikelihoodCV(sample, kf, new double[] { h })[0];
    }

    /** Computes the maximum likelihood cross validation scores of several candidate
     * bandwidths for a one-dimensional kernel estimator.
     * For further information see
     * [Har91]: W. Härdle. Smoothing Techniques: With Implementation in S. Springer. 1991.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param hs candidate bandwidths for cross validation
     * @return the maximum likelihood cross validation scores of the corresponding candidate bandwidths
     */
    public static double[] maximumLikelihoodCV(Object[] sample, KernelFunction kf, double[] hs) {
        double[] r = new double[hs.length];
        int n = sample.length;
        Arrays.fill(r, 0.0);
        for (int k = 0; k < r.length; k++) {
            for (int i = 0; i < n; i++) {
                double xi = ((Number) sample[i]).doubleValue();
                double score = 0.0;
                // leave-one-out sum over all kernels except the one centered at x_i
                for (int j = 0; j < n; j++) {
                    double xj = ((Number) sample[j]).doubleValue();
                    if (i != j)
                        score += kf.eval((xi - xj) / hs[k]);
                }
                // if a sample value x_i has no neighbours, i.e., K( (x_i-x_j)/h ) == 0 for all j,
                // treat it like an outlier (i.e., ignore it), otherwise log( score ) = -infinity occurs
                if (score > 0.0)
                    r[k] += Math.log(score);
                else
                    n--; // workaround if a sample value x_i has been treated as an outlier (negative bandwidths!)
            }
            r[k] = r[k] / n - Math.log((n - 1) * hs[k]);
        }
        return r;
    }
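    /* For reference, the score computed above for a candidate bandwidth h is the
     * leave-one-out log likelihood
     *
     *   CV(h) = 1/n * sum_{i=1,...,n} log( sum_{j != i} K( (x_i - x_j)/h ) ) - log( (n-1)*h ),
     *
     * i.e., the average log density of each sample value under the kernel estimator
     * built from the remaining sample values.
     */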
    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the two-stage direct plug-in rule, using a normal kernel function for
     * estimating the roughness of the plug-in values.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 71 ff.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param n size of the used sample
     * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return bandwidth computed with the two-stage direct plug-in rule
     */
    public static double directNSPlugInRule2Stage(Object[] sample, int n, KernelFunction kf, double s) {
        return directNSPlugInRule2Stage(sample, n, kf.variance(), kf.r(), s);
    }

    /** Computes a one-dimensional bandwidth for a kernel estimator
     * based upon the two-stage direct plug-in rule, using a normal kernel function for
     * estimating the roughness of the plug-in values.
     * For further information see
     * [WJ95]: M.P. Wand and M.C. Jones. Kernel Smoothing.
     * Chapman & Hall. 1995. pages 71 ff.
     *
     * @param sample used sample of the data given by objects of type <tt>Number</tt>
     * @param n size of the used sample
     * @param var variance of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
     * @param r R value (roughness) of the used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function},
     * i.e., R(f) = integral( f(x)^2 dx ). See [Sco92] page 281 (notation).
     * @param s estimator for the spread of the data, e.g., an estimator for the
     * standard deviation or the interquartile range of the data
     * @return bandwidth computed with the two-stage direct plug-in rule
     */
    public static double directNSPlugInRule2Stage(Object[] sample, int n, double var, double r, double s) {
        // stage 0: normal scale estimate of the functional psi_8 (phi_8_NS)
        double phi8 = 105.0 / (32.0 * Math.sqrt(Math.PI) * Math.pow(s, 9.0));
        // stage 1: bandwidth g1 for estimating psi_6
        double k6 = Kernels.normalDerivativeAt0(6);
        double g1 = (-2.0 * k6) / (Math.sqrt(var) * phi8 * n);
        g1 = Math.pow(g1, (1.0 / 9.0));
        // estimate psi_6 with the roughness estimator using g1
        double phi6 = Kernels.roughnessEstimator(g1, sample, 6);
        // stage 2: bandwidth g2 for estimating psi_4
        double k4 = Kernels.normalDerivativeAt0(4);
        double g2 = (-2.0 * k4) / (Math.sqrt(var) * phi6 * n);
        g2 = Math.pow(g2, (1.0 / 7.0));
        // estimate psi_4 with the roughness estimator using g2
        double phi4 = Kernels.roughnessEstimator(g2, sample, 4);
        // plug the estimate of psi_4 into the bandwidth formula
        double re = r / (var * n * phi4);
        re = Math.pow(re, 0.2);
        return re;
    }

    /**
     * The default constructor has private access in order to ensure
     * non-instantiability.
     */
    private KernelBandwidths() {}
}