/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.math.statistics.nonparametric.kernels; import java.util.Iterator; import xxl.core.functions.AbstractFunction; import xxl.core.util.Distance; import xxl.core.util.DoubleArrays; /** * This class provides a native implementation of a n-dimensional kernel based density estimator * based upon a sample of data given by objects of type <tt>double []</tt>. * This class implements a native kernel density estimator, meaning * no boundary treatment is used nor any other optimizations. * To every given n-dimensional sample value a bandwidth is needed. This bandwidth * is used for every dimension the kernel estimator will be evaluated at. Per default * the {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) adaBand} * rule is used * for computing the bandwidths, if they are not given otherwise. But one is able to give * otherwise computed bandwidths to this class. * For further details about the adaBand rule see <br> * [DG01]: Domeniconi, Carlotta. Gunopulos, Dimitios. An Efficent Approach for Approximation Multi-dimensional Range * Queries and Nearest Neighbor Classification in Large Datasets. 2001. * <br> * The estimator is updatable, so one can use it as an aggregate function * combined with a special aggregation iterator. But it is recommended <b>not</b> * to do so because recomputing the bandwidths could be quiet expensive, * especially if {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) adaBand} is * used for computation. <br> * The estimator is implemented as a * {@link xxl.core.functions.Function function} expecting * objects of type <tt>double []</tt>. <br> * A sample of any kind of data given by an iterator could * be easily obtained through the {@link xxl.core.cursors.mappers.ReservoirSampler reservoir sampler * cursor}. * * @see xxl.core.cursors.mappers.Aggregator * @see java.util.Iterator * @see xxl.core.functions.Function * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelDensityEstimator * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelCDF * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunctionND * @see xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths * @see xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) */ public class AdaBandKernelDensityEstimatorND extends AbstractFunction { /** used kernel function to estimate the density */ protected KernelFunctionND kf; /** used bandwidths for estimation according to the sample */ protected double[] h; /** used sample given as objects of type <tt>double []</tt> */ protected Object[] sample; /** indicates whether sample or bandwidths have changed */ protected boolean hasChanged = true; /** * Constructs an estimator for a n-dimensional density function using the given * n-dimensional {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunctionND kernel function}. * This class implements a native kernel density estimator, meaning * no boundary treatment is used nor any other optimizations. * Per default * the {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) adaBand} rule is used * for computing the bandwidths, if they are not given otherwise. But one is able to give * otherwise computed bandwidths to this class. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>double []</tt>. * @param h used bandwidths according to the used sample * @throws IllegalArgumentException if the number of bandwidths doesn't match the number of sample values */ public AdaBandKernelDensityEstimatorND(KernelFunctionND kf, Object[] sample, double[] h) throws IllegalArgumentException { this.kf = kf; this.sample = sample; this.h = h; } /** * Constructs an estimator for a n-dimensional density function using the given * n-dimensional {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunctionND kernel function}. * This class implements a native kernel density estimator, meaning * no boundary treatment is used nor any other optimizations. * Per default * the {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) adaBand} rule is used * for computing the bandwidths, if they are not given otherwise. * <br> * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>double []</tt>. * @param data data used for computing the bandwidths based upon adaBand * @param distance {@link xxl.core.util.Distance distance} used for computing the bandwidths based upon adaBand * (see {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) adaBand} for further details) * @param quantil parameter for computing the bandwidths based upon adaBand * (see {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#adaBand(Object[] sample, Iterator data, Distance distance, int quantil) adaBand} for further details) * @throws IllegalArgumentException if the number of bandwidths doesn't match the number of sample values */ public AdaBandKernelDensityEstimatorND( KernelFunctionND kf, Object[] sample, Iterator data, Distance distance, int quantil) { this(kf, sample, KernelBandwidths.adaBand(sample, data, distance, quantil)); } /** Evaluates the n-dimensional kernel based density estimator at a given point x represented by an * object of type <tt>double []</tt>. * * @param x argument where to evaluate the density estimation * @throws IllegalStateException if after calling {@link #setSample(Object[]) setSample(Object[])} or * calling {@link #setBandwidth(double[]) setBandwidth(double[])} the number of bandwidth values doesn't * match the number of sample values * @return value of the estimated density at x */ public Object invoke(Object x) throws IllegalStateException { if (hasChanged) { if (h.length != sample.length) throw new IllegalStateException("number of bandwidths doesn't match number of sample values!"); } double[] x0 = (double[]) x; double[] xi = null; double b = 0.0; int size = sample.length; double r = 0.0; for (int i = 0; i < size; i++) { xi = (double[]) sample[i]; // sample value X_i b = h[i]; // bandwidth for the sample value X_i // r = r + (kf.eval(DoubleArrays.mult(DoubleArrays.substract(x0, xi), (1.0 / b))) / b); } r = r / size; return new Double(r); } /** Sets a new sample. If the sample has changed, i.e. old_sample.equals (new_sample) * returns false, <tt>changed</tt> will be set true. * * @param newSample new sample to use */ public void setSample(Object[] newSample) { if (!newSample.equals(sample)) { hasChanged = true; sample = newSample; } } /** Sets the new bandwidths. If the bandwidths have changed, * <tt>changed</tt> will be set true. * * @param h new bandwidths to use */ public void setBandwidth(double[] h) { if (!(this.h == h)) { hasChanged = true; this.h = h; } } /** Returns the last used bandwidths for estimation. * * @return last used bandwidths */ public double[] getBandwidth() { return h; } /** Indicates whether something has changed. If so, a recomputation may become necessary. * * @return true, if anything has changed. */ public boolean hasChanged() { return hasChanged; } }