/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.math.statistics.nonparametric.kernels; import java.util.Arrays; import java.util.List; import xxl.core.functions.AbstractFunction; import xxl.core.functions.Function; import xxl.core.math.Statistics; /** * This class provides an implementation of a kernel based estimator for * a <tt>cumulative distribution function (cdf)</tt>. * based upon a simple random sample * for data given by objects of type <tt>Number</tt>. * The estimator is updatable, so one can use it as an aggregate function * combined with a special {@link xxl.core.cursors.mappers.Aggregator aggregation iterator}. * <br> * This class implements a special kind of kernel cdf estimators using a hybrid * technique first performed by Blohsfeld et. al. in <BR> * [BKS99]: Bjoern Blohsfeld, Dieter Korus, Bernhard Seeger: * A Comparison of Selectivity Estimators for Range Queries on Metric Attributes. * SIGMOD Conference 1999: 239-250. * <BR> * Generally, a set of separate kernel based cdf estimators is given whereas each one * is assigned to a partial interval respectively bin. The bins in turn build a * partition of the x-Axis. There, the borders should correspond to change points of the * estimated pdf. * These change points can be estimated, if they are not a priori known. * The estimators of the bins are kernel cdf estimators * using reflection as a boundary treatment. See * {@link xxl.core.math.statistics.nonparametric.kernels.ReflectionKernelCDF ReflectionKernelCDFEstimator} * for details about using reflection. <BR> * <br> * The estimator is implemented as a one-dimensional * {@link xxl.core.functions.Function function} expecting * objects of type <tt>Number</tt> as well as a * {@link xxl.core.math.functions.RealFunction RealFunction} expecting * data of type <tt>double</tt>. * A sample of any kind of data given by an iterator could * be easily obtained through the {@link xxl.core.cursors.mappers.ReservoirSampler reservoir sampler * cursor}. * * @see xxl.core.cursors.mappers.Aggregator * @see java.util.Iterator * @see xxl.core.functions.Function * @see xxl.core.math.statistics.nonparametric.kernels.KernelFunction * @see xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelDensityEstimator * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator * @see xxl.core.math.statistics.nonparametric.kernels.ReflectionKernelDensityEstimator * @see xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator#FACTORY * @see xxl.core.math.statistics.nonparametric.kernels.ReflectionKernelDensityEstimator#FACTORY */ public class HybridKernelCDF extends AbstractKernelCDF { /** Provides a factory for a * {@link xxl.core.math.statistics.nonparametric.kernels.HybridKernelCDF hybrid kernel distribution estimator}. * The parameters needed for construction are passed to the factory by an * <tt>Object[]</tt> <code>o</code> containing: <BR> * <code>o[0]</code>: used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function} <BR> * <code>o[1]</code>: used sample as <tt>Object[]</tt> containing numerical data <BR> * <code>o[2]</code>: indicates the type of the used bandwidths as Object of type <tt>Integer</tt> <BR> * <code>o[3]</code>: minimum of the data space as Object of type <tt>Number</tt><BR> * <code>o[4]</code>: maximum of the data space as Object of type <tt>Number</tt><BR> * <code>o[5]</code>: used bin borders to build up the estimator given as Objects of type <tt>Number</tt><BR> */ public static Function FACTORY = new AbstractFunction<Object,HybridKernelCDF>() { public HybridKernelCDF invoke(List<? extends Object> list) { int h = KernelBandwidths.THUMB_RULE_1D; try { h = ((Integer) list.get(2)).intValue(); } catch (ClassCastException cce) {} // return new HybridKernelCDF((KernelFunction) list.get(0), // kernel function (Object[]) list.get(1), // sample h, // type of the used bandwidths ((Number) list.get(3)).doubleValue(), // minimum ((Number) list.get(4)).doubleValue(), // maximum (Object[]) list.get(5) // jump points/change points/bin borders ); } }; /** minimum of the data (estimated or computed), i.e., left border of the data*/ protected double min; /** maximum of the data (estimated or computed), i.e., right border of the data */ protected double max; /** border points of the used partitioning of type <tt>Number</tt>, * so called change points */ protected Object[] changePoints; /** bandwidth for every bin. If set to <tt>null</tt> automatic smoothing will be performed */ protected double[] hs; /** functions estimating the assigned bin */ protected Function[] binEstimators; /** factory delivering the estimators used for the bins */ protected Function cdfEstimatorFactory; /** index of last used estimator */ protected int lastUsedFunction = -1; /** sample assigned to the enclosing bins */ protected Object[] binSamples; /** indicates whether the bandwidths are computed based upon the normal scale rule * using the sample. If set to false, the bandwidths needed to be given by the user */ protected boolean automaticBandwidthMode = false; /** used type of bandwidth */ protected int bandwidthType = -1; /** * Constructs an estimator for a cdf using the given * kernel function by defining separate estimators for a well-defined partition * of the x-axis. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>Number</tt>. * @param bandwidthType type of bandwidth used in automatic mode * {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#normalScaleRule(int,double,double,double) normal scale rule} * will be performed * @param min left border of the data space * @param max right border of the data space * @param changePoints points where to "break" the sample * @param cdfEstimatorFactory a factory delivering kernel estimators used for the bins */ public HybridKernelCDF( KernelFunction kf, Object[] sample, int bandwidthType, double min, double max, Object[] changePoints, Function cdfEstimatorFactory) { super(kf, sample, 0.0); this.min = min; this.max = max; hs = null; this.bandwidthType = bandwidthType; automaticBandwidthMode = true; // this.changePoints = changePoints; Arrays.sort(this.changePoints); this.cdfEstimatorFactory = cdfEstimatorFactory; hasChanged = true; } /** * Constructs an estimator for a cdf using the given * kernel function by defining separate estimators for a well-defined partition * of the x-axis. The factory delivers reflection kernel based cdf's. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>Number</tt>. * @param bandwidthType type of bandwidth used in automatic mode * {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#normalScaleRule(int,double,double,double) normal scale rule} * will be performed. * @param min left border of the data space * @param max right border of the data space * @param changePoints points where to "break" the sample */ public HybridKernelCDF( KernelFunction kf, Object[] sample, int bandwidthType, double min, double max, Object[] changePoints) { this(kf, sample, bandwidthType, min, max, changePoints, ReflectionKernelCDF.FACTORY); } /** * Constructs an estimator for a cdf using the given * kernel function by defining separate estimators for a well-defined partition * of the x-axis. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>Number</tt>. * @param hs used bandwidths for the bins. If set to <tt>null</tt> automatic smoothing using the * {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#normalScaleRule(int,double,double,double) normal scale rule} * will be performed. * @param min left border of the data space * @param max right border of the data space * @param changePoints points where to "break" the sample * @param cdfEstimatorFactory a factory delivering kernel estimators used for the bins */ public HybridKernelCDF( KernelFunction kf, Object[] sample, double[] hs, double min, double max, Object[] changePoints, Function cdfEstimatorFactory) { super(kf, sample, 0.0); this.min = min; this.max = max; this.hs = hs; automaticBandwidthMode = false; this.changePoints = changePoints; Arrays.sort(this.changePoints); this.cdfEstimatorFactory = cdfEstimatorFactory; hasChanged = true; } /** * Constructs an estimator for a cdf using the given * kernel function by defining separate estimators for a well-defined partition * of the x-axis. * * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction Kernel function}. * @param sample sample of a data set given as <tt>Object[]</tt> containing * objects of type <tt>Number</tt>. * @param hs used bandwidths for the bins. If set to <tt>null</tt> automatic smoothing using the * {@link xxl.core.math.statistics.nonparametric.kernels.KernelBandwidths#normalScaleRule(int,double,double,double) normal scale rule} * will be performed. * @param min left border of the data space * @param max right border of the data space * @param changePoints points where to "break" the sample rsp. where to bound the bins */ public HybridKernelCDF( KernelFunction kf, Object[] sample, double[] hs, double min, double max, Object[] changePoints) { this(kf, sample, hs, min, max, changePoints, ReflectionKernelCDF.FACTORY); } /** Evaluates the kernel based density estimator at given point x. * * @param x argument where to evaluate the cdf estimation * @return value of the estimated density at x */ protected double evalKDE(double x) { if (x < min) return 0.0; // return zero because function is bounded if (x > max) return 1.0; // return 1 because function is bounded // else double r = 0.0; // finding the "right" bin int bin = 0; if (x > ((Number) changePoints[changePoints.length - 1]).doubleValue()) { // on the right bin = changePoints.length; } else { for (int i = 0; i < changePoints.length - 1; i++) { if ((x > ((Number) changePoints[i]).doubleValue()) & (x <= ((Number) changePoints[i + 1]).doubleValue())) { bin = i + 1; break; } } } // found the bin for (int i = 0; i <= bin - 1; i++) r += ((Object[]) binSamples[i]).length; r += ((Object[]) binSamples[bin]).length * ((Number) binEstimators[bin].invoke(new Double(x))).doubleValue(); // --- return r / sample.length; } /** Returns an estimation of the window (interval) given. * * @param left left border of the window * @param right right border of the window * @return an estimation of the given window * @throws IllegalArgumentException invalid arguments */ public double windowQuery(Object left, Object right) throws IllegalArgumentException { double a = ((Number) left).doubleValue(); double b = ((Number) right).doubleValue(); if (b <= a) throw new IllegalArgumentException("Invalid window query given!"); return eval(b) - eval(a); } /** Sets the new bounds ,e.g., lower and upper border of the given data or the treated domain space. * * @param newMin new lower border * @param newMax new upper border */ public void setBounds(double newMin, double newMax) { min = newMin; max = newMax; hasChanged = true; } /** Sets the new change points ,e.g., the data points determining the bins. * If the number of used change points needs to be changed do it first, than * give the new bandwidths or set to automatic. * * @param cp new change points used for partitioning */ public void setChangePoints(final Object[] cp) { changePoints = cp; Arrays.sort(changePoints); hasChanged = true; } /** Sets the new used bandwidths. If the number of used change points needs to be changed, * the new change points must be given BEFORE updating the bandwidths or an * {@link java.lang.IllegalArgumentException IllegalArgumentException} occurs. * The given bandwidths are used in the given order corresponding to the ascending * ordered bins. * * @param newBandwidths new bandwidths to use. If set to <tt>null</tt> automatic smoothing is enabled. * @throws IllegalArgumentException if the number of bandwidths does not correspond to the number of bins used for estimation * (the number of change points +1) */ public void setBandwidths(double[] newBandwidths) throws IllegalArgumentException { if (newBandwidths == null) { automaticBandwidthMode = true; return; } else { if (newBandwidths.length != changePoints.length + 1) throw new IllegalArgumentException("wrong number of bandwidths given!"); automaticBandwidthMode = false; hs = newBandwidths; //hasChanged = true; } } /** Switches to automatic bandwidth mode and sets the used strategy to obtain * the bandwidths needed. * * @param newBandwidthType new strategy for computing the bandwidths */ public void setBandwidthType(int newBandwidthType) { hs = null; automaticBandwidthMode = true; bandwidthType = newBandwidthType; hasChanged = true; } /** Reinitilizes the estimator after changes */ protected void reinit() { // splitting the sample // building from change points and borders a new Object[] Object[] borders = new Object[changePoints.length + 2]; borders[0] = new Double(min); borders[borders.length - 1] = new Double(max); // if (changePoints.length == 0) { binSamples = new Object[] { sample }; } else { System.arraycopy(changePoints, 0, borders, 1, changePoints.length); // split the data with the borders binSamples = HybridKernelDensityEstimator.split(sample, borders); } // automatic bandwidth mode? -> computed the variances and the bandwidth for each splitted sample if (automaticBandwidthMode) { hs = new double[binSamples.length]; for (int i = 0; i < hs.length; i++) { double var = Statistics.variance((Object[]) binSamples[i]); hs[i] = KernelBandwidths.computeBandWidth1D(bandwidthType, (Object[]) binSamples[i], kf, var, 0.0, 0.0); if ((h == Double.NaN) && (h == 0.0)) { throw new IllegalArgumentException("The combination of jump points (bin borders) and sample doesn't work! Please check the number of samples for each bin and the strategy for computing the bandwidth!"); } } } // computing new bandwidth ??? if number of change points changed? // --> No, changed bandwidths needed to be given (from outer) // producing new estimators for the bins binEstimators = new Function[binSamples.length]; List<Object> parameters = null; for (int i = 0; i < binEstimators.length; i++) { parameters = Arrays.asList( kf, binSamples[i], new Double(hs[i]), borders[i], borders[i + 1] ); binEstimators[i] = (Function) cdfEstimatorFactory.invoke(parameters); } hasChanged = false; } }