/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.math.statistics.nonparametric.kernels; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import xxl.core.comparators.ComparableComparator; import xxl.core.cursors.Cursor; import xxl.core.cursors.mappers.Aggregator; import xxl.core.functions.AbstractFunction; import xxl.core.functions.Function; import xxl.core.math.Maths; import xxl.core.math.functions.AggregationFunction; import xxl.core.math.statistics.parametric.aggregates.LastN; import xxl.core.math.statistics.parametric.aggregates.Maximum; import xxl.core.math.statistics.parametric.aggregates.Minimum; import xxl.core.math.statistics.parametric.aggregates.StatefulVarianceEstimator; /** In the context of online aggregation, running aggregates are built. Given an * iterator of data, an {@link xxl.core.cursors.mappers.Aggregator Aggregator} * computes iteratively aggregates. For instance, the current maximum * of the already processed data is determined. An internal aggregation function processes * the computation of the new element by consuming the old aggregate and the new element * from the input cursor. * * Generally, each aggregation function must support a function call of the following type:<br> * <tt>agg_n = f (agg_n-1, next)</tt>. <br> * There, <tt>agg_n</tt> denotes the computed aggregation value after <tt>n</tt> steps, * <tt>f</tt> represents the aggregation function, * <tt>agg_n-1</tt> the computed aggregation value after <tt>n-1</tt> steps * and <tt>next</tt> the next object to use for computation. * <br> * This class implements an aggregation function that computes kernel based estimators. There, * the data is processed in blocks of a predefined size. Given such a block of data, a kernel based * estimator is established. For determining parameters required for the computation of the * bandwidth, also the variance, the minimum and the maximum are iteratively computed. * Thus, given an input iterator, this kernel based block estimator aggregation function * computes a new kernel based estimator with the corresponding parameters for a new * data block. * <br> * Consider the following example that displays a concrete application of a * kernel based block estimator aggregation function combined with an aggregator: * <code><pre> Aggregator aggregator = new Aggregator( KernelBasedBlockEstimatorAggregationFunction.inputCursor(cursor, blockSize), new KernelBasedBlockEstimatorAggregationFunction( Function estimatorFactory, new BiweightKernel(), int bandwidthType)); * </pre></code> * * @see xxl.core.cursors.mappers.Aggregator * @see xxl.core.math.functions.AdaptiveAggregationFunction * @see xxl.core.math.statistics.nonparametric.kernels.AbstractKernelDensityEstimator * */ public class KernelBasedBlockEstimatorAggregationFunction extends AggregationFunction { /** indicates the return of the last n objects */ public static final int LASTN = 0; /** indicates the return of the current variance */ public static final int VARIANCE = 1; /** indicates the return of the current minimum */ public static final int MINIMUM = 2; /** indicates the return of the current maximum */ public static final int MAXIMUM = 3; /** Returns the current variance, maximum or minimum of the wrapped input iterator in accordance to the * defined type. * * @param type indicates whether variance, minimum or maximum has to be returned * @return current variance, maximum or minimum */ public static Function accessValue(final int type) { return new AbstractFunction() { public Object invoke(Object o) { return ((Object[]) o)[type]; } }; } /** Constructs a cursor that builds data blocks and online aggregates for a given iterator. * Namely, the variance, the minimum and the maximum * are iteratively computed respectively estimated. Generally, the aggregator delivers * data blocks of a predefined size and the estimated variance, minimum and maximum. * * @param input * @param blockSize * @return a cursor delivering Objects of type <code>Object[]</code> * containing:<BR> * 0. last seen n objects<BR> * 1. estimation of the variance of the whole data<BR> * 2. minimum of the previous data<BR> * 3. maximum of the previous data<BR> */ public static Cursor inputCursor(Iterator input, int blockSize) { return new Aggregator( input, Maths.multiDimAggregateFunction(new AggregationFunction[] { new LastN(blockSize), new StatefulVarianceEstimator(), new Minimum(), new Maximum()})); } /** indicates what type of bandwidth to use */ protected int bandwidthType; /** factory for kernel based estimators */ protected Function estimatorFactory; /** used kernel function for the estimators */ protected KernelFunction kf; /** internal counter to determine how many objects are processed */ protected int c; /** index of the last built estimator */ protected int last; /** indicates whether this instance is initialized */ protected boolean init; /** Constructs a KernelBasedBlockEstimatorAggregationFunction. The factory for * building the block estimators, the kernel function and the bandwith type are given. * * @param estimatorFactory factory for the estimators * @param kf used kernel function * @param bandwidthType used bandwidth type * */ public KernelBasedBlockEstimatorAggregationFunction( Function estimatorFactory, KernelFunction kf, int bandwidthType) { this.kf = kf; this.estimatorFactory = estimatorFactory; this.bandwidthType = bandwidthType; c = 0; last = 0; init = false; } /** Constructs a KernelBasedBlockEstimatorAggregationFunction. The factory for * building the block estimators is given. Concerning the kernel estimators * Biweigth kernel functions and the normal scale rule for the bandwidth are used. * * @param estimatorFactory factory for the estimators */ public KernelBasedBlockEstimatorAggregationFunction(Function estimatorFactory) { this(estimatorFactory, new BiweightKernel(), KernelBandwidths.THUMB_RULE_1D); } /** Constructs a KernelBasedBlockEstimatorAggregationFunction. The factory for * building the block estimators returns reflection kernel based block estimators. * Concerning the kernel estimators * Biweight kernel functions and the normal scale rule for the bandwidth are used. */ public KernelBasedBlockEstimatorAggregationFunction() { this(ReflectionKernelDensityEstimator.FACTORY, new BiweightKernel(), KernelBandwidths.THUMB_RULE_1D); } /** Two-figured function call for supporting aggregation by this function. * Each aggregation function must support a function call like <tt>agg_n = f (agg_n-1, next)</tt>, * where <tt>agg_n</tt> denotes the computed aggregation value after <tt>n</tt> steps, <tt>f</tt> * the aggregation function, <tt>agg_n-1</tt> the computed aggregation value after <tt>n-1</tt> steps * and <tt>next</tt> the next object to use for computation. * This method delivers only <tt>null</tt> as aggregation result as long as the aggregation * has not yet initialized. * As result of the aggregation a kernel based block estimator, that relies on the current block, is returned. * * @param old result of the aggregation function in the previous computation step * @param next next object used for computation * @return new kernel based block estimator */ public Object invoke(Object old,Object next) { // next[0] = sample, next[1] = Double-Object with variance, next[2] = min , next[3] = max c++; if (next == null) return null; List aggregate = (List)next; boolean build = false; // indicates whether a new function must be built or not // all needed aggregates fully initialized? Object[] block = (Object[]) aggregate.get(0); if (block == null) // if the block did not init, this functions also did not init return null; if (!init) { // building up first function (block != null, but no functions returned so far) last = c; // storing time build = true; // building up init = true; } else { int blockSize = block.length; if (c >= last + blockSize) { // new block last = c; // storing time build = true; // building up } } if (build) { double var = ((Number) aggregate.get(1)).doubleValue(); double min = ((Number) aggregate.get(2)).doubleValue(); double max = ((Number) aggregate.get(3)).doubleValue(); double h = KernelBandwidths.computeBandWidth1D(bandwidthType, block, kf, var, min, max); // --- copying and sorting block treated as sample ------- Object[] sample = new Object[block.length]; System.arraycopy(block, 0, sample, 0, block.length); Arrays.sort(sample, new ComparableComparator()); // --- building up parameter array for function factory ---- List<Object> parameters = new ArrayList<Object>(aggregate.size() + 1); parameters.add(kf); parameters.add(sample); parameters.add(new Double(h)); parameters.add(new Double(min)); parameters.add(new Double(max)); // Further computed aggregates just forwarding for (int i = 4; i < aggregate.size(); i++) parameters.add(aggregate.get(i)); return estimatorFactory.invoke(parameters); } else return old; } }