/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.

    http://code.google.com/p/xxl/

*/

package xxl.core.math.statistics.nonparametric.aggregates;

import java.util.Iterator;
import java.util.List;

import xxl.core.cursors.mappers.Aggregator;
import xxl.core.cursors.mappers.ReservoirSampler;
import xxl.core.functions.Constant;
import xxl.core.functions.Function;
import xxl.core.math.Maths;
import xxl.core.math.functions.AdaptiveAggregationFunction;
import xxl.core.math.functions.AggregationFunction;
import xxl.core.math.functions.RealFunction;
import xxl.core.math.functions.SplineCompressedFunctionAggregateFunction;
import xxl.core.math.statistics.nonparametric.EmpiricalCDF;
import xxl.core.math.statistics.nonparametric.kernels.KernelBasedBlockEstimatorAggregationFunction;
import xxl.core.math.statistics.nonparametric.kernels.KernelFunction;
import xxl.core.math.statistics.parametric.aggregates.LastN;
import xxl.core.math.statistics.parametric.aggregates.Maximum;
import xxl.core.math.statistics.parametric.aggregates.Minimum;
import xxl.core.math.statistics.parametric.aggregates.ReservoirSample;
import xxl.core.math.statistics.parametric.aggregates.StatefulVariance;
import xxl.core.predicates.EveryNth;

/**
 * Static factory methods demonstrating how aggregation functions and online
 * aggregation functions of higher order can be combined, i.e., aggregation
 * functions that consume the results of other aggregation functions. For more
 * complex applications, ready-made pipelines for kernel based estimators are
 * provided.
 * <p>
 * Most methods follow the same two-stage scheme: an inner
 * {@link xxl.core.cursors.mappers.Aggregator aggregator} maintains basic
 * statistics of the input (a sample, the variance, optionally the extrema),
 * and an outer aggregator turns each tuple of these statistics into an
 * estimator.
 *
 * @see xxl.core.cursors.mappers.Aggregator
 * @see xxl.core.math.functions.AdaptiveAggregationFunction
 * @see xxl.core.cursors.mappers.ReservoirSampler
 * @see xxl.core.math.functions.AdaptiveWeightFunctions
 * @see xxl.core.math.functions.SplineCompressedFunctionAggregateFunction
 * @see xxl.core.math.statistics.nonparametric.aggregates
 * @see xxl.core.math.statistics.nonparametric.kernels
 */
public class Aggregators {

	/**
	 * The default constructor has private access in order to ensure
	 * non-instantiability.
	 */
	private Aggregators() {}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering a
	 * {@link xxl.core.math.statistics.nonparametric.kernels.NativeKernelDensityEstimator native kernel density estimator}
	 * as result of the aggregation.
	 * <p>
	 * While the input is consumed, a sample of the previously seen data
	 * (see {@link #mapSamplingStrategy(int, int)}) and a
	 * {@link xxl.core.math.statistics.parametric.aggregates.StatefulVariance variance}
	 * estimate are maintained by an inner aggregator. Each tuple delivered by
	 * that inner aggregator is handed to an <tt>NKDEAggregateFunction</tt>
	 * created with the given kernel function and bandwidth type.
	 *
	 * @param input data used to obtain an estimation of the pdf
	 * @param kf used kernel function to obtain an estimator
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @param bandwidthType used bandwidth strategy
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering density estimators
	 *         based on an input iterator
	 */
	public static Aggregator getNKDEAggregator(
		Iterator input,
		KernelFunction kf,
		int sampleSize,
		int samplingType,
		int bandwidthType) throws IllegalArgumentException {
		return new Aggregator(
			sampleWithVariance(input, sampleSize, samplingType),
			new NKDEAggregateFunction(kf, bandwidthType)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering a
	 * {@link xxl.core.math.statistics.nonparametric.kernels.NativeKernelCDF native kernel cdf}
	 * as result of the aggregation.
	 * <p>
	 * While the input is consumed, a sample of the previously seen data
	 * (see {@link #mapSamplingStrategy(int, int)}) and a
	 * {@link xxl.core.math.statistics.parametric.aggregates.StatefulVariance variance}
	 * estimate are maintained by an inner aggregator. Each tuple delivered by
	 * that inner aggregator is handed to a
	 * <tt>NativeKernelCDFAggregateFunction</tt> created with the given kernel
	 * function and bandwidth type.
	 *
	 * @param input data used to obtain an estimation
	 * @param kf used kernel function to obtain an estimator
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @param bandwidthType used bandwidth strategy
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering native kernel cdf's
	 *         based on an input iterator
	 */
	public static Aggregator getNKCDFAggregator(
		Iterator input,
		KernelFunction kf,
		int sampleSize,
		int samplingType,
		int bandwidthType) throws IllegalArgumentException {
		return new Aggregator(
			sampleWithVariance(input, sampleSize, samplingType),
			new NativeKernelCDFAggregateFunction(kf, bandwidthType)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering a
	 * {@link xxl.core.math.statistics.nonparametric.kernels.ReflectionKernelDensityEstimator reflection kernel density estimator}
	 * as result of the aggregation.
	 * <p>
	 * While the input is consumed, a sample of the previously seen data
	 * (see {@link #mapSamplingStrategy(int, int)}), a
	 * {@link xxl.core.math.statistics.parametric.aggregates.StatefulVariance variance}
	 * estimate, the {@link xxl.core.math.statistics.parametric.aggregates.Minimum minimum}
	 * and the {@link xxl.core.math.statistics.parametric.aggregates.Maximum maximum}
	 * are maintained by an inner aggregator. Each tuple delivered by that
	 * inner aggregator is handed to an <tt>RKDEAggregateFunction</tt> created
	 * with the given kernel function and bandwidth type.
	 *
	 * @param input data used to obtain an estimation
	 * @param kf used kernel function to obtain an estimator
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @param bandwidthType used bandwidth strategy
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering reflection kernel
	 *         density estimators based on an input iterator
	 */
	public static Aggregator getRKDEAggregator(
		Iterator input,
		KernelFunction kf,
		int sampleSize,
		int samplingType,
		int bandwidthType) throws IllegalArgumentException {
		return new Aggregator(
			sampleWithVarianceAndExtrema(input, sampleSize, samplingType),
			new RKDEAggregateFunction(kf, bandwidthType)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering a
	 * {@link xxl.core.math.statistics.nonparametric.kernels.ReflectionKernelCDF reflection kernel cdf}
	 * as result of the aggregation.
	 * <p>
	 * While the input is consumed, a sample of the previously seen data
	 * (see {@link #mapSamplingStrategy(int, int)}), a
	 * {@link xxl.core.math.statistics.parametric.aggregates.StatefulVariance variance}
	 * estimate, the {@link xxl.core.math.statistics.parametric.aggregates.Minimum minimum}
	 * and the {@link xxl.core.math.statistics.parametric.aggregates.Maximum maximum}
	 * are maintained by an inner aggregator. Each tuple delivered by that
	 * inner aggregator is handed to a
	 * <tt>ReflectionKernelCDFAggregateFunction</tt> created with the given
	 * kernel function and bandwidth type.
	 *
	 * @param input data used to obtain an estimation
	 * @param kf used kernel function to obtain an estimator
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @param bandwidthType used bandwidth strategy
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering reflection kernel cdf's
	 *         based on an input iterator
	 */
	public static Aggregator getRKCDFAggregator(
		Iterator input,
		KernelFunction kf,
		int sampleSize,
		int samplingType,
		int bandwidthType) throws IllegalArgumentException {
		return new Aggregator(
			sampleWithVarianceAndExtrema(input, sampleSize, samplingType),
			new ReflectionKernelCDFAggregateFunction(kf, bandwidthType)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering
	 * estimators built by a user-defined aggregation function.
	 * <p>
	 * While the input is consumed, a sample of the previously seen data
	 * (see {@link #mapSamplingStrategy(int, int)}), a
	 * {@link xxl.core.math.statistics.parametric.aggregates.StatefulVariance variance}
	 * estimate, the {@link xxl.core.math.statistics.parametric.aggregates.Minimum minimum}
	 * and the {@link xxl.core.math.statistics.parametric.aggregates.Maximum maximum}
	 * are maintained by an inner aggregator, in exactly this order. Each tuple
	 * delivered by that inner aggregator is handed to the given
	 * <tt>kernelBasedAggregateFunction</tt>, which therefore has to be able to
	 * consume such four-component tuples.
	 *
	 * @param input data used to obtain an estimation
	 * @param kernelBasedAggregateFunction aggregation function building an estimator from the
	 *        tuples (sample, variance, minimum, maximum)
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering the results of
	 *         the given aggregation function based on an input iterator
	 */
	public static Aggregator getReservoirBasedKernelEstimatorAggregator(
		Iterator input,
		AggregationFunction kernelBasedAggregateFunction,
		int sampleSize,
		int samplingType) throws IllegalArgumentException {
		return new Aggregator(
			sampleWithVarianceAndExtrema(input, sampleSize, samplingType),
			kernelBasedAggregateFunction
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering
	 * estimators as result of the aggregation. An iterator containing Objects
	 * of type <TT>Number</TT> is partitioned into blocks of a predefined size.
	 * <p>
	 * While consuming the iterator, separate kernel based estimators with a
	 * chosen bandwidth strategy are established for each block. In each step,
	 * after the new estimator is built, a convex linear combination of the
	 * 'old' and the 'new' estimator will be returned as the actual aggregation
	 * result. There exist different strategies for the weights of the
	 * estimators. They are provided in
	 * {@link xxl.core.math.functions.AdaptiveWeightFunctions AdaptiveWeightFunctions}.
	 * The construction of the aggregates according to the current step and
	 * weights is realized in
	 * {@link xxl.core.math.functions.AdaptiveAggregationFunction AdaptiveAggregationFunction}.
	 * <p>
	 * The input is first wrapped by
	 * <tt>KernelBasedBlockEstimatorAggregationFunction.inputCursor(input, blocksize)</tt>.
	 *
	 * @param factory factory delivering kernel based estimators for each block
	 * @param input data stream to build an adaptive estimator (must contain Objects of type <TT>Number</TT>)
	 * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
	 *        to build up the block based estimator
	 * @param bandwidthType type of bandwidth used by each block estimator
	 * @param blocksize size of each block performed separately and used to build an estimator
	 * @param weights RealFunction delivering weights used to combine the block estimators
	 * @param realMode indicates that the adaptive aggregation function assumes to combine Objects of type
	 *        {@link xxl.core.math.functions.RealFunction} instead of Objects of type
	 *        {@link xxl.core.functions.Function} consuming Objects of type <TT>Number</TT>.
	 * @throws IllegalArgumentException not thrown by this method directly; whether the components
	 *         constructed here throw it is not visible from this class
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering kernel based
	 *         estimators that are iteratively combined
	 */
	public static Aggregator getAdaptiveKernelBasedAggregator(
		Function factory,
		Iterator input,
		KernelFunction kf,
		int bandwidthType,
		int blocksize,
		RealFunction weights,
		boolean realMode) throws IllegalArgumentException {
		return new Aggregator(
			KernelBasedBlockEstimatorAggregationFunction.inputCursor(input, blocksize),
			new AdaptiveAggregationFunction(
				new KernelBasedBlockEstimatorAggregationFunction(factory, kf, bandwidthType),
				weights,
				realMode
			)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering
	 * kernel based estimators as result of the aggregation, compressed over a
	 * fixed, known interval. An iterator containing Objects of type
	 * <TT>Number</TT> is partitioned into blocks of a predefined size.
	 * <p>
	 * While consuming the iterator, separate kernel based estimators with a
	 * chosen bandwidth strategy are established for each block. In each step,
	 * after the new estimator is built, a convex linear combination of the
	 * 'old' and the 'new' estimator will be returned as the actual aggregation
	 * result. There exist different strategies for the weights of the
	 * estimators. They are provided in
	 * {@link xxl.core.math.functions.AdaptiveWeightFunctions AdaptiveWeightFunctions}.
	 * The construction of the aggregates according to the current step and
	 * weights is realized in
	 * {@link xxl.core.math.functions.AdaptiveAggregationFunction AdaptiveAggregationFunction}.
	 * <p>
	 * In regard to the limited memory resources, the current aggregate is
	 * compressed. Therefore, the
	 * {@link xxl.core.math.numerics.splines.CubicBezierSpline cubic Bezier-Spline interpolate}
	 * for the aggregate is computed from a predefined number of function
	 * values of the aggregate on an interval. The compression of a new
	 * aggregate is realized in
	 * {@link xxl.core.math.functions.SplineCompressedFunctionAggregateFunction},
	 * which is parameterized here with an
	 * {@link xxl.core.predicates.EveryNth EveryNth(blocksize)} predicate. The
	 * compression range, i.e., the interval <tt>[left, right]</tt>, has to be
	 * known in advance.
	 *
	 * @param factory factory delivering kernel based estimators for each block
	 * @param input data stream to build an adaptive estimator (must contain Objects of type <TT>Number</TT>)
	 * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
	 *        to build up the block based estimator
	 * @param bandwidthType type of bandwidth used by each block estimator
	 * @param blocksize size of each block performed separately and used to build up an estimator
	 * @param weights RealFunction delivering weights used to combine the block estimators
	 * @param left left border of the valid compression range
	 * @param right right border of the valid compression range
	 * @param n number of points in the compression interval
	 * @param realMode indicates that the adaptive aggregation function assumes to combine Objects of type
	 *        {@link xxl.core.math.functions.RealFunction} instead of Objects of type
	 *        {@link xxl.core.functions.Function} consuming Objects of type <TT>Number</TT>.
	 * @param cdfMode indicates spline is in cdf mode, i.e., evaluating the spline at x &gt; maximum
	 *        causes the spline to return 1.0 instead of 0.0
	 * @throws IllegalArgumentException not thrown by this method directly; whether the components
	 *         constructed here throw it is not visible from this class
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering kernel based
	 *         estimators that are iteratively combined and compressed in each aggregation step
	 */
	public static Aggregator getSplineCompressedAdaptiveKernelBasedAggregator(
		Function factory,
		Iterator input,
		KernelFunction kf,
		int bandwidthType,
		int blocksize,
		RealFunction weights,
		double left,
		double right,
		int n,
		boolean realMode,
		boolean cdfMode) throws IllegalArgumentException {
		return new Aggregator(
			KernelBasedBlockEstimatorAggregationFunction.inputCursor(input, blocksize),
			new SplineCompressedFunctionAggregateFunction(
				new AdaptiveAggregationFunction(
					new KernelBasedBlockEstimatorAggregationFunction(factory, kf, bandwidthType),
					weights,
					realMode
				),
				new EveryNth(blocksize),
				left,
				right,
				n,
				cdfMode
			)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering
	 * kernel based estimators as result of the aggregation, compressed over an
	 * interval derived from the data. An iterator containing Objects of type
	 * <TT>Number</TT> is partitioned into blocks of a predefined size.
	 * <p>
	 * While consuming the iterator, separate kernel based estimators with a
	 * chosen bandwidth strategy are established for each block. In each step,
	 * after the new estimator is built, a convex linear combination of the
	 * 'old' and the 'new' estimator will be returned as the actual aggregation
	 * result. There exist different strategies for the weights of the
	 * estimators. They are provided in
	 * {@link xxl.core.math.functions.AdaptiveWeightFunctions AdaptiveWeightFunctions}.
	 * The construction of the aggregates according to the current step and
	 * weights is realized in
	 * {@link xxl.core.math.functions.AdaptiveAggregationFunction AdaptiveAggregationFunction}.
	 * <p>
	 * In regard to the limited memory resources, the current aggregate is
	 * compressed. Therefore, the
	 * {@link xxl.core.math.numerics.splines.CubicBezierSpline cubic Bezier-Spline interpolate}
	 * for the aggregate is computed from a predefined number of function
	 * values of the aggregate on an interval. The compression of a new
	 * aggregate is realized in
	 * {@link xxl.core.math.functions.SplineCompressedFunctionAggregateFunction}.
	 * In contrast to the overload taking <tt>left</tt> and <tt>right</tt>, the
	 * compression range relies on the current extrema: the borders are
	 * supplied by the functions
	 * <tt>KernelBasedBlockEstimatorAggregationFunction.accessValue(MINIMUM)</tt>
	 * and <tt>accessValue(MAXIMUM)</tt>, and the number of points by a
	 * {@link xxl.core.functions.Constant constant} function wrapping
	 * <tt>n</tt> as a <tt>Double</tt>.
	 *
	 * @param factory factory delivering kernel based estimators for each block
	 * @param input data stream to build an adaptive estimator (must contain Objects of type <TT>Number</TT>)
	 * @param kf used {@link xxl.core.math.statistics.nonparametric.kernels.KernelFunction kernel function}
	 *        to build up the block based estimator
	 * @param bandwidthType type of bandwidth used by each block estimator
	 * @param blocksize size of each block performed separately and used to build up an estimator
	 * @param weights RealFunction delivering weights used to combine the block estimators
	 * @param n number of points in the compression interval
	 * @param realMode indicates that the adaptive aggregation function assumes to combine Objects of type
	 *        {@link xxl.core.math.functions.RealFunction} instead of Objects of type
	 *        {@link xxl.core.functions.Function} consuming Objects of type <TT>Number</TT>.
	 * @param cdfMode indicates spline is in cdf mode, i.e., evaluating the spline at x &gt; maximum
	 *        causes the spline to return 1.0 instead of 0.0
	 * @throws IllegalArgumentException not thrown by this method directly; whether the components
	 *         constructed here throw it is not visible from this class
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering kernel based
	 *         estimators that are iteratively combined and compressed in each aggregation step
	 */
	public static Aggregator getSplineCompressedAdaptiveKernelBasedAggregator(
		Function factory,
		Iterator input,
		KernelFunction kf,
		int bandwidthType,
		int blocksize,
		RealFunction weights,
		int n,
		boolean realMode,
		boolean cdfMode) throws IllegalArgumentException {
		return new Aggregator(
			KernelBasedBlockEstimatorAggregationFunction.inputCursor(input, blocksize),
			new SplineCompressedFunctionAggregateFunction(
				new AdaptiveAggregationFunction(
					new KernelBasedBlockEstimatorAggregationFunction(factory, kf, bandwidthType),
					weights,
					realMode
				),
				new EveryNth(blocksize),
				KernelBasedBlockEstimatorAggregationFunction.accessValue(
					KernelBasedBlockEstimatorAggregationFunction.MINIMUM),
				KernelBasedBlockEstimatorAggregationFunction.accessValue(
					KernelBasedBlockEstimatorAggregationFunction.MAXIMUM),
				// Double.valueOf replaces the deprecated boxing constructor new Double(n)
				new Constant(Double.valueOf(n)),
				cdfMode
			)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering
	 * estimators based on a factory as result of the aggregation, compressed
	 * over a fixed, known interval.
	 * <p>
	 * The input is consumed directly (it is not wrapped by a block cursor).
	 * Internally a counter is incremented for every element; whenever the
	 * counter is a multiple of <tt>blocksize</tt>, the current element is
	 * passed to <tt>factory</tt> and the result becomes the new block
	 * estimator, otherwise the previous one is kept. A <tt>null</tt> element
	 * yields <tt>null</tt> (the counter is still incremented). The elements
	 * delivered by <tt>input</tt> therefore have to be {@link java.util.List lists}
	 * that the factory accepts as its argument.
	 * <p>
	 * The block estimators are combined by an
	 * {@link xxl.core.math.functions.AdaptiveAggregationFunction AdaptiveAggregationFunction}
	 * using the given weights (see
	 * {@link xxl.core.math.functions.AdaptiveWeightFunctions AdaptiveWeightFunctions}
	 * for available strategies) and compressed by a
	 * {@link xxl.core.math.functions.SplineCompressedFunctionAggregateFunction}
	 * parameterized with an {@link xxl.core.predicates.EveryNth EveryNth(blocksize)}
	 * predicate. The compression range, i.e., the interval
	 * <tt>[left, right]</tt>, has to be known in advance.
	 *
	 * @param factory factory delivering estimators for each block
	 * @param input data stream to build an adaptive estimator
	 * @param blocksize size of each block performed separately and used to build up an estimator;
	 *        must not be 0, because block boundaries are detected with a modulo operation
	 *        (0 results in an <tt>ArithmeticException</tt> during aggregation)
	 * @param weights RealFunction delivering weights used to combine the block estimators
	 * @param left left border of valid compression range
	 * @param right right border of valid compression range
	 * @param n number of points in the compression interval
	 * @param realMode indicates that the adaptive aggregation function assumes to combine Objects of type
	 *        {@link xxl.core.math.functions.RealFunction} instead of Objects of type
	 *        {@link xxl.core.functions.Function} consuming Objects of type <TT>Number</TT>.
	 * @param cdfMode indicates spline is in cdf mode, i.e., evaluating the spline at x &gt; maximum
	 *        causes the spline to return 1.0 instead of 0.0
	 * @throws IllegalArgumentException not thrown by this method directly; whether the components
	 *         constructed here throw it is not visible from this class
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering estimators that
	 *         are iteratively combined and compressed in each aggregation step
	 */
	public static Aggregator getSplineCompressedAdaptiveAggregator(
		final Function factory,
		Iterator input,
		final int blocksize,
		RealFunction weights,
		double left,
		double right,
		int n,
		boolean realMode,
		boolean cdfMode) throws IllegalArgumentException {
		return new Aggregator(
			input,
			new SplineCompressedFunctionAggregateFunction(
				new AdaptiveAggregationFunction(
					new AggregationFunction<List,Object>() {
						// number of elements seen so far (null elements included)
						int c = 0;

						public Object invoke(Object old, List next) {
							c++;
							if (next == null)
								return null;
							// a new estimator is only built at block boundaries
							if (c % blocksize == 0)
								return factory.invoke(next);
							else
								return old;
						}
					},
					weights,
					realMode
				),
				new EveryNth(blocksize),
				left,
				right,
				n,
				cdfMode
			)
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering an
	 * {@link xxl.core.math.statistics.nonparametric.EmpiricalCDF empirical cdf}
	 * as result of the aggregation.
	 * <p>
	 * While the input is consumed, a sample of the previously seen data is
	 * maintained by an inner aggregator using the sampling function returned
	 * by {@link #mapSamplingStrategy(int, int)}. Each sample delivered by that
	 * inner aggregator is handed to an <tt>EmpiricalCDFAggregateFunction</tt>.
	 * No spread estimate (variance, extrema) is computed in this pipeline.
	 *
	 * @param input data used to obtain an estimation of the cdf
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering the empirical cdf
	 */
	public static Aggregator getEmpiricalCDFAggregator(Iterator input, int sampleSize, int samplingType)
		throws IllegalArgumentException {
		return new Aggregator(
			new Aggregator(input, mapSamplingStrategy(sampleSize, samplingType)),
			new EmpiricalCDFAggregateFunction()
		);
	}

	/**
	 * Returns an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering
	 * empirical cdf's as result of the aggregation, compressed over an
	 * interval derived from the data.
	 * <p>
	 * An inner aggregator maintains, in this order, the
	 * {@link xxl.core.math.statistics.parametric.aggregates.LastN last <tt>blocksize</tt> elements},
	 * the {@link xxl.core.math.statistics.parametric.aggregates.Minimum minimum} and the
	 * {@link xxl.core.math.statistics.parametric.aggregates.Maximum maximum}
	 * of the input. Internally a counter is incremented for every tuple
	 * delivered by the inner aggregator; whenever the counter is a multiple of
	 * <tt>blocksize</tt>, the first tuple component is passed to
	 * {@link xxl.core.math.statistics.nonparametric.EmpiricalCDF#FACTORY}
	 * and the result becomes the new block estimator, otherwise the previous
	 * one is kept. A <tt>null</tt> tuple yields <tt>null</tt> (the counter is
	 * still incremented).
	 * <p>
	 * The block estimators are combined by an
	 * {@link xxl.core.math.functions.AdaptiveAggregationFunction AdaptiveAggregationFunction}
	 * in real mode using the given weights (see
	 * {@link xxl.core.math.functions.AdaptiveWeightFunctions AdaptiveWeightFunctions}
	 * for available strategies) and compressed in cdf mode by a
	 * {@link xxl.core.math.functions.SplineCompressedFunctionAggregateFunction}
	 * parameterized with an {@link xxl.core.predicates.EveryNth EveryNth(blocksize)}
	 * predicate. The compression range relies on the current extrema: the
	 * borders are supplied by
	 * <tt>KernelBasedBlockEstimatorAggregationFunction.accessValue(1)</tt> and
	 * <tt>accessValue(2)</tt>, which presumably select the minimum and maximum
	 * components of the inner tuple.
	 *
	 * @param input data stream to build an adaptive estimator (must contain Objects of type <TT>Number</TT>)
	 * @param blocksize size of each block performed separately and used to build up an estimator;
	 *        must not be 0, because block boundaries are detected with a modulo operation
	 *        (0 results in an <tt>ArithmeticException</tt> during aggregation)
	 * @param weights RealFunction delivering weights used to combine the block estimators
	 * @param n number of points in the compression interval
	 * @throws IllegalArgumentException not thrown by this method directly; whether the components
	 *         constructed here throw it is not visible from this class
	 * @return an {@link xxl.core.cursors.mappers.Aggregator aggregator} delivering empirical cdf's that
	 *         are iteratively combined and compressed in each aggregation step
	 */
	public static Aggregator getSplineCompressedAdaptiveEmpiricalCDFAggregator(
		Iterator input,
		final int blocksize,
		RealFunction weights,
		int n) throws IllegalArgumentException {
		return new Aggregator(
			new Aggregator(
				input,
				Maths.multiDimAggregateFunction(new AggregationFunction[] {
					new LastN(blocksize),
					new Minimum(),
					new Maximum()
				})
			),
			new SplineCompressedFunctionAggregateFunction(
				new AdaptiveAggregationFunction(
					new AggregationFunction<List,Object>() {
						// number of tuples seen so far (null tuples included)
						int c = 0;

						public Object invoke(Object old, List next) {
							c++;
							if (next == null)
								return null;
							// component 0 holds the LastN result, i.e. the current block
							if (c % blocksize == 0)
								return EmpiricalCDF.FACTORY.invoke(next.get(0));
							else
								return old;
						}
					},
					weights,
					true // real mode
				),
				new EveryNth(blocksize),
				KernelBasedBlockEstimatorAggregationFunction.accessValue(1),
				KernelBasedBlockEstimatorAggregationFunction.accessValue(2),
				// Double.valueOf replaces the deprecated boxing constructor new Double(n)
				new Constant(Double.valueOf(n)),
				true // cdf mode
			)
		);
	}

	/**
	 * Maps one of the strategy constants declared in
	 * {@link xxl.core.cursors.mappers.ReservoirSampler} to a
	 * {@link xxl.core.math.statistics.parametric.aggregates.ReservoirSample reservoir sampling}
	 * aggregation function of the given sample size.
	 * <p>
	 * <tt>RTYPE</tt>, <tt>XTYPE</tt> and <tt>ZTYPE</tt> are supported;
	 * <tt>YTYPE</tt> is recognized but rejected.
	 *
	 * @param sampleSize size of the sample
	 * @param type type of the reservoir sampling strategy
	 * @throws IllegalArgumentException if an unknown or not supported strategy has been given
	 * @return an aggregation function representing a reservoir sampling strategy
	 */
	public static AggregationFunction mapSamplingStrategy(int sampleSize, int type) throws IllegalArgumentException {
		switch (type) {
			case ReservoirSampler.RTYPE :
				return new ReservoirSample(sampleSize, new ReservoirSample.RType(sampleSize));
			case ReservoirSampler.XTYPE :
				return new ReservoirSample(sampleSize, new ReservoirSample.XType(sampleSize));
			case ReservoirSampler.YTYPE :
				throw new IllegalArgumentException("Type y is not supported so far. See javadoc xxl.core.functions.ReservoirSample for details!");
			case ReservoirSampler.ZTYPE :
				return new ReservoirSample(sampleSize, new ReservoirSample.ZType(sampleSize));
			default :
				throw new IllegalArgumentException("unknown sampling strategy given!");
		}
	}

	/**
	 * Builds the first aggregation stage delivering tuples consisting of
	 * (sample, variance) for the given input.
	 *
	 * @param input data to aggregate
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an aggregator combining the sampling function and a stateful variance
	 */
	private static Aggregator sampleWithVariance(Iterator input, int sampleSize, int samplingType)
		throws IllegalArgumentException {
		return new Aggregator(
			input,
			Maths.multiDimAggregateFunction(new AggregationFunction[] {
				mapSamplingStrategy(sampleSize, samplingType),
				new StatefulVariance()
			})
		);
	}

	/**
	 * Builds the first aggregation stage delivering tuples consisting of
	 * (sample, variance, minimum, maximum) for the given input.
	 *
	 * @param input data to aggregate
	 * @param sampleSize used sample size
	 * @param samplingType used type of sampling
	 * @throws IllegalArgumentException if the given samplingType is not known or not supported
	 * @return an aggregator combining the sampling function, a stateful variance and the extrema
	 */
	private static Aggregator sampleWithVarianceAndExtrema(Iterator input, int sampleSize, int samplingType)
		throws IllegalArgumentException {
		return new Aggregator(
			input,
			Maths.multiDimAggregateFunction(new AggregationFunction[] {
				mapSamplingStrategy(sampleSize, samplingType),
				new StatefulVariance(),
				new Minimum(),
				new Maximum()
			})
		);
	}
}