/* XXL: The eXtensible and fleXible Library for data processing Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger Head of the Database Research Group Department of Mathematics and Computer Science University of Marburg Germany This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; If not, see <http://www.gnu.org/licenses/>. http://code.google.com/p/xxl/ */ package xxl.core.math.statistics.parametric.aggregates; import java.util.Random; import xxl.core.math.functions.AggregationFunction; import xxl.core.util.random.ContinuousRandomWrapper; import xxl.core.util.random.DiscreteRandomWrapper; import xxl.core.util.random.JavaContinuousRandomWrapper; import xxl.core.util.random.JavaDiscreteRandomWrapper; /** This class provides sampling algorithms for the usage with the * {@link xxl.core.cursors.mappers.Aggregator aggregator cursor}. * Anytime after initialization the returned array of objects * represents an <tt>iid</tt> (identically independent * distributed) random sample of the already consumed data. * Note that the size of the used entirety must not be known in advance. * Generally, there are four types of implementation available based on * <b>[Vit85]</b>: Jeffrey Scott Vitter, Random Sampling with a Reservoir, in * ACM Transactions on Mathematical Software, Vol. 11, NO. 1, March 1985, Pages 37-57. * The reservoir sampling strategy describes how the sample is drawn * from the given dataset. * In [Vit85] four types are intended: Type R, X, Y and Z. * <br> * <p>Performance of Algorithms R, X, Y, and Z</p> * <table width="100%" border="0"> * <tr> * <td width="14%"><b>Algorithm </b></td> * <td width="39%"><b>StatefulAverage number of uniform random variates</b></td> * <td width="47%"><b>StatefulAverage CPU time </b></td> * </tr> * <tr> * <td width="14%">R</td> * <td width="39%">N-n</td> * <td width="47%">O(N)</td> * </tr> * <tr> * <td width="14%">X</td> * <td width="39%">2n ln (N/n)</td> * <td width="47%"> O(N)</td> * </tr> * <tr> * <td width="14%">Y</td> * <td width="39%">2n ln (N/n)</td> * <td width="47%">O ( n^2 ( 1+ log (N/n)*log (log(N/n))))</td> * </tr> * <tr> * <td width="14%">Z</td> * <td width="39%">3n ln (N/n)</td> * <td width="47%">O(n(1+log(N/n)))</td> * </tr> *</table> * <br> * Type Y is not implemented so far due to the lack of information for computing * the needed distribution for the used random variable. All the other three types * of <tt>how to sample</tt> are implemented as described in [Vit85]. * * <br> * <p><b>Objects of this type are recommended for the usage with aggregator cursors!</b></p> * <br> * * Consider the following example: * <code><pre> * Iterator it = new Aggregator( new DiscreteRandomNumber(new JavaDiscreteRandomWrapper(100),50), new ReservoirSample ( 10, new XType(10)) ); while ( it.hasNext() ){ Object [] o = (Object []) it.next(); if(o==null) continue; for (int i=0; i< o.length; i++) System.out.print(": " + o[i]); // print the current output } * <\code><\pre> * <br> * * @see xxl.core.cursors.mappers.Aggregator * @see xxl.core.functions.Function * @see java.util.Iterator * @see xxl.core.cursors.mappers.ReservoirSampler * */ public class ReservoirSample extends AggregationFunction<Number,Number[]> { /** Interface providing a strategy for reservoir sampling techniques. * For further informations see {@link xxl.core.cursors.mappers.ReservoirSampler}. */ public static interface ReservoirSamplingStrategy { /** Computes the position where to store the currently treated object. * * @return position where to store the currently treated object * in the reservoir used for sampling. Returns -1 if the object * will be skipped. */ public int invoke(); } /** This class provides a preimplementation of the ReservoirSamplingStrategy * interface to minimize the effort required to implement this interface. * * For further informations see {@link xxl.core.cursors.mappers.ReservoirSampler}. */ public static abstract class AbstractStrategy implements ReservoirSamplingStrategy { /** Pseudo Random Number Generator (PRNG) used for continuous random variables */ protected ContinuousRandomWrapper cRandom; /** Pseudo Random Number Generator (PRNG) used for discrete random variables */ protected DiscreteRandomWrapper dRandom; /** size of the reservoir and also of the drawn sample */ protected int reservoirSize; /** number of objects to skip in the underlying stream for increasing * performance of the sampling strategy. */ protected int skip = -1; /** Number of objects already processed. Necessary to compute the sampling distribution */ protected int count = 0; // CAUTION: checking paper if count needs to start at 0 // ERROR in paper [Vit85] at formula (3.2), Z-Type alg.: /** Default constructor for random sampling strategies. * * @param reservoirSize size of the used reservoir * @param cRandom pseudo random number generator (PRNG) delivering * uniform distributed random numbers * @param dRandom pseudo random number generator (PRNG) delivering * equally discrete distributed random numbers */ public AbstractStrategy(int reservoirSize, ContinuousRandomWrapper cRandom, DiscreteRandomWrapper dRandom) { this.reservoirSize = reservoirSize; this.cRandom = cRandom; this.dRandom = dRandom; } /** Default constructor for random sampling strategies. * Using this constructor means using the java built-in PRNG. * * @param reservoirSize size of the used reservoir */ public AbstractStrategy(int reservoirSize) { this( reservoirSize, new JavaContinuousRandomWrapper(new Random()), new JavaDiscreteRandomWrapper(new Random())); } /** Preimplementation of the invoke-method to compute the skip value * for the sampling strategy. One only needs to implement computeSkip and * computePos to fulfill the requirements of reservoir sampling. * * @return computed position where to store the currently treated * object in the used reservoir. If the given position exceeds the * bounds of the reservoir the element has to be skipped. */ public int invoke() { if (skip == -1) computeSkip(); int pos = -1; if (skip > 0) { skip--; pos = -1; } else { // skip == 0 // Compute position in reservoir pos = computePos(); skip = -1; // compute new skip value } count++; return pos; } /** Computes the internally used skip value. * */ protected abstract void computeSkip(); /** Computes the internally used position where to store the element * in the used reservoir. * * @return the computed position */ protected abstract int computePos(); } /** * Type R strategy computes for every object of the underlying iterator * the designated position in the reservoir. If the computed * position exceeds the reservoir the object will be skipped. * For further informations see {@link xxl.core.cursors.mappers.ReservoirSampler}. */ public static class RType extends AbstractStrategy { /** Constructs a new object of the Rtype reservoir sampling strategy. * * @param reservoirSize size of the used reservoir * @param cRandom pseudo random number generator (PRNG) delivering * uniform distributed * random numbers between 0.0d (inclusively) and 1.0d (exclusively). * @param dRandom pseudo random number generator (PRNG) delivering * equally discrete distributed * random numbers between zero (inclusively) and any given integer (exclusively) */ public RType(int reservoirSize, ContinuousRandomWrapper cRandom, DiscreteRandomWrapper dRandom) { super(reservoirSize, cRandom, dRandom); } /** Constructs a new object of the Rtype reservoir sampling strategy. * Using this constructor * means using the Java Random Engine (which has a bad reputation). * In package connectivity.colt wrappers for the usage with the Colt * Library are available. * * @param reservoirSize size of the used reservoir */ public RType(int reservoirSize) { this( reservoirSize, new JavaContinuousRandomWrapper(new Random()), new JavaDiscreteRandomWrapper(new Random())); } /** Computes the skip value for type R strategy. */ protected void computeSkip() { int r = count; // U is denoted in uppercase in cause of indicating the presence of a random variable double U = cRandom.nextDouble(); while (U * (reservoirSize + r + 1) > reservoirSize) { //s++; r++; U = cRandom.nextDouble(); } skip = r - count; } /** Computes the position where to store the element in the reservoir. * * @return computed position */ protected int computePos() { //return dRandom.nextInt (reservoirSize) ; double t = cRandom.nextDouble(); t = t * reservoirSize; if (t == 1.0) return reservoirSize - 1; else return (int) Math.floor(t); } } /** * Type X strategy computes first the number of objects to skip and then * the position in the reservoir for the object followed by the last one skipped. * For further informations see {@link xxl.core.cursors.mappers.ReservoirSampler}. */ public static class XType extends AbstractStrategy { /** Constructs a new object of the Xtype reservoir sampling strategy. * * @param reservoirSize size of the used reservoir * @param cRandom pseudo random number generator (PRNG) delivering * uniform distributed * random numbers between 0.0d (inclusively) and 1.0d (exclusively) * @param dRandom pseudo random number generator (PRNG) delivering * equally discrete distributed * random numbers between zero (inclusively) and any given integer (exclusively) */ public XType(int reservoirSize, ContinuousRandomWrapper cRandom, DiscreteRandomWrapper dRandom) { super(reservoirSize, cRandom, dRandom); } /** Constructs a new object of the Xtype reservoir sampling strategy. * Using this constructor * means using the Java Random Engine (which has a bad reputation). * In package connectivity.colt wrappers for the usage with the Colt * Library are available. * * @param reservoirSize size of the used reservoir */ public XType(int reservoirSize) { this( reservoirSize, new JavaContinuousRandomWrapper(new Random()), new JavaDiscreteRandomWrapper(new Random())); } /** Computes the skip value for type X strategy. */ protected void computeSkip() { // double rand = random number in [0,1] double U = cRandom.nextDouble(); int s = 0; //int z = reservoirSize + count - 1 ; int z = count + 1; int n = count + reservoirSize + 1; double val = (double) z / (double) n; while (val >= U) { z++; n++; val = val * z * (1.0 / n); s++; } skip = s; } /** Computes the position where to store the element in the reservoir. * * @return computed position */ protected int computePos() { //return dRandom.nextInt(reservoirSize); double t = cRandom.nextDouble(); t = t * reservoirSize; if (t == 1.0) return reservoirSize - 1; else return (int) Math.floor(t); } } /** * Type Z strategy computes first the number of objects to skip and then * the position in the reservoir for the object followed by the last one skipped. * For further informations see {@link xxl.core.cursors.mappers.ReservoirSampler}. */ public static class ZType extends AbstractStrategy { /** Constructs a new object of the Ztype reservoir sampling strategy. * * @param reservoirSize size of the used reservoir * @param cRandom pseudo random number generator (PRNG) delivering * uniform distributed * random numbers between 0.0d (inclusively) and 1.0d (exclusively). * @param dRandom pseudo random number generator (PRNG) delivering * equally discrete distributed * random numbers between zero (inclusively) and any given integer (exclusively) */ public ZType(int reservoirSize, ContinuousRandomWrapper cRandom, DiscreteRandomWrapper dRandom) { super(reservoirSize, cRandom, dRandom); } /** Constructs a new object of the Ztype reservoir sampling strategy. * Using this constructor * means using the Java Random Engine (which has a bad reputation). * In package connectivity.colt wrappers for the usage with the Colt * Library are available. * * @param reservoirSize size of the used reservoir */ public ZType(int reservoirSize) { this( reservoirSize, new JavaContinuousRandomWrapper(new Random()), new JavaDiscreteRandomWrapper(new Random())); } /** Computes the skip value for type Z strategy. */ protected void computeSkip() { boolean isValid = false; do { isValid = true; double U = cRandom.nextDouble(); double V = cRandom.nextDouble(); double G = (count + reservoirSize) * Math.pow(1.0 / V, 1.0 / reservoirSize) - (count + reservoirSize); double f; double g = (reservoirSize / (reservoirSize + count + G)) * Math.pow( (reservoirSize + count) / (reservoirSize + count + G), reservoirSize); double h = ((double) reservoirSize / (double) (reservoirSize + count + 1)) * Math.pow( (double) (count + 1) / (double) (count + 1 + (int) Math.floor(G)), (reservoirSize + 1)); double c = (double) (reservoirSize + count + 1) / (double) (count + 1); // --- if (U <= h / (c * g)) { skip = (int) Math.floor(G); } else { double z = (double) count + 1; double n = (reservoirSize + count + 1); double val = 1.0; // This if is used due to a bug in the original paper [Vit85]: // The authors transformation in formula (3.2) holds for // every n and t but not for n=t. In this case // in the process of transforming the formula there occurs // a division by zero ( 1/ (t-n))! if (Math.floor(G) == 0) { f = (double) reservoirSize / (double) (reservoirSize + count + 1); } else { for (int i = 1; i <= (int) Math.floor(G); i++) { val *= z / n; z++; n++; } f = (reservoirSize / (reservoirSize + count + Math.floor(G) + 1)) * val; } //end of else if (U <= f / (c * g)) { skip = (int) Math.floor(G); } else { isValid = false; } } } while (!isValid); } /** Computes the position where to store the element in the reservoir. * * @return computed position */ protected int computePos() { //return dRandom.nextInt(reservoirSize); double t = cRandom.nextDouble(); t = t * reservoirSize; if (t == 1.0) return reservoirSize - 1; else return (int) Math.floor(t); } } /** strategy used for sampling */ protected ReservoirSamplingStrategy strategy; /** indicates whether the reservoir is initialized or not */ protected boolean initialized = false; /** number of already processed objects */ protected long counts = 0; /** reservoir used for storing the sampled objects */ protected Number[] reservoir; /** Constructs a new aggregation function of type sampling. * * @param n size of the sample to draw * @param strategy sampling strategy used for determine the * position of the treated object * in the sampling reservoir * @throws IllegalArgumentException if the size of the reservoir isn't at least 1 */ public ReservoirSample(int n, ReservoirSamplingStrategy strategy) throws IllegalArgumentException { this(new Number[n], strategy); } /** Constructs a new aggregation function of type sampling. * * @param reservoir reservoir used to store the sampling * @param strategy sampling strategy used for determine the * position of the treated object * in the sampling reservoir * @throws IllegalArgumentException if the size of the reservoir isn't at least 1 */ public ReservoirSample(Number[] reservoir, ReservoirSamplingStrategy strategy) throws IllegalArgumentException { if (reservoir.length < 1) throw new IllegalArgumentException("reservoir must have at least size 1"); this.reservoir = reservoir; this.strategy = strategy; } /** Two-figured function call for supporting aggregation by this function. * Each aggregation function must support a function call like <tt>agg_n = f (agg_n-1, next)</tt>, * where <tt>agg_n</tt> denotes the computed aggregation value after <tt>n</tt> steps, <tt>f</tt> * the aggregation function, <tt>agg_n-1</tt> the computed aggregation value after <tt>n-1</tt> steps * and <tt>next</tt> the next object to use for computation. * This method delivers only <tt>null</tt> as aggregation result as long as the aggregation * has not yet initialized. * * @param oldReservoir result of the aggregation function in the previous computation step * (<tt>iid</tt> sample of previous treated data) * @param next next object used for computation * @return aggregation value after n steps (new reservoir containing an <tt>iid>/tt> sample) */ public Number[] invoke(Number[] oldReservoir, Number next) { if (!initialized) { reservoir[(int) counts] = next; counts++; if (counts >= reservoir.length) { initialized = true; return reservoir; } else { return null; } } else { counts++; int pos = strategy.invoke(); //if ( ( pos >= 0) && ( pos < reservoir.length) ) reservoir [pos] = next; try { reservoir[pos] = next; } catch (ArrayIndexOutOfBoundsException e) {} return reservoir; } } }