package com.twitter.common.stats;
import java.util.Collections;
import java.util.Vector;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Random;
/**
* An in memory implementation of Reservoir Sampling for sampling from
* a population.
* <p>Several optimizations are possible.
* In particular, a more involved technique avoids rolling the dice
* once for every item in the population.
* See "Random Sampling with a Reservoir", Vitter, 1985</p>
* <p>TODO (delip): Fix this when the problem arises</p>
*
* @param <T> Type of the sample
* @author Delip Rao
*/
public class ReservoirSampler<T> {
  // Holds at most numSamples items. Vector is kept deliberately: its individual
  // operations are synchronized, and swapping it for an unsynchronized list would
  // silently change what concurrent readers observe.
  // NOTE: sample() still performs an unsynchronized check-then-act on this list and
  // on numItemsSeen, so an instance should not be shared across threads without
  // external locking.
  private final Vector<T> reservoir = new Vector<T>();

  // Capacity of the reservoir; always > 0 (enforced by the constructor).
  private final int numSamples;

  // Source of randomness used to decide which retained sample gets replaced.
  private final Random random;

  // Number of items offered to sample() so far. Saturates at Integer.MAX_VALUE - 1
  // (see sample()) so that numItemsSeen + 1 never overflows.
  private int numItemsSeen = 0;

  /**
   * Create a new sampler with a certain reservoir size using
   * a supplied random number generator.
   *
   * @param numSamples Maximum number of samples to
   *                   retain in the reservoir. Must be positive.
   * @param random Instance of the random number generator
   *               to use for sampling. May not be null.
   * @throws IllegalArgumentException if {@code numSamples} is zero or negative
   * @throws NullPointerException if {@code random} is null
   */
  public ReservoirSampler(int numSamples, Random random) {
    Preconditions.checkArgument(numSamples > 0,
        "numSamples should be positive");
    Preconditions.checkNotNull(random);
    this.numSamples = numSamples;
    this.random = random;
  }

  /**
   * Create a new sampler with a certain reservoir size using
   * the default random number generator.
   *
   * @param numSamples Maximum number of samples to
   *                   retain in the reservoir. Must be positive.
   * @throws IllegalArgumentException if {@code numSamples} is zero or negative
   */
  public ReservoirSampler(int numSamples) {
    this(numSamples, Random.Util.newDefaultRandom());
  }

  /**
   * Sample an item and store in the reservoir if needed.
   *
   * <p>The first {@code numSamples} items are always retained. Every later item
   * replaces a randomly chosen retained item only when the drawn index falls inside
   * the reservoir; otherwise it is discarded.
   *
   * @param item The item to sample - may not be null.
   * @throws NullPointerException if {@code item} is null
   */
  public void sample(T item) {
    Preconditions.checkNotNull(item);
    if (reservoir.size() < numSamples) {
      // reservoir not yet full, just append
      reservoir.add(item);
    } else {
      // numItemsSeen items came before this one, so draw an index among
      // numItemsSeen + 1 candidates; only indices inside the reservoir cause a
      // replacement (assuming nextInt(n) is uniform over [0, n), the item is kept
      // with probability numSamples / (numItemsSeen + 1)).
      int rIndex = random.nextInt(numItemsSeen + 1);
      if (rIndex < numSamples) {
        reservoir.set(rIndex, item);
      }
    }
    // Saturate instead of incrementing forever: at Integer.MAX_VALUE the expression
    // numItemsSeen + 1 above would overflow to a negative bound. After saturation the
    // replacement probability simply stops shrinking.
    if (numItemsSeen < Integer.MAX_VALUE - 1) {
      numItemsSeen++;
    }
  }

  /**
   * Get samples collected in the reservoir.
   *
   * <p>The result is a read-only live view: it reflects later calls to
   * {@link #sample(Object)}, but cannot be used to modify the reservoir.
   *
   * @return A sequence of the samples. No guarantee is provided on the order of the samples.
   */
  public Iterable<T> getSamples() {
    // Wrap the internal list so callers cannot mutate sampler state through it
    // (e.g. via Iterator.remove() or by casting the result).
    return Collections.unmodifiableList(reservoir);
  }
}