/**
* Licensed to Cloudera, Inc. under one or more contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. Cloudera, Inc. licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
package org.talend.dataquality.sampling;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
/**
* Reservoir sampling is a method for getting a uniform random sampling of n data elements from a stream of N data
* elements without a priori knowing the total number of elements N.
*
* This is based off of Algorithm R from this paper: http://portal.acm.org/citation.cfm?id=3165
*
* This implementation uses a push-based observer instead of the pull-based iterator found in the paper. The
* "fast forwarding" methods in the paper are not relevant to our application.
*/
public class ReservoirSampler<T> {

    /** Maximum number of elements kept in the reservoir. */
    private final int samples;

    /** The reservoir: holds at most {@link #samples} elements. */
    private final List<T> candidates;

    /** Number of elements accepted by {@link #onNext(Object)} since construction or the last {@link #clear()}. */
    private long count = 0;

    /** When {@code true}, {@link #onNext(Object)} ignores incoming elements. */
    private boolean done = false;

    /** Source of randomness used to decide whether (and which slot) to replace. */
    private final Random rand;

    /**
     * Creates a sampler with a fixed seed, which makes the selection reproducible for a given input sequence.
     *
     * @param samples number of elements to keep in the reservoir; must not be negative
     * @param seed seed of the pseudo random generator
     * @throws IllegalArgumentException if {@code samples} is negative
     */
    public ReservoirSampler(int samples, long seed) {
        if (samples < 0) {
            throw new IllegalArgumentException("samples must not be negative: " + samples);
        }
        this.samples = samples;
        this.candidates = new ArrayList<T>(samples);
        this.rand = new Random(seed);
    }

    /**
     * Creates a sampler seeded with the current time in milliseconds.
     *
     * @param samples number of elements to keep in the reservoir; must not be negative
     * @throws IllegalArgumentException if {@code samples} is negative
     */
    public ReservoirSampler(int samples) {
        this(samples, System.currentTimeMillis());
    }

    /**
     * Sets the completion flag. While the flag is {@code true}, {@link #onNext(Object)} ignores every element;
     * passing {@code false} makes the sampler accept elements again.
     *
     * @param b the new value of the completion flag
     */
    public void onCompleted(boolean b) {
        done = b;
    }

    /**
     * Marks the stream as finished so that further elements are ignored. The exception itself is neither stored
     * nor rethrown.
     *
     * @param e the error reported by the producer (unused)
     */
    public void onError(Exception e) {
        done = true;
    }

    /**
     * Offers the next element of the stream to the sampler.
     *
     * The first {@code samples} elements are stored directly. Each later element replaces a randomly chosen
     * slot of the reservoir with probability {@code samples / count}, where {@code count} is the number of
     * elements offered so far (including this one).
     *
     * @param v the element to consider
     */
    public void onNext(T v) {
        if (done) {
            return;
        }
        count++;
        if (candidates.size() < samples) {
            // Still filling the reservoir: keep every element.
            candidates.add(v);
            return;
        }
        // rand.nextDouble() is in [0.0, 1.0), so the drawn index is uniform over the elements seen so far;
        // only indexes that fall inside the reservoir trigger a replacement.
        long replace = (long) Math.floor(count * rand.nextDouble());
        if (replace < samples) {
            candidates.set((int) replace, v);
        }
        // else keep the current sample reservoir
    }

    /**
     * Returns an unmodifiable view of the sample list. The view is backed by the live reservoir, so later calls
     * to {@link #onNext(Object)} or {@link #clear()} are visible through it.
     *
     * @return a read-only view of the current sample
     */
    public List<T> sample() {
        return Collections.unmodifiableList(candidates);
    }

    /**
     * Empties the reservoir and restarts the element counter, so that elements offered afterwards are sampled
     * as a fresh stream. Without resetting the counter, the replacement probability would still be computed
     * from the elements seen before the clear and the new sample would be biased towards the first elements.
     *
     * The completion flag and the state of the random generator are left untouched.
     */
    public void clear() {
        candidates.clear();
        count = 0;
    }
}