/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.util; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Random; /** * Reservoir sampling is a method for getting a uniform random sampling of n * data elements from a stream of N data elements without a priori knowing the * total number of elements N. * * This is based off of Algorithm R from this paper: * http://portal.acm.org/citation.cfm?id=3165 * * This implementation is uses a push-based observer instead of a pull based * iterator found in the paper. The "fast forwarding" methods in the paper are * not relevant to our application.. */ public class ReservoirSampler<T> { int samples; // number of elements to sample. List<T> candidates; // the reservoir. int count = 0; boolean done = false; Random rand; public ReservoirSampler(int samples, long seed) { this.samples = samples; this.candidates = new ArrayList<T>(samples); this.rand = new Random(seed); } public ReservoirSampler(int samples) { this(samples, Clock.unixTime()); } public void onCompleted(boolean b) { done = b; } public void onError(Exception e) { done = true; } public void onNext(T v) { if (done) return; if (candidates.size() < samples) { // for the first n elements. candidates.add(v); count++; return; } // do reservoir sampling. count++; // rand.nextDouble gets a pseudo random value between 0.0 and 1.0 int replace = (int) Math.floor((double) count * rand.nextDouble()); if (replace < samples) { // probability says replace. candidates.set(replace, v); } // else keep the current sample reservoir } /** * Returns an unmodifiable reference to the sample list. */ public List<T> sample() { return Collections.unmodifiableList(candidates); } public void clear() { candidates.clear(); } }