/* * Copyright (c) 2008, SQL Power Group Inc. * * This file is part of SQL Power Library. * * SQL Power Library is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * SQL Power Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package ca.sqlpower.util.reservoir; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.List; import java.util.Random; /** * Implementation of Reservoir using a slightly modified version of <i>Algorithm * R</i> as described in "<a * href="http://www.cs.duke.edu/~jsv/Papers/Vit85.Reservoir.pdf">Random * Sampling with a Reservoir</a>" by J. S. Vitter. * <p> * Note that Algorithm R was presented in this paper as the basic well-known * reservoir sampling technique, and was used as the base line for comparison * to a number of more efficient algorithms (called X, Y, and Z). The more * efficient algorithms use the same amount of I/O (a sequential scan of the input), * but significantly less CPU time. The more efficient algorithms are also * significantly more complicated, and require some experimentation with threshold * values to yield maximum benefit. If you choose to implement Algorithm Z, * and testing shows it to be faster, let us know and we'll add it to the * library. */ public class BasicReservoir<T> implements Reservoir<T> { Random r = new Random(); public T[] getSample(ReservoirDataSource<T> dataSource, int n) throws ReservoirDataException { if (n == 0) { return makeArray(dataSource.getElementType(), 0); } // The reservoir. List<T> C = new ArrayList<T>(n); // Make the first n records candidates for the sample for (int j = 0; j < n && dataSource.hasNext(); j++) { C.add(dataSource.readNextRecord()); } int t = n; // t is the number of records processed so far // Process the rest of the records while (dataSource.hasNext()) { t++; int m = (int) (t * r.nextDouble()); // m is random in the range 0 <= m <= t - 1 if (m < n) { // Make the next record a candidate, replacing one at random C.set(m, dataSource.readNextRecord()); } else { // Skip over the next record dataSource.skipRecords(1); } } return C.toArray(makeArray(dataSource.getElementType(), C.size())); } public void setRandomSeed(long s) { r.setSeed(s); } /** * Creates an array of the given size having elements of the given type. * This is in a separate method because it uses a cast that causes a type * safety warning. Don't worry though: it's type safe. * * @param elemType The type of the array elements * @param size The number of elements in the array * @return A new array of the size and type requested. Every element will * have the value <tt>null</tt>. */ @SuppressWarnings("unchecked") private T[] makeArray(Class<T> elemType, int size) { return (T[]) Array.newInstance(elemType, size); } }