package edu.washington.escience.myria.operator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import edu.washington.escience.myria.util.SamplingType;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.util.TestEnvVars;
/**
* Tests the SamplingDistribution operator by verifying the results of various
* scenarios.
*/
public class SamplingDistributionTest {

  /** Seed handed to every {@link SamplingDistribution} constructed by these tests. */
  static final long RANDOM_SEED = 42;

  /** Schema of the operator's input: one (WorkerID, PartitionSize) row per worker. */
  final Schema inputSchema =
      Schema.ofFields("WorkerID", Type.INT_TYPE, "PartitionSize", Type.INT_TYPE);

  /** Schema every output batch of the operator is expected to carry. */
  final Schema expectedResultSchema =
      Schema.ofFields(
          "WorkerID",
          Type.INT_TYPE,
          "StreamSize",
          Type.INT_TYPE,
          "SampleSize",
          Type.INT_TYPE,
          "SampleType",
          Type.STRING_TYPE);

  /**
   * Expected (WorkerID, StreamSize, SampleSize) rows when nothing is sampled from the default
   * input built by {@link #setup()}.
   */
  private static final int[][] NOTHING_SAMPLED = {
    {1, 300, 0}, {2, 200, 0}, {3, 400, 0}, {4, 100, 0}
  };

  /**
   * Expected (WorkerID, StreamSize, SampleSize) rows when every tuple of the default input built
   * by {@link #setup()} is sampled.
   */
  private static final int[][] EVERYTHING_SAMPLED = {
    {1, 300, 300}, {2, 200, 200}, {3, 400, 400}, {4, 100, 100}
  };

  /** Input fed to the operator; rebuilt before every test. */
  TupleBatchBuffer input;

  /** Operator under test; kept in a field so {@link #cleanup()} can close it. */
  SamplingDistribution sampOp;

  /** Builds the default input: four workers whose partition sizes add up to 1000 tuples. */
  @Before
  public void setup() {
    input = new TupleBatchBuffer(inputSchema);
    addWorker(1, 300);
    addWorker(2, 200);
    addWorker(3, 400);
    addWorker(4, 100);
  }

  /** Sample size 0, with replacement. */
  @Test
  public void testSampleWRSizeZero() throws DbException {
    verifyExpectedResults(0, SamplingType.WithReplacement, NOTHING_SAMPLED);
  }

  /** Sample size 0, without replacement. */
  @Test
  public void testSampleWoRSizeZero() throws DbException {
    verifyExpectedResults(0, SamplingType.WithoutReplacement, NOTHING_SAMPLED);
  }

  /** Sample size 0%, with replacement. */
  @Test
  public void testSampleWRPctZero() throws DbException {
    verifyExpectedResults(0f, SamplingType.WithReplacement, NOTHING_SAMPLED);
  }

  /** Sample size 0%, without replacement. */
  @Test
  public void testSampleWoRPctZero() throws DbException {
    verifyExpectedResults(0f, SamplingType.WithoutReplacement, NOTHING_SAMPLED);
  }

  /** Sample size 1, with replacement. */
  @Test
  public void testSampleWRSizeOne() throws DbException {
    verifyPossibleDistribution(1, SamplingType.WithReplacement);
  }

  /** Sample size 1, without replacement. */
  @Test
  public void testSampleWoRSizeOne() throws DbException {
    verifyPossibleDistribution(1, SamplingType.WithoutReplacement);
  }

  /** Sample size 50, with replacement. */
  @Test
  public void testSampleWRSizeFifty() throws DbException {
    verifyPossibleDistribution(50, SamplingType.WithReplacement);
  }

  /** Sample size 50, without replacement. */
  @Test
  public void testSampleWoRSizeFifty() throws DbException {
    verifyPossibleDistribution(50, SamplingType.WithoutReplacement);
  }

  /** Sample size 50%, with replacement. */
  @Test
  public void testSampleWRPctFifty() throws DbException {
    verifyPossibleDistribution(50f, SamplingType.WithReplacement);
  }

  /** Sample size 50%, without replacement. */
  @Test
  public void testSampleWoRPctFifty() throws DbException {
    verifyPossibleDistribution(50f, SamplingType.WithoutReplacement);
  }

  /** Sample all but one tuple, without replacement. */
  @Test
  public void testSampleWoRSizeAllButOne() throws DbException {
    verifyPossibleDistribution(999, SamplingType.WithoutReplacement);
  }

  /** Sample all but one tuple, with replacement. */
  @Test
  public void testSampleWRSizeAllButOne() throws DbException {
    verifyPossibleDistribution(999, SamplingType.WithReplacement);
  }

  /** SamplingWoR the entire population (by count) == return all. */
  @Test
  public void testSampleWoRSizeMax() throws DbException {
    verifyExpectedResults(1000, SamplingType.WithoutReplacement, EVERYTHING_SAMPLED);
  }

  /** SamplingWoR the entire population (by percentage) == return all. */
  @Test
  public void testSampleWoRPctMax() throws DbException {
    verifyExpectedResults(100f, SamplingType.WithoutReplacement, EVERYTHING_SAMPLED);
  }

  /** SamplingWR as many tuples as the entire population (by count). */
  @Test
  public void testSampleWRSizeMax() throws DbException {
    verifyPossibleDistribution(1000, SamplingType.WithReplacement);
  }

  /** SamplingWR as many tuples as the entire population (by percentage). */
  @Test
  public void testSampleWRPctMax() throws DbException {
    verifyPossibleDistribution(100f, SamplingType.WithReplacement);
  }

  /** Cannot sample more than total size (WoR, by count). */
  @Test(expected = IllegalStateException.class)
  public void testSampleWoRSizeTooMany() throws DbException {
    drainOperator(1001, SamplingType.WithoutReplacement);
  }

  /** Cannot sample more than 100% (WoR). */
  @Test(expected = IllegalArgumentException.class)
  public void testSampleWoRPctTooMany() throws DbException {
    drainOperator(100.1f, SamplingType.WithoutReplacement);
  }

  /** Cannot sample more than total size (WR, by count). */
  @Test(expected = IllegalStateException.class)
  public void testSampleWRSizeTooMany() throws DbException {
    drainOperator(1001, SamplingType.WithReplacement);
  }

  /** Cannot sample more than 100% (WR). */
  @Test(expected = IllegalArgumentException.class)
  public void testSampleWRPctTooMany() throws DbException {
    drainOperator(100.1f, SamplingType.WithReplacement);
  }

  /** Cannot sample a negative number of samples (WoR, by count). */
  @Test(expected = IllegalArgumentException.class)
  public void testSampleWoRSizeNegative() throws DbException {
    drainOperator(-1, SamplingType.WithoutReplacement);
  }

  /** Cannot sample a negative percentage (WoR). */
  @Test(expected = IllegalArgumentException.class)
  public void testSampleWoRPctNegative() throws DbException {
    drainOperator(-0.01f, SamplingType.WithoutReplacement);
  }

  /** Cannot sample a negative number of samples (WR, by count). */
  @Test(expected = IllegalArgumentException.class)
  public void testSampleWRSizeNegative() throws DbException {
    // Must exercise WithReplacement; the WoR variant is covered by testSampleWoRSizeNegative.
    drainOperator(-1, SamplingType.WithReplacement);
  }

  /** Cannot sample a negative percentage (WR). */
  @Test(expected = IllegalArgumentException.class)
  public void testSampleWRPctNegative() throws DbException {
    // Must exercise WithReplacement; the WoR variant is covered by testSampleWoRPctNegative.
    drainOperator(-0.01f, SamplingType.WithReplacement);
  }

  /** Worker cannot report a negative partition size (WoR). */
  @Test(expected = IllegalStateException.class)
  public void testSampleWoRWorkerNegative() throws DbException {
    addWorker(5, -1);
    drainOperator(50, SamplingType.WithoutReplacement);
  }

  /** Worker cannot report a negative partition size (WR). */
  @Test(expected = IllegalStateException.class)
  public void testSampleWRWorkerNegative() throws DbException {
    addWorker(5, -1);
    drainOperator(50, SamplingType.WithReplacement);
  }

  /** Closes the operator under test if a test left it open. */
  @After
  public void cleanup() throws DbException {
    if (sampOp != null && sampOp.isOpen()) {
      sampOp.close();
    }
  }

  /**
   * Appends one (WorkerID, PartitionSize) row to {@link #input}.
   *
   * @param workerId value for the WorkerID column
   * @param partitionSize value for the PartitionSize column
   */
  private void addWorker(int workerId, int partitionSize) {
    input.putInt(0, workerId);
    input.putInt(1, partitionSize);
  }

  /**
   * Creates the operator over {@link #input} with an absolute sample size and opens it.
   *
   * @param sampleSize number of tuples to sample
   * @param sampleType sampling strategy
   * @return the opened operator, also stored in {@link #sampOp}
   * @throws DbException if opening the operator fails
   */
  private SamplingDistribution openOperator(int sampleSize, SamplingType sampleType)
      throws DbException {
    // Store in the field before opening so cleanup() can still close it if a later step throws.
    sampOp =
        new SamplingDistribution(new BatchTupleSource(input), sampleSize, sampleType, RANDOM_SEED);
    sampOp.open(TestEnvVars.get());
    return sampOp;
  }

  /**
   * Creates the operator over {@link #input} with a percentage sample size and opens it.
   *
   * @param samplePct percentage of the total population to sample
   * @param sampleType sampling strategy
   * @return the opened operator, also stored in {@link #sampOp}
   * @throws DbException if opening the operator fails
   */
  private SamplingDistribution openOperator(float samplePct, SamplingType sampleType)
      throws DbException {
    // Store in the field before opening so cleanup() can still close it if a later step throws.
    sampOp =
        new SamplingDistribution(new BatchTupleSource(input), samplePct, sampleType, RANDOM_SEED);
    sampOp.open(TestEnvVars.get());
    return sampOp;
  }

  /**
   * Compares the first three columns of every output row against known expected rows.
   *
   * @param op an already opened operator
   * @param expectedResults expected (WorkerID, StreamSize, SampleSize) rows, in output order
   * @throws DbException if the operator fails while producing output
   */
  private void verifyExpectedResults(SamplingDistribution op, int[][] expectedResults)
      throws DbException {
    int rowIdx = 0;
    while (!op.eos()) {
      final TupleBatch batch = op.nextReady();
      if (batch == null) {
        continue;
      }
      assertEquals(expectedResultSchema, batch.getSchema());
      for (int i = 0; i < batch.numTuples(); ++i, ++rowIdx) {
        // Fail with a readable assertion instead of an ArrayIndexOutOfBoundsException.
        assertTrue(
            "operator produced more than " + expectedResults.length + " rows",
            rowIdx < expectedResults.length);
        final int[] expectedRow = expectedResults[rowIdx];
        assertEquals(expectedRow[0], batch.getInt(0, i));
        assertEquals(expectedRow[1], batch.getInt(1, i));
        assertEquals(expectedRow[2], batch.getInt(2, i));
      }
    }
    assertEquals(expectedResults.length, rowIdx);
  }

  /** Runs the operator with an absolute sample size and checks its exact output. */
  private void verifyExpectedResults(
      int sampleSize, SamplingType sampleType, int[][] expectedResults) throws DbException {
    verifyExpectedResults(openOperator(sampleSize, sampleType), expectedResults);
  }

  /** Runs the operator with a percentage sample size and checks its exact output. */
  private void verifyExpectedResults(
      float samplePct, SamplingType sampleType, int[][] expectedResults) throws DbException {
    verifyExpectedResults(openOperator(samplePct, sampleType), expectedResults);
  }

  /**
   * Tests the actual distribution against what could be possible. Note: doesn't
   * test if it is statistically random.
   *
   * <p>Checks that each per-worker sample size lies in [0, total sample size], that sampling
   * without replacement never exceeds the worker's stream size, that one row is produced per
   * input row, and that the per-worker sample sizes add up to the total sample size.
   *
   * @param op an already opened operator
   * @throws DbException if the operator fails while producing output
   */
  private void verifyPossibleDistribution(SamplingDistribution op) throws DbException {
    int rowCount = 0;
    int computedSampleSize = 0;
    while (!op.eos()) {
      final TupleBatch batch = op.nextReady();
      if (batch == null) {
        continue;
      }
      assertEquals(expectedResultSchema, batch.getSchema());
      for (int i = 0; i < batch.numTuples(); ++i, ++rowCount) {
        final int streamSize = batch.getInt(1, i);
        final int workerSampleSize = batch.getInt(2, i);
        assertTrue(workerSampleSize >= 0 && workerSampleSize <= op.getSampleSize());
        if (op.getSampleType() == SamplingType.WithoutReplacement) {
          // SampleWoR cannot sample more than worker's population size.
          assertTrue(workerSampleSize <= streamSize);
        }
        computedSampleSize += workerSampleSize;
      }
    }
    assertEquals(input.numTuples(), rowCount);
    assertEquals(op.getSampleSize(), computedSampleSize);
  }

  /** Runs the operator with an absolute sample size and checks its output is plausible. */
  private void verifyPossibleDistribution(int sampleSize, SamplingType sampleType)
      throws DbException {
    verifyPossibleDistribution(openOperator(sampleSize, sampleType));
  }

  /** Runs the operator with a percentage sample size and checks its output is plausible. */
  private void verifyPossibleDistribution(float samplePct, SamplingType sampleType)
      throws DbException {
    verifyPossibleDistribution(openOperator(samplePct, sampleType));
  }

  /** Pulls batches from the operator until it reports end of stream, discarding them. */
  private void drain(SamplingDistribution op) throws DbException {
    while (!op.eos()) {
      op.nextReady();
    }
  }

  /** Run through all results (absolute sample size) without doing anything. */
  private void drainOperator(int sampleSize, SamplingType sampleType) throws DbException {
    drain(openOperator(sampleSize, sampleType));
  }

  /** Run through all results (percentage sample size) without doing anything. */
  private void drainOperator(float samplePct, SamplingType sampleType) throws DbException {
    drain(openOperator(samplePct, sampleType));
  }
}