/**
 *
 */
package edu.washington.escience.myria.operator;

import java.util.List;
import java.util.Map;
import java.util.Random;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.RelationKey;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.accessmethod.ConnectionInfo;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.column.builder.IntColumnBuilder;
import edu.washington.escience.myria.parallel.RelationWriteMetadata;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;

/**
 * Samples the stream into a temp relation.
 *
 * <p>
 * Tuples from the child are buffered in an in-memory reservoir of at most {@code sampleSize} tuples. When the child
 * reaches end-of-stream the reservoir is inserted into the sample relation, and a single
 * (WorkerID, PartitionSize, PartitionSampleSize) row is inserted into the count relation.
 */
public class SampledDbInsertTemp extends DbInsertTemp {

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** The name of the table the (WorkerID, PartitionSize, PartitionSampleSize) row is inserted into. */
  private final RelationKey countRelationKey;

  /** Number of tuples seen so far from the child. */
  private int currentTupleCount;

  /** Number of tuples to sample from the stream. */
  private final int sampleSize;

  /** Reservoir that holds sampleSize number of tuples. */
  private MutableTupleBuffer reservoir;

  /** Random generator used for creating the distribution. */
  private Random rand;

  /** Schema that will be written to the countRelationKey. */
  private static final Schema COUNT_SCHEMA =
      Schema.ofFields(
          "WorkerID", Type.INT_TYPE, "PartitionSize", Type.INT_TYPE, "PartitionSampleSize", Type.INT_TYPE);

  /**
   *
   * @param child
   *          the source of tuples to be inserted
   * @param sampleSize
   *          number of tuples to store from the stream; must be non-negative
   * @param sampleRelationKey
   *          the key of the table that tuples will be inserted into
   * @param countRelationKey
   *          the key of the table that tuple count info will be inserted into; must not be null
   * @param connectionInfo
   *          parameters of the database connection
   * @param randomSeed
   *          value to seed the random generator with. null if no specified seed
   */
  public SampledDbInsertTemp(
      final Operator child,
      final int sampleSize,
      final RelationKey sampleRelationKey,
      final RelationKey countRelationKey,
      final ConnectionInfo connectionInfo,
      Long randomSeed) {
    super(child, sampleRelationKey, connectionInfo, false, null);
    Preconditions.checkArgument(sampleSize >= 0, "sampleSize must be non-negative");
    this.sampleSize = sampleSize;
    Preconditions.checkNotNull(countRelationKey, "countRelationKey cannot be null");
    this.countRelationKey = countRelationKey;
    rand = new Random();
    if (randomSeed != null) {
      rand.setSeed(randomSeed);
    }
  }

  /**
   * Uses reservoir sampling to insert the specified sampleSize.
   * https://en.wikipedia.org/wiki/Reservoir_sampling
   *
   * <p>
   * The first {@code sampleSize} tuples fill the reservoir. Every later tuple draws a uniform index over all tuples
   * seen so far <em>including itself</em> and overwrites that reservoir slot if the index falls inside the reservoir.
   *
   * @param tb
   *          the batch of tuples received from the child
   * @throws DbException
   *           declared by the overridden method
   */
  @Override
  protected void consumeTuples(final TupleBatch tb) throws DbException {
    final List<? extends Column<?>> columns = tb.getDataColumns();
    final int numColumns = tb.numColumns();
    final int numTuples = tb.numTuples();
    for (int row = 0; row < numTuples; row++) {
      if (reservoir.numTuples() < sampleSize) {
        // Reservoir size < k. Add this tuple.
        for (int col = 0; col < numColumns; col++) {
          reservoir.put(col, columns.get(col), row);
        }
      } else {
        // Replace probabilistically. currentTupleCount is the number of tuples seen BEFORE this one, so the
        // bound must be currentTupleCount + 1 to include the current tuple. Using currentTupleCount alone
        // over-selects every later tuple (the first one past the reservoir would always be kept) and throws
        // IllegalArgumentException from nextInt(0) when sampleSize is 0.
        final int replaceIdx = rand.nextInt(currentTupleCount + 1);
        if (replaceIdx < sampleSize) {
          for (int col = 0; col < numColumns; col++) {
            reservoir.replace(col, replaceIdx, columns.get(col), row);
          }
        }
      }
      currentTupleCount++;
    }
  }

  /**
   * Writes the sampled tuples to the sample relation, then writes one
   * (WorkerID, PartitionSize, PartitionSampleSize) row to the count relation.
   *
   * @throws DbException
   *           declared by the overridden method
   */
  @Override
  protected void childEOS() throws DbException {
    // Insert the reservoir samples.
    for (TupleBatch tb : reservoir.getAll()) {
      accessMethod.tupleBatchInsert(getRelationKey(), tb);
    }

    // Insert (WorkerID, PartitionSize, PartitionSampleSize) to
    // countRelationKey.
    IntColumnBuilder wIdCol = new IntColumnBuilder();
    IntColumnBuilder tupCountCol = new IntColumnBuilder();
    IntColumnBuilder sampledSizeCol = new IntColumnBuilder();
    wIdCol.appendInt(getNodeID());
    tupCountCol.appendInt(currentTupleCount);
    sampledSizeCol.appendInt(reservoir.numTuples());
    ImmutableList.Builder<Column<?>> columns = ImmutableList.builder();
    columns.add(wIdCol.build(), tupCountCol.build(), sampledSizeCol.build());
    TupleBatch tb = new TupleBatch(COUNT_SCHEMA, columns.build());
    accessMethod.tupleBatchInsert(countRelationKey, tb);
  }

  /**
   * Sets up the connection, drops and recreates both the sample table and the count table, and allocates an empty
   * reservoir using the child's schema.
   *
   * @param execEnvVars
   *          execution environment variables, passed through to {@code setupConnection}
   * @throws DbException
   *           declared by the overridden method
   */
  @Override
  protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    setupConnection(execEnvVars);

    // Set up the reservoir table.
    accessMethod.dropTableIfExists(getRelationKey());
    accessMethod.createTableIfNotExists(getRelationKey(), getSchema());

    // Set up the tuple count table.
    accessMethod.dropTableIfExists(countRelationKey);
    accessMethod.createTableIfNotExists(countRelationKey, COUNT_SCHEMA);

    reservoir = new MutableTupleBuffer(getChild().getSchema());
  }

  /** Runs the superclass cleanup and drops the reference to the reservoir. */
  @Override
  public void cleanup() {
    super.cleanup();
    reservoir = null;
  }

  /**
   * @return write metadata for the two relations this operator writes: the sample relation and the count relation
   */
  @Override
  public Map<RelationKey, RelationWriteMetadata> writeSet() {
    return ImmutableMap.of(
        getRelationKey(),
        new RelationWriteMetadata(getRelationKey(), getSchema(), true, true),
        countRelationKey,
        new RelationWriteMetadata(countRelationKey, COUNT_SCHEMA, true, true));
  }
}