/**
*
*/
package edu.washington.escience.myria.operator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.RelationKey;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.accessmethod.ConnectionInfo;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.column.builder.IntColumnBuilder;
import edu.washington.escience.myria.parallel.RelationWriteMetadata;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
/**
 * Maintains a fixed-size uniform random sample of the child's stream (reservoir sampling),
 * inserts the sampled tuples into a temp relation, and records per-worker
 * (WorkerID, PartitionSize, PartitionSampleSize) counts into a second relation.
 */
public class SampledDbInsertTemp extends DbInsertTemp {
/** Required for Java serialization. */
private static final long serialVersionUID = 1L;
/** The name of the table the tuples should be inserted into. */
private final RelationKey countRelationKey;
/** Number of tuples seen so far from the child. */
private int currentTupleCount;
/** Number of tuples to sample from the stream. */
private final int sampleSize;
/** Reservoir that holds sampleSize number of tuples. */
private MutableTupleBuffer reservoir;
/** Random generator used for creating the distribution. */
private Random rand;
/** Schema that will be written to the countRelationKey. */
private static final Schema COUNT_SCHEMA =
Schema.ofFields(
"WorkerID",
Type.INT_TYPE,
"PartitionSize",
Type.INT_TYPE,
"PartitionSampleSize",
Type.INT_TYPE);
/**
*
* @param child
* the source of tuples to be inserted
* @param sampleSize
* number of tuples to store from the stream
* @param sampleRelationKey
* the key of the table that tuples will be inserted into
* @param countRelationKey
* the key of the table that tuple count info will be inserted into
* @param connectionInfo
* parameters of the database connection
* @param randomSeed
* value to seed the random generator with. null if no specified seed
*/
public SampledDbInsertTemp(
final Operator child,
final int sampleSize,
final RelationKey sampleRelationKey,
final RelationKey countRelationKey,
final ConnectionInfo connectionInfo,
Long randomSeed) {
super(child, sampleRelationKey, connectionInfo, false, null);
Preconditions.checkArgument(sampleSize >= 0, "sampleSize must be non-negative");
this.sampleSize = sampleSize;
Preconditions.checkNotNull(countRelationKey, "countRelationKey cannot be null");
this.countRelationKey = countRelationKey;
rand = new Random();
if (randomSeed != null) {
rand.setSeed(randomSeed);
}
}
/**
* Uses reservoir sampling to insert the specified sampleSize.
* https://en.wikipedia.org/wiki/Reservoir_sampling
*/
@Override
protected void consumeTuples(final TupleBatch tb) throws DbException {
final List<? extends Column<?>> columns = tb.getDataColumns();
for (int i = 0; i < tb.numTuples(); i++) {
if (reservoir.numTuples() < sampleSize) {
// Reservoir size < k. Add this tuple.
for (int j = 0; j < tb.numColumns(); j++) {
reservoir.put(j, columns.get(j), i);
}
} else {
// Replace probabilistically
int replaceIdx = rand.nextInt(currentTupleCount);
if (replaceIdx < sampleSize) {
for (int j = 0; j < tb.numColumns(); j++) {
reservoir.replace(j, replaceIdx, columns.get(j), i);
}
}
}
currentTupleCount++;
}
}
@Override
protected void childEOS() throws DbException {
// Insert the reservoir samples.
for (TupleBatch tb : reservoir.getAll()) {
accessMethod.tupleBatchInsert(getRelationKey(), tb);
}
// Insert (WorkerID, PartitionSize, PartitionSampleSize) to
// countRelationKey.
IntColumnBuilder wIdCol = new IntColumnBuilder();
IntColumnBuilder tupCountCol = new IntColumnBuilder();
IntColumnBuilder sampledSizeCol = new IntColumnBuilder();
wIdCol.appendInt(getNodeID());
tupCountCol.appendInt(currentTupleCount);
sampledSizeCol.appendInt(reservoir.numTuples());
ImmutableList.Builder<Column<?>> columns = ImmutableList.builder();
columns.add(wIdCol.build(), tupCountCol.build(), sampledSizeCol.build());
TupleBatch tb = new TupleBatch(COUNT_SCHEMA, columns.build());
accessMethod.tupleBatchInsert(countRelationKey, tb);
}
@Override
protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
setupConnection(execEnvVars);
// Set up the reservoir table.
accessMethod.dropTableIfExists(getRelationKey());
accessMethod.createTableIfNotExists(getRelationKey(), getSchema());
// Set up the tuple count table.
accessMethod.dropTableIfExists(countRelationKey);
accessMethod.createTableIfNotExists(countRelationKey, COUNT_SCHEMA);
reservoir = new MutableTupleBuffer(getChild().getSchema());
}
@Override
public void cleanup() {
super.cleanup();
reservoir = null;
}
@Override
public Map<RelationKey, RelationWriteMetadata> writeSet() {
return ImmutableMap.of(
getRelationKey(),
new RelationWriteMetadata(getRelationKey(), getSchema(), true, true),
countRelationKey,
new RelationWriteMetadata(countRelationKey, COUNT_SCHEMA, true, true));
}
}