package edu.washington.escience.myria.storage;
import java.io.Serializable;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Objects;
import java.nio.ByteBuffer;
import org.joda.time.DateTime;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.column.PrefixColumn;
import edu.washington.escience.myria.operator.network.distribute.PartitionFunction;
import edu.washington.escience.myria.proto.TransportProto.TransportMessage;
import edu.washington.escience.myria.util.IPCUtils;
import net.jcip.annotations.ThreadSafe;
/**
 * Container class for a batch of tuples. The goal is to amortize memory management overhead and processing overhead
 * from code/data locality.
 *
 * <p>Every field of a TupleBatch is final and the column list is stored as an immutable copy, so the batch itself is
 * never modified after construction; operations such as {@link #filter(BitSet)} or {@link #prefix(int)} return new
 * batches instead.
 */
@ThreadSafe
public class TupleBatch implements ReadableTable, Serializable {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /** The batch size reported by {@link TupleUtils#getBatchSize(Schema)} for this batch's schema. */
  private final int batchSize;
  /** Schema of tuples in this batch. */
  private final Schema schema;
  /** Tuple data stored as columns in this batch. */
  private final ImmutableList<? extends Column<?>> columns;
  /** Number of tuples in this TB. */
  private final int numTuples;
  /** Whether this TB is an EOI TB. */
  private final boolean isEOI;

  /**
   * EOI TB constructor. Builds a batch with zero tuples and one empty column per type in the schema.
   *
   * @param schema schema of the tuples in this batch.
   * @param isEoi whether this TupleBatch is an EOI TupleBatch.
   */
  private TupleBatch(final Schema schema, final boolean isEoi) {
    this.schema = Objects.requireNonNull(schema, "schema");
    numTuples = 0;
    ImmutableList.Builder<Column<?>> emptyColumns = ImmutableList.builder();
    for (Type type : schema.getColumnTypes()) {
      emptyColumns.add(Column.emptyColumn(type));
    }
    columns = emptyColumns.build();
    isEOI = isEoi;
    batchSize = TupleUtils.getBatchSize(schema);
  }

  /**
   * Create a TupleBatch with the same column types and data as this one, but different column names.
   *
   * @param columnNames the new column names.
   * @return a shallow copy of this TupleBatch with the new column names.
   */
  public TupleBatch rename(final List<String> columnNames) {
    Schema newSchema =
        Schema.of(schema.getColumnTypes(), Objects.requireNonNull(columnNames, "columnNames"));
    return new TupleBatch(newSchema, columns, numTuples, isEOI);
  }

  /**
   * Standard immutable TupleBatch constructor. All fields must be populated before creation and cannot be changed.
   *
   * @param schema schema of the tuples in this batch. Must match columns.
   * @param columns contains the column-stored data. Must match schema.
   * @param numTuples the number of tuples in this TupleBatch.
   */
  public TupleBatch(
      final Schema schema, final List<? extends Column<?>> columns, final int numTuples) {
    this(schema, columns, numTuples, false);
  }

  /**
   * Constructor that gets the number of tuples from the first column.
   *
   * @param schema schema of the tuples in this batch. Must match columns.
   * @param columns contains the column-stored data. Must match schema and contain at least one column.
   * @throws IllegalArgumentException if <code>columns</code> is empty.
   */
  public TupleBatch(final Schema schema, final List<? extends Column<?>> columns) {
    this(schema, columns, inferNumTuples(columns));
  }

  /**
   * Construct a TupleBatch from the specified components.
   *
   * @param schema schema of the tuples in this batch. Must match columns.
   * @param columns contains the column-stored data. Must match schema.
   * @param numTuples the number of tuples in this batch. Must match columns.
   * @param isEOI whether this is an EOI TupleBatch.
   * @throws IllegalArgumentException if the number of columns differs from the number of fields in the schema, or
   *         if any column's size differs from <code>numTuples</code>.
   */
  public TupleBatch(
      final Schema schema,
      final List<? extends Column<?>> columns,
      final int numTuples,
      final boolean isEOI) {
    this.schema = Objects.requireNonNull(schema, "schema");
    this.columns = ImmutableList.copyOf(Objects.requireNonNull(columns, "columns"));
    /* Validate the immutable snapshot that is actually stored, not the caller's (possibly mutable) list. */
    Preconditions.checkArgument(
        this.columns.size() == schema.numColumns(),
        "Number of columns in data must equal the number of fields in schema");
    for (int i = 0; i < this.columns.size(); i++) {
      Column<?> column = this.columns.get(i);
      Preconditions.checkArgument(
          numTuples == column.size(),
          "Incorrect size for column %s. Expected %s tuples, but found %s tuples.",
          i,
          numTuples,
          column.size());
    }
    this.numTuples = numTuples;
    this.isEOI = isEOI;
    batchSize = TupleUtils.getBatchSize(schema);
  }

  /**
   * Determine the number of tuples from the size of the first column. Static so that it can be evaluated while
   * delegating to another constructor.
   *
   * @param columns the column-stored data.
   * @return the size of the first column.
   * @throws IllegalArgumentException if <code>columns</code> is empty.
   */
  private static int inferNumTuples(final List<? extends Column<?>> columns) {
    Objects.requireNonNull(columns, "columns");
    Preconditions.checkArgument(
        !columns.isEmpty(), "Cannot infer the number of tuples from an empty list of columns");
    return columns.get(0).size();
  }

  /**
   * put the tuple batch into TBB by smashing it into cells and putting them one by one.
   *
   * @param tbb the TBB buffer.
   */
  public final void compactInto(final TupleBatchBuffer tbb) {
    if (isEOI()) {
      /* an EOI TB has no data */
      tbb.appendTB(this);
      return;
    }
    for (int i = 0; i < numTuples; i++) {
      tbb.append(this, i);
    }
  }

  /**
   * Return a new TupleBatch that contains only the filtered rows of the current dataset. If every row is retained,
   * this batch itself is returned.
   *
   * @param filter the rows to be retained. Its logical length must not exceed the number of tuples in this batch.
   * @return a TupleBatch that contains only the filtered rows of the current dataset.
   * @throws IllegalArgumentException if the filter has a bit set at or beyond the number of tuples in this batch.
   */
  public final TupleBatch filter(final BitSet filter) {
    Objects.requireNonNull(filter, "filter");
    Preconditions.checkArgument(
        filter.length() <= numTuples(),
        "Error: trying to filter a TupleBatch of length %s with a filter of length %s",
        numTuples(),
        filter.length());
    int newNumTuples = filter.cardinality();
    /* Shortcut: the filter is full, so all current tuples are retained. Just return this. */
    if (newNumTuples == numTuples) {
      return this;
    }
    ImmutableList.Builder<Column<?>> newColumns = ImmutableList.builder();
    for (Column<?> column : columns) {
      newColumns.add(column.filter(filter));
    }
    return new TupleBatch(schema, newColumns.build(), newNumTuples, isEOI);
  }

  /**
   * Return a new TupleBatch that contains only first <code>prefix</code> rows of this batch.
   *
   * @param prefix the number of rows in the prefix to be retained. Must be between 0 and the number of tuples in
   *        this batch, inclusive.
   * @return a TupleBatch that contains only the first <code>prefix</code> rows of the current dataset.
   * @throws IllegalArgumentException if <code>prefix</code> is negative or larger than the number of tuples.
   */
  @SuppressWarnings({"rawtypes", "unchecked"})
  public final TupleBatch prefix(final int prefix) {
    /* A negative prefix would otherwise slip through and yield a batch with a negative tuple count. */
    Preconditions.checkArgument(
        prefix >= 0 && prefix <= numTuples(),
        "Error: cannot take a prefix of length %s from a batch of length %s",
        prefix,
        numTuples());
    ImmutableList.Builder<Column<?>> newColumns = ImmutableList.builder();
    for (Column<?> column : columns) {
      newColumns.add(new PrefixColumn(column, prefix));
    }
    return new TupleBatch(schema, newColumns.build(), prefix, isEOI);
  }

  @Override
  public final boolean getBoolean(final int column, final int row) {
    return columns.get(column).getBoolean(row);
  }

  @Override
  public final double getDouble(final int column, final int row) {
    return columns.get(column).getDouble(row);
  }

  @Override
  public final float getFloat(final int column, final int row) {
    return columns.get(column).getFloat(row);
  }

  @Override
  public final int getInt(final int column, final int row) {
    return columns.get(column).getInt(row);
  }

  @Override
  public final long getLong(final int column, final int row) {
    return columns.get(column).getLong(row);
  }

  @Override
  @Deprecated
  public final Object getObject(final int column, final int row) {
    return columns.get(column).getObject(row);
  }

  @Override
  public final Schema getSchema() {
    return schema;
  }

  @Override
  public final String getString(final int column, final int row) {
    return columns.get(column).getString(row);
  }

  @Override
  public final DateTime getDateTime(final int column, final int row) {
    return columns.get(column).getDateTime(row);
  }

  @Override
  public ByteBuffer getBlob(final int column, final int row) {
    return columns.get(column).getBlob(row);
  }

  @Override
  public final int numColumns() {
    return schema.numColumns();
  }

  @Override
  public final int numTuples() {
    return numTuples;
  }

  /**
   * Partition this TB using the partition function. An EOI batch is not partitioned; instead this same batch is
   * placed in every slot of the result. Otherwise the work is delegated to the partition function.
   *
   * @return an array of TBs. For an EOI batch the length of the array is the number of partitions of
   *         <code>pf</code>; for a data batch the array is whatever <code>pf</code> returns (presumably one entry
   *         per partition, with null for partitions that receive no tuples -- confirm against PartitionFunction).
   * @param pf the partition function.
   */
  public final TupleBatch[] partition(final PartitionFunction pf) {
    if (isEOI) {
      TupleBatch[] result = new TupleBatch[pf.numPartitions()];
      Arrays.fill(result, this);
      return result;
    }
    return pf.partition(this);
  }

  /**
   * Creates a new TupleBatch with only the indicated columns. Internal implementation of a (non-duplicate-eliminating)
   * PROJECT statement. The column objects are shared with this batch, not copied.
   *
   * @param remainingColumns zero-indexed array of columns to retain.
   * @return a projected TupleBatch.
   */
  public final TupleBatch selectColumns(final int[] remainingColumns) {
    Objects.requireNonNull(remainingColumns, "remainingColumns");
    final ImmutableList.Builder<Column<?>> newColumns = ImmutableList.builder();
    for (final int i : remainingColumns) {
      newColumns.add(columns.get(i));
    }
    return new TupleBatch(
        getSchema().getSubSchema(remainingColumns), newColumns.build(), numTuples, isEOI);
  }

  /**
   * Return a new TupleBatch without the flagged rows. The argument is not modified; a copy of it is inverted over
   * the range of rows in this batch and then passed to {@link #filter(BitSet)}.
   *
   * @param rows a BitSet flagging the rows to be removed.
   * @return a new TB with the specified rows removed.
   */
  public final TupleBatch filterOut(final BitSet rows) {
    Objects.requireNonNull(rows, "rows");
    BitSet inverted = (BitSet) rows.clone();
    inverted.flip(0, numTuples);
    return filter(inverted);
  }

  @Override
  public final String toString() {
    if (isEOI) {
      return "EOI";
    }
    final List<Type> columnTypes = schema.getColumnTypes();
    final StringBuilder sb = new StringBuilder();
    for (int i = 0; i < numTuples; i++) {
      sb.append("|\t");
      for (int j = 0; j < schema.numColumns(); j++) {
        sb.append(columnTypes.get(j).toString(columns.get(j), i));
        sb.append("\t|\t");
      }
      sb.append('\n');
    }
    return sb.toString();
  }

  /** @return the data columns. */
  public final ImmutableList<? extends Column<?>> getDataColumns() {
    return columns;
  }

  /** @return a TransportMessage encoding the TupleBatch. */
  public final TransportMessage toTransportMessage() {
    return IPCUtils.normalDataMessage(columns, numTuples);
  }

  /**
   * Create an EOI TupleBatch.
   *
   * @param schema schema.
   * @return EOI TB for the schema.
   */
  public static final TupleBatch eoiTupleBatch(final Schema schema) {
    return new TupleBatch(schema, true);
  }

  /** @return if the TupleBatch is an EOI. */
  public final boolean isEOI() {
    return isEOI;
  }

  /**
   * Construct a new TupleBatch that equals the current batch with the specified column appended. The number of
   * tuples in this batch must be the same as the size of the appended column.
   *
   * @param columnName the name of the column to be added.
   * @param column the column to be added.
   * @return a new TupleBatch containing the columns of this batch plus the appended column.
   * @throws IllegalArgumentException if the size of <code>column</code> differs from the number of tuples.
   */
  public TupleBatch appendColumn(final String columnName, final Column<?> column) {
    Objects.requireNonNull(column, "column");
    Preconditions.checkArgument(
        numTuples() == column.size(),
        "Cannot append column of size %s to batch of size %s",
        column.size(),
        numTuples());
    Schema newSchema = Schema.appendColumn(schema, column.getType(), columnName);
    List<Column<?>> newColumns =
        ImmutableList.<Column<?>>builder().addAll(columns).add(column).build();
    return new TupleBatch(newSchema, newColumns, numTuples, isEOI);
  }

  @Override
  public ReadableColumn asColumn(final int column) {
    return columns.get(column);
  }

  /** @return the batch size computed for this batch's schema at construction time. */
  public int getBatchSize() {
    return batchSize;
  }
}