// TupleBatch.java example
//
// Explorer
// myria-master
package edu.washington.escience.myria.storage;

import java.io.Serializable;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Objects;
import java.nio.ByteBuffer;
import org.joda.time.DateTime;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.column.PrefixColumn;
import edu.washington.escience.myria.operator.network.distribute.PartitionFunction;
import edu.washington.escience.myria.proto.TransportProto.TransportMessage;
import edu.washington.escience.myria.util.IPCUtils;
import net.jcip.annotations.ThreadSafe;

/**
 * Container class for a batch of tuples. The goal is to amortize memory management overhead and processing overhead
 * from code/data locality.
 */
@ThreadSafe
public class TupleBatch implements ReadableTable, Serializable {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /** The hard-coded number of tuples in a batch. */
  private int batchSize;
  /** Schema of tuples in this batch. */
  private final Schema schema;
  /** Tuple data stored as columns in this batch. */
  private final ImmutableList<? extends Column<?>> columns;
  /** Number of tuples in this TB. */
  private final int numTuples;
  /** Whether this TB is an EOI TB. */
  private final boolean isEOI;

  /**
   * EOI TB constructor.
   *
   * @param schema schema of the tuples in this batch.
   * @param isEoi whether this TupleBatch is an EOI TupleBatch.
   */
  private TupleBatch(final Schema schema, final boolean isEoi) {
    this.schema = schema;
    numTuples = 0;
    ImmutableList.Builder<Column<?>> b = ImmutableList.builder();
    for (Type type : schema.getColumnTypes()) {
      b.add(Column.emptyColumn(type));
    }
    columns = b.build();
    isEOI = isEoi;
    batchSize = TupleUtils.getBatchSize(schema);
  }

  /*
   * @param columnNames the new column names.
   * @return a shallow copy of the specified TupleBatch with the new column names.
   */
  public TupleBatch rename(final List<String> columnNames) {
    Schema newSchema =
        Schema.of(schema.getColumnTypes(), Objects.requireNonNull(columnNames, "columnNames"));
    return new TupleBatch(newSchema, columns, numTuples, isEOI);
  }

  /**
   * Standard immutable TupleBatch constructor. All fields must be populated before creation and cannot be changed.
   *
   * @param schema schema of the tuples in this batch. Must match columns.
   * @param columns contains the column-stored data. Must match schema.
   * @param numTuples the number of tuples in this TupleBatch.
   */
  public TupleBatch(
      final Schema schema, final List<? extends Column<?>> columns, final int numTuples) {
    this(schema, columns, numTuples, false);
  }

  /**
   * Constructor that gets the number of tuples from the columns.
   *
   * @param schema schema of the tuples in this batch. Must match columns.
   * @param columns contains the column-stored data. Must match schema.
   */
  public TupleBatch(final Schema schema, final List<? extends Column<?>> columns) {
    this(schema, columns, columns.get(0).size());
  }

  /**
   * Construct a TupleBatch from the specified components.
   *
   * @param schema schema of the tuples in this batch. Must match columns.
   * @param columns schema of the tuples in this batch. Must match columns.
   * @param numTuples the number of tuples in this batch. Must match columns.
   * @param isEOI whether this is an EOI TupleBatch.
   */
  public TupleBatch(
      final Schema schema,
      final List<? extends Column<?>> columns,
      final int numTuples,
      final boolean isEOI) {
    this.schema = Objects.requireNonNull(schema, "schema");
    this.columns = ImmutableList.copyOf(Objects.requireNonNull(columns, "columns"));
    Preconditions.checkArgument(
        columns.size() == schema.numColumns(),
        "Number of columns in data must equal the number of fields in schema");
    for (int i = 0; i < columns.size(); i++) {
      Column<?> column = columns.get(i);
      Preconditions.checkArgument(
          numTuples == column.size(),
          "Incorrect size for column %s. Expected %s tuples, but found %s tuples.",
          i,
          numTuples,
          column.size());
    }
    this.numTuples = numTuples;
    this.isEOI = isEOI;
    batchSize = TupleUtils.getBatchSize(schema);
  }

  /**
   * put the tuple batch into TBB by smashing it into cells and putting them one by one.
   *
   * @param tbb the TBB buffer.
   */
  public final void compactInto(final TupleBatchBuffer tbb) {
    if (isEOI()) {
      /* an EOI TB has no data */
      tbb.appendTB(this);
      return;
    }
    for (int i = 0; i < numTuples; i++) {
      tbb.append(this, i);
    }
  }

  /**
   * Return a new TupleBatch that contains only the filtered rows of the current dataset. Note that if some of the
   * tuples in this batch are invalid, we will have to map the indices in the specified filter to the "real" indices in
   * the tuple.
   *
   * @param filter the rows to be retained.
   * @return a TupleBatch that contains only the filtered rows of the current dataset.
   */
  public final TupleBatch filter(final BitSet filter) {
    Preconditions.checkArgument(
        filter.length() <= numTuples(),
        "Error: trying to filter a TupleBatch of length %s with a filter of length %s",
        numTuples(),
        filter.length());
    int newNumTuples = filter.cardinality();

    /* Shortcut: the filter is full, so all current tuples are retained. Just return this. */
    if (newNumTuples == numTuples) {
      return this;
    }

    ImmutableList.Builder<Column<?>> newColumns = ImmutableList.builder();
    for (Column<?> column : columns) {
      newColumns.add(column.filter(filter));
    }
    return new TupleBatch(schema, newColumns.build(), newNumTuples, isEOI);
  }

  /**
   * Return a new TupleBatch that contains only first <code>prefix</code> rows of this batch.
   *
   * @param prefix the number of rows in the prefix to be retained.
   * @return a TupleBatch that contains only the filtered rows of the current dataset.
   */
  @SuppressWarnings({"rawtypes", "unchecked"})
  public final TupleBatch prefix(final int prefix) {
    Preconditions.checkArgument(
        prefix <= numTuples(),
        "Error: cannot take a prefix of length %s from a batch of length %s",
        prefix,
        numTuples());
    ImmutableList.Builder<Column<?>> newColumns = ImmutableList.builder();
    for (Column<?> column : columns) {
      newColumns.add(new PrefixColumn(column, prefix));
    }
    return new TupleBatch(schema, newColumns.build(), prefix, isEOI);
  }

  @Override
  public final boolean getBoolean(final int column, final int row) {
    return columns.get(column).getBoolean(row);
  }

  @Override
  public final double getDouble(final int column, final int row) {
    return columns.get(column).getDouble(row);
  }

  @Override
  public final float getFloat(final int column, final int row) {
    return columns.get(column).getFloat(row);
  }

  @Override
  public final int getInt(final int column, final int row) {
    return columns.get(column).getInt(row);
  }

  @Override
  public final long getLong(final int column, final int row) {
    return columns.get(column).getLong(row);
  }

  @Override
  @Deprecated
  public final Object getObject(final int column, final int row) {
    return columns.get(column).getObject(row);
  }

  @Override
  public final Schema getSchema() {
    return schema;
  }

  @Override
  public final String getString(final int column, final int row) {
    return columns.get(column).getString(row);
  }

  @Override
  public final DateTime getDateTime(final int column, final int row) {
    return columns.get(column).getDateTime(row);
  }

  @Override
  public ByteBuffer getBlob(final int column, final int row) {
    return columns.get(column).getBlob(row);
  }

  @Override
  public final int numColumns() {
    return schema.numColumns();
  }

  @Override
  public final int numTuples() {
    return numTuples;
  }

  /**
   * Partition this TB using the partition function. The method is implemented by shallow copy of TupleBatches.
   *
   * @return an array of TBs. The length of the array is the same as the number of partitions. If no tuple presents in a
   *         partition, say the i'th partition, the i'th element in the result array is null.
   * @param pf the partition function.
   */
  public final TupleBatch[] partition(final PartitionFunction pf) {
    if (isEOI) {
      TupleBatch[] result = new TupleBatch[pf.numPartitions()];
      Arrays.fill(result, this);
      return result;
    }
    return pf.partition(this);
  }

  /**
   * Creates a new TupleBatch with only the indicated columns. Internal implementation of a (non-duplicate-eliminating)
   * PROJECT statement.
   *
   * @param remainingColumns zero-indexed array of columns to retain.
   * @return a projected TupleBatch.
   */
  public final TupleBatch selectColumns(final int[] remainingColumns) {
    Objects.requireNonNull(remainingColumns);
    final ImmutableList.Builder<Column<?>> newColumns = new ImmutableList.Builder<Column<?>>();
    for (final int i : remainingColumns) {
      newColumns.add(columns.get(i));
    }
    return new TupleBatch(
        getSchema().getSubSchema(remainingColumns), newColumns.build(), numTuples, isEOI);
  }

  /**
   * @param rows a BitSet flagging the rows to be removed.
   * @return a new TB with the specified rows removed.
   */
  public final TupleBatch filterOut(final BitSet rows) {
    BitSet inverted = (BitSet) rows.clone();
    inverted.flip(0, numTuples);
    return filter(inverted);
  }

  @Override
  public final String toString() {
    if (isEOI) {
      return "EOI";
    }
    final List<Type> columnTypes = schema.getColumnTypes();
    final StringBuilder sb = new StringBuilder();
    for (int i = 0; i < numTuples; i++) {
      sb.append("|\t");
      for (int j = 0; j < schema.numColumns(); j++) {
        sb.append(columnTypes.get(j).toString(columns.get(j), i));
        sb.append("\t|\t");
      }
      sb.append('\n');
    }
    return sb.toString();
  }

  /** @return the data columns. */
  public final ImmutableList<? extends Column<?>> getDataColumns() {
    return columns;
  }

  /** @return a TransportMessage encoding the TupleBatch. */
  public final TransportMessage toTransportMessage() {
    return IPCUtils.normalDataMessage(columns, numTuples);
  }

  /**
   * Create an EOI TupleBatch.
   *
   * @param schema schema.
   * @return EOI TB for the schema.
   */
  public static final TupleBatch eoiTupleBatch(final Schema schema) {
    return new TupleBatch(schema, true);
  }

  /** @return if the TupleBatch is an EOI. */
  public final boolean isEOI() {
    return isEOI;
  }

  /**
   * Construct a new TupleBatch that equals the current batch with the specified column appended. The number of valid
   * tuples in this batch must be the same as the size of the other batch. If this batch is not dense, then
   *
   * @param columnName the name of the column to be added.
   * @param column the column to be added.
   * @return a new TupleBatch containing the tuples of this column plus the tuples of the other.
   */
  public TupleBatch appendColumn(final String columnName, final Column<?> column) {
    Preconditions.checkArgument(
        numTuples() == column.size(),
        "Cannot append column of size %s to batch of size %s",
        column.size(),
        numTuples());
    Schema newSchema = Schema.appendColumn(schema, column.getType(), columnName);
    List<Column<?>> newColumns =
        ImmutableList.<Column<?>>builder().addAll(columns).add(column).build();
    return new TupleBatch(newSchema, newColumns, numTuples, isEOI);
  }

  @Override
  public ReadableColumn asColumn(final int column) {
    return columns.get(column);
  }

  public int getBatchSize() {
    return batchSize;
  }
}