package edu.washington.escience.myria.column; import java.io.Serializable; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; import org.joda.time.DateTime; import com.google.protobuf.ByteString; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.proto.DataProto.BlobColumnMessage; import edu.washington.escience.myria.proto.DataProto.BooleanColumnMessage; import edu.washington.escience.myria.proto.DataProto.ColumnMessage; import edu.washington.escience.myria.proto.DataProto.DateTimeColumnMessage; import edu.washington.escience.myria.proto.DataProto.DoubleColumnMessage; import edu.washington.escience.myria.proto.DataProto.FloatColumnMessage; import edu.washington.escience.myria.proto.DataProto.IntColumnMessage; import edu.washington.escience.myria.proto.DataProto.LongColumnMessage; import edu.washington.escience.myria.proto.DataProto.StringColumnMessage; import edu.washington.escience.myria.proto.DataProto.BlobColumnMessage; import edu.washington.escience.myria.storage.ReadableColumn; import edu.washington.escience.myria.util.ImmutableIntArray; /** * A column of a batch of tuples. * * @param <T> type of the objects in this column. * * */ public abstract class Column<T extends Comparable<?>> implements ReadableColumn, Serializable { /** Required for Java serialization. */ private static final long serialVersionUID = 1L; @Override public boolean getBoolean(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public DateTime getDateTime(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public double getDouble(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public float getFloat(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public int getInt(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public long getLong(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public abstract T getObject(int row); @Override public String getString(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public ByteBuffer getBlob(final int row) { throw new UnsupportedOperationException(getClass().getName()); } @Override public abstract Type getType(); /** * Serializes this column as a protobuf message into the specified output stream. * * @return a ColumnMessage containing a serialized copy of this column. */ public ColumnMessage serializeToProto() { return Column.defaultProto(this); } /** * Serializes this column as a protobuf message into the specified output stream. * * @param validIndices the rows of the column to serialize. * @return a ColumnMessage containing a serialized copy of this column. */ public ColumnMessage serializeToProto(final ImmutableIntArray validIndices) { return Column.defaultProto(this, validIndices); } @Override public abstract int size(); /** * Creates a new Column containing the contents of this column including only the specified rows. * * @param filter a BitSet indicating which rows should be kept. * @return a new Column containing the contents of this column including only the specified rows. */ public Column<T> filter(final BitSet filter) { return new FilteredColumn<T>(this, filter); } /** * @param type the type of the column to be returned. * @return a new empty column of the specified type. */ public static Column<?> emptyColumn(final Type type) { switch (type) { case BOOLEAN_TYPE: return new BooleanColumn(new BitSet(0), 0); case DATETIME_TYPE: return new DateTimeColumn(new DateTime[] {}, 0); case DOUBLE_TYPE: return new DoubleColumn(new double[] {}, 0); case FLOAT_TYPE: return new FloatColumn(new float[] {}, 0); case INT_TYPE: return new IntArrayColumn(new int[] {}, 0); case LONG_TYPE: return new LongColumn(new long[] {}, 0); case STRING_TYPE: return new StringArrayColumn(new String[] {}, 0); case BLOB_TYPE: return new BlobColumn(new ByteBuffer[] {}, 0); } throw new UnsupportedOperationException("Allocating an empty column of type " + type); } /** * A default implementation to serialize any Boolean column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with a BooleanColumn member. */ protected static final ColumnMessage defaultBooleanProto(final Column<?> column) { ByteString.Output bytes = ByteString.newOutput((column.size() + 7) / 8); int bitCnt = 0; int b = 0; for (int i = 0; i < column.size(); ++i) { if (column.getBoolean(i)) { b |= (1 << bitCnt); } bitCnt++; if (bitCnt == 8) { bytes.write(b); bitCnt = 0; b = 0; } } /* Note that we do *not* build the inner class. We pass its builder instead. */ final BooleanColumnMessage.Builder inner = BooleanColumnMessage.newBuilder().setData(bytes.toByteString()); return ColumnMessage.newBuilder() .setType(ColumnMessage.Type.BOOLEAN) .setBooleanColumn(inner) .build(); } /** * A default implementation to serialize any DateTime column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with a DateColumn member. */ protected static ColumnMessage defaultDateTimeProto(final Column<?> column) { ByteBuffer dataBytes = ByteBuffer.allocate(column.size() * Long.SIZE / Byte.SIZE); for (int i = 0; i < column.size(); i++) { dataBytes.putLong(column.getDateTime(i).getMillis()); } dataBytes.flip(); final DateTimeColumnMessage.Builder inner = DateTimeColumnMessage.newBuilder().setData(ByteString.copyFrom(dataBytes)); return ColumnMessage.newBuilder() .setType(ColumnMessage.Type.DATETIME) .setDateColumn(inner) .build(); } /** * A default implementation to serialize any Bytes Type column to a proto. * * @param column the column to be serialized. * @return a ColumnMessage with a BytesType member. */ protected static ColumnMessage defaultBytesProto(final Column<?> column) { final BlobColumnMessage.Builder inner = BlobColumnMessage.newBuilder(); int bblen = 0; for (int i = 0; i < column.size(); i++) { bblen = bblen + column.getBlob(i).array().length; } ByteBuffer bb = ByteBuffer.allocate(bblen); int startP = 0, endP = 0; for (int i = 0; i < column.size(); i++) { int len = column.getBlob(i).array().length; endP = startP + len; inner.addStartIndices(startP); inner.addEndIndices(endP); bb.put(column.getBlob(i).array(), bb.position(), len); startP = endP; } bb.flip(); inner.setData(ByteString.copyFrom(bb)); return ColumnMessage.newBuilder().setType(ColumnMessage.Type.BLOB).setBlobColumn(inner).build(); } /** * A default implementation to serialize any Double column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with a DoubleColumn member. */ protected static ColumnMessage defaultDoubleProto(final Column<?> column) { ByteBuffer dataBytes = ByteBuffer.allocate(column.size() * Double.SIZE / Byte.SIZE); for (int i = 0; i < column.size(); i++) { dataBytes.putDouble(column.getDouble(i)); } dataBytes.flip(); final DoubleColumnMessage.Builder inner = DoubleColumnMessage.newBuilder().setData(ByteString.copyFrom(dataBytes)); return ColumnMessage.newBuilder() .setType(ColumnMessage.Type.DOUBLE) .setDoubleColumn(inner) .build(); } /** * A default implementation to serialize any Float column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with a FloatColumn member. */ protected static ColumnMessage defaultFloatProto(final Column<?> column) { ByteBuffer dataBytes = ByteBuffer.allocate(column.size() * Float.SIZE / Byte.SIZE); for (int i = 0; i < column.size(); i++) { dataBytes.putFloat(column.getFloat(i)); } dataBytes.flip(); final FloatColumnMessage.Builder inner = FloatColumnMessage.newBuilder().setData(ByteString.copyFrom(dataBytes)); return ColumnMessage.newBuilder() .setType(ColumnMessage.Type.FLOAT) .setFloatColumn(inner) .build(); } /** * A default implementation to serialize any Integer column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with an IntColumn member. */ protected static ColumnMessage defaultIntProto(final Column<?> column) { ByteBuffer dataBytes = ByteBuffer.allocate(column.size() * Integer.SIZE / Byte.SIZE); for (int i = 0; i < column.size(); i++) { dataBytes.putInt(column.getInt(i)); } dataBytes.flip(); final IntColumnMessage.Builder inner = IntColumnMessage.newBuilder().setData(ByteString.copyFrom(dataBytes)); return ColumnMessage.newBuilder().setType(ColumnMessage.Type.INT).setIntColumn(inner).build(); } /** * A default implementation to serialize any Long column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with a LongColumn member. */ protected static ColumnMessage defaultLongProto(final Column<?> column) { ByteBuffer dataBytes = ByteBuffer.allocate(column.size() * Long.SIZE / Byte.SIZE); for (int i = 0; i < column.size(); i++) { dataBytes.putLong(column.getLong(i)); } dataBytes.flip(); final LongColumnMessage.Builder inner = LongColumnMessage.newBuilder().setData(ByteString.copyFrom(dataBytes)); return ColumnMessage.newBuilder().setType(ColumnMessage.Type.LONG).setLongColumn(inner).build(); } /** * A default implementation to serialize any column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with an appropriate member. */ protected static ColumnMessage defaultProto(final Column<?> column) { switch (column.getType()) { case BOOLEAN_TYPE: return defaultBooleanProto(column); case DATETIME_TYPE: return defaultDateTimeProto(column); case DOUBLE_TYPE: return defaultDoubleProto(column); case FLOAT_TYPE: return defaultFloatProto(column); case INT_TYPE: return defaultIntProto(column); case LONG_TYPE: return defaultLongProto(column); case STRING_TYPE: return defaultStringProto(column); case BLOB_TYPE: return defaultBytesProto(column); } throw new UnsupportedOperationException("Serializing a column of type " + column.getType()); } /** * A default implementation to serialize any filtered column to a proto. Full copy. * * @param column the column to be serialized. * @param validIndices the valid indices in the column. * @return a ColumnMessage with an appropriate member. */ protected static ColumnMessage defaultProto( final Column<?> column, final ImmutableIntArray validIndices) { BitSet filter = new BitSet(column.size()); for (int i = 0; i < column.size(); ++i) { filter.set(validIndices.get(i)); } return defaultProto(new FilteredColumn<>(column, filter)); } /** * A default implementation to serialize any String column to a proto. Full copy. * * @param column the column to be serialized. * @return a ColumnMessage with a StringColumn member. */ protected static ColumnMessage defaultStringProto(final Column<?> column) { final StringColumnMessage.Builder inner = StringColumnMessage.newBuilder(); StringBuilder sb = new StringBuilder(); int startP = 0, endP = 0; for (int i = 0; i < column.size(); i++) { int len = column.getString(i).getBytes(StandardCharsets.UTF_8).length; endP = startP + len; inner.addStartIndices(startP); inner.addEndIndices(endP); sb.append(column.getString(i)); startP = endP; } inner.setData(ByteString.copyFromUtf8(sb.toString())); return ColumnMessage.newBuilder() .setType(ColumnMessage.Type.STRING) .setStringColumn(inner) .build(); } }