/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.common.datatable;

import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.request.AggregationInfo;
import com.linkedin.pinot.common.request.BrokerRequest;
import com.linkedin.pinot.common.request.Selection;
import com.linkedin.pinot.common.utils.DataSchema;
import com.linkedin.pinot.common.utils.DataTable;
import com.linkedin.pinot.core.query.aggregation.AggregationFunctionContext;
import com.linkedin.pinot.core.query.aggregation.function.AggregationFunction;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Nonnull;


/**
 * Datatable that holds data in a matrix form. The purpose of this class is to
 * provide a way to construct a datatable and the ability to serialize and
 * deserialize it.<br>
 * Why can't we use an existing serialization/deserialization mechanism? Most
 * existing techniques (Protocol Buffers, Thrift, Avro) are optimized for
 * transporting a single record, but Pinot transfers quite a lot of data from
 * server to broker during the scatter/gather operation, so the cost of
 * serialization and deserialization directly impacts performance. Most ser/de
 * frameworks also require us to box primitive data types into objects like
 * Integer, which wastes CPU and increases the payload size. We therefore
 * optimize the data format for the Pinot use case. This also supports lazy
 * construction of objects: in fact, we retain the bytes as-is and can look up
 * a field directly within a byte buffer.<br>
 *
 * USAGE:
 *
 * Datatable is initialized with the schema of the table. The schema describes
 * the column names, their order, and the data type of each column.<br>
 * Each row must follow the same convention. We don't support multi-value
 * columns for now. Format:
 * <pre>
 * |VERSION, DATA_START_OFFSET, DICTIONARY_START_OFFSET, INDEX_START_OFFSET, METADATA_START_OFFSET|
 * |{@code <DATA>}|
 * |{@code <DICTIONARY>}|
 * |{@code <METADATA>}|
 * </pre>
 * Data contains the actual values written by the application. We first write
 * the entire data in its raw byte format. For most data types, which are fixed
 * width, we just write the raw data; for example, if the data type is INT we
 * write 4 bytes. For special cases like String, we create a dictionary. The
 * dictionary is never exposed to the user; all conversions are done
 * internally. In the future we might decide dynamically whether dictionary
 * creation is needed, but for now we always create dictionaries for string
 * columns, and during deserialization we always load the dictionary first.
 * Overall, the dictionary lets us convert the data table into a fixed-width
 * matrix, allowing direct lookup and easy traversal.
 */
// TODO: potential optimizations:
// TODO: 1. Fix float size.
// TODO: 2. Use one dictionary for all columns (save space).
// TODO: 3. Given a data schema, write all values one by one instead of using rowId and colId to position (save time).
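// A minimal usage sketch (hypothetical two-column schema; the column names and
// values below are illustrative, not taken from production code):
//
//   DataSchema schema = new DataSchema(new String[]{"intCol", "strCol"},
//       new FieldSpec.DataType[]{FieldSpec.DataType.INT, FieldSpec.DataType.STRING});
//   DataTableBuilder builder = new DataTableBuilder(schema);
//   builder.startRow();
//   builder.setColumn(0, 42);     // fixed-width INT, written in place in the row buffer
//   builder.setColumn(1, "foo");  // STRING, dictionary-encoded as a 4-byte dictId
//   builder.finishRow();
//   DataTable dataTable = builder.build();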
public class DataTableBuilder {
  private final DataSchema _dataSchema;
  private final int[] _columnOffsets;
  private final int _rowSizeInBytes;
  private final Map<String, Map<String, Integer>> _dictionaryMap = new HashMap<>();
  private final Map<String, Map<Integer, String>> _reverseDictionaryMap = new HashMap<>();
  private final ByteArrayOutputStream _fixedSizeDataByteArrayOutputStream = new ByteArrayOutputStream();
  private final ByteArrayOutputStream _variableSizeDataByteArrayOutputStream = new ByteArrayOutputStream();
  private final DataOutputStream _variableSizeDataOutputStream =
      new DataOutputStream(_variableSizeDataByteArrayOutputStream);

  private int _numRows;
  private ByteBuffer _currentRowDataByteBuffer;

  public DataTableBuilder(@Nonnull DataSchema dataSchema) {
    _dataSchema = dataSchema;
    _columnOffsets = new int[dataSchema.size()];
    _rowSizeInBytes = DataTableUtils.computeColumnOffsets(dataSchema, _columnOffsets);
  }

  public void startRow() {
    _numRows++;
    _currentRowDataByteBuffer = ByteBuffer.allocate(_rowSizeInBytes);
  }

  public void setColumn(int colId, boolean value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    if (value) {
      _currentRowDataByteBuffer.put((byte) 1);
    } else {
      _currentRowDataByteBuffer.put((byte) 0);
    }
  }

  public void setColumn(int colId, byte value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.put(value);
  }

  public void setColumn(int colId, char value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putChar(value);
  }

  public void setColumn(int colId, short value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putShort(value);
  }

  public void setColumn(int colId, int value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(value);
  }

  public void setColumn(int colId, long value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putLong(value);
  }

  public void setColumn(int colId, float value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putFloat(value);
  }

  public void setColumn(int colId, double value) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putDouble(value);
  }

  public void setColumn(int colId, @Nonnull String value) {
    String columnName = _dataSchema.getColumnName(colId);
    Map<String, Integer> dictionary = _dictionaryMap.get(columnName);
    if (dictionary == null) {
      dictionary = new HashMap<>();
      _dictionaryMap.put(columnName, dictionary);
      _reverseDictionaryMap.put(columnName, new HashMap<Integer, String>());
    }

    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    Integer dictId = dictionary.get(value);
    if (dictId == null) {
      dictId = dictionary.size();
      dictionary.put(value, dictId);
      _reverseDictionaryMap.get(columnName).put(dictId, value);
    }
    _currentRowDataByteBuffer.putInt(dictId);
  }

  public void setColumn(int colId, @Nonnull Object value) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    byte[] bytes = ObjectCustomSerDe.serialize(value);
    _currentRowDataByteBuffer.putInt(bytes.length);
    _variableSizeDataOutputStream.writeInt(ObjectCustomSerDe.getObjectType(value).getValue());
    _variableSizeDataByteArrayOutputStream.write(bytes);
  }
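  // Layout sketch for variable-size columns (illustrative, not normative): the
  // fixed-size row stores two ints, (offset into the variable-size block, length
  // in elements), and the payload itself goes into the variable-size block. For
  // example, calling setColumn(2, new int[]{7, 9}) while the variable-size block
  // is still empty yields:
  //
  //   fixed-size row @ _columnOffsets[2]: offset = 0, length = 2
  //   variable-size block:                7, 9 (two 4-byte big-endian ints)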
  public void setColumn(int colId, @Nonnull byte[] values) {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (byte value : values) {
      _variableSizeDataByteArrayOutputStream.write(value);
    }
  }

  public void setColumn(int colId, @Nonnull char[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (char value : values) {
      _variableSizeDataOutputStream.writeChar(value);
    }
  }

  public void setColumn(int colId, @Nonnull short[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (short value : values) {
      _variableSizeDataOutputStream.writeShort(value);
    }
  }

  public void setColumn(int colId, @Nonnull int[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (int value : values) {
      _variableSizeDataOutputStream.writeInt(value);
    }
  }

  public void setColumn(int colId, @Nonnull long[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (long value : values) {
      _variableSizeDataOutputStream.writeLong(value);
    }
  }

  public void setColumn(int colId, @Nonnull float[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (float value : values) {
      _variableSizeDataOutputStream.writeFloat(value);
    }
  }

  public void setColumn(int colId, @Nonnull double[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);
    for (double value : values) {
      _variableSizeDataOutputStream.writeDouble(value);
    }
  }

  public void setColumn(int colId, @Nonnull String[] values) throws IOException {
    _currentRowDataByteBuffer.position(_columnOffsets[colId]);
    _currentRowDataByteBuffer.putInt(_variableSizeDataByteArrayOutputStream.size());
    _currentRowDataByteBuffer.putInt(values.length);

    String columnName = _dataSchema.getColumnName(colId);
    Map<String, Integer> dictionary = _dictionaryMap.get(columnName);
    if (dictionary == null) {
      dictionary = new HashMap<>();
      _dictionaryMap.put(columnName, dictionary);
      _reverseDictionaryMap.put(columnName, new HashMap<Integer, String>());
    }

    for (String value : values) {
      Integer dictId = dictionary.get(value);
      if (dictId == null) {
        dictId = dictionary.size();
        dictionary.put(value, dictId);
        _reverseDictionaryMap.get(columnName).put(dictId, value);
      }
      _variableSizeDataOutputStream.writeInt(dictId);
    }
  }

  public void finishRow() throws IOException {
    _fixedSizeDataByteArrayOutputStream.write(_currentRowDataByteBuffer.array());
  }

  public DataTable build() {
    return new DataTableImplV2(_numRows, _dataSchema, _reverseDictionaryMap,
        _fixedSizeDataByteArrayOutputStream.toByteArray(), _variableSizeDataByteArrayOutputStream.toByteArray());
  }
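  // A minimal consumption sketch (assuming the DataTable accessors
  // getInt(rowId, colId) and getString(rowId, colId); the rowId/colId values
  // are illustrative):
  //
  //   DataTable dataTable = builder.build();
  //   int intValue = dataTable.getInt(0, 0);        // reads 4 bytes from the fixed-size block
  //   String strValue = dataTable.getString(0, 1);  // reads a dictId, then resolves it via the dictionary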
  /**
   * Build an empty data table based on the broker request.
   */
  public static DataTable buildEmptyDataTable(BrokerRequest brokerRequest) throws IOException {
    // Selection query.
    if (brokerRequest.isSetSelections()) {
      Selection selection = brokerRequest.getSelections();
      List<String> selectionColumns = selection.getSelectionColumns();
      int numSelectionColumns = selectionColumns.size();
      FieldSpec.DataType[] dataTypes = new FieldSpec.DataType[numSelectionColumns];
      // Use STRING data type as default for selection query.
      Arrays.fill(dataTypes, FieldSpec.DataType.STRING);
      DataSchema dataSchema = new DataSchema(selectionColumns.toArray(new String[numSelectionColumns]), dataTypes);
      return new DataTableBuilder(dataSchema).build();
    }

    // Aggregation query.
    List<AggregationInfo> aggregationsInfo = brokerRequest.getAggregationsInfo();
    int numAggregations = aggregationsInfo.size();
    AggregationFunctionContext[] aggregationFunctionContexts = new AggregationFunctionContext[numAggregations];
    for (int i = 0; i < numAggregations; i++) {
      aggregationFunctionContexts[i] = AggregationFunctionContext.instantiate(aggregationsInfo.get(i));
    }
    if (brokerRequest.isSetGroupBy()) {
      // Aggregation group-by query.
      String[] columnNames = new String[]{"functionName", "GroupByResultMap"};
      FieldSpec.DataType[] columnTypes =
          new FieldSpec.DataType[]{FieldSpec.DataType.STRING, FieldSpec.DataType.OBJECT};

      // Build the data table.
      DataTableBuilder dataTableBuilder = new DataTableBuilder(new DataSchema(columnNames, columnTypes));
      for (int i = 0; i < numAggregations; i++) {
        dataTableBuilder.startRow();
        dataTableBuilder.setColumn(0, aggregationFunctionContexts[i].getAggregationColumnName());
        dataTableBuilder.setColumn(1, new HashMap<String, Object>());
        dataTableBuilder.finishRow();
      }
      return dataTableBuilder.build();
    } else {
      // Aggregation only query.
      String[] aggregationColumnNames = new String[numAggregations];
      FieldSpec.DataType[] dataTypes = new FieldSpec.DataType[numAggregations];
      Object[] aggregationResults = new Object[numAggregations];
      for (int i = 0; i < numAggregations; i++) {
        AggregationFunctionContext aggregationFunctionContext = aggregationFunctionContexts[i];
        aggregationColumnNames[i] = aggregationFunctionContext.getAggregationColumnName();
        AggregationFunction aggregationFunction = aggregationFunctionContext.getAggregationFunction();
        dataTypes[i] = aggregationFunction.getIntermediateResultDataType();
        aggregationResults[i] =
            aggregationFunction.extractAggregationResult(aggregationFunction.createAggregationResultHolder());
      }

      // Build the data table.
      DataTableBuilder dataTableBuilder = new DataTableBuilder(new DataSchema(aggregationColumnNames, dataTypes));
      dataTableBuilder.startRow();
      for (int i = 0; i < numAggregations; i++) {
        switch (dataTypes[i]) {
          case LONG:
            dataTableBuilder.setColumn(i, ((Number) aggregationResults[i]).longValue());
            break;
          case DOUBLE:
            dataTableBuilder.setColumn(i, ((Double) aggregationResults[i]).doubleValue());
            break;
          case OBJECT:
            dataTableBuilder.setColumn(i, aggregationResults[i]);
            break;
          default:
            throw new UnsupportedOperationException(
                "Unsupported aggregation column data type: " + dataTypes[i] + " for column: "
                    + aggregationColumnNames[i]);
        }
      }
      dataTableBuilder.finishRow();
      return dataTableBuilder.build();
    }
  }
}
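// A usage sketch for buildEmptyDataTable (assuming the Thrift-generated setters
// setSelections/setSelectionColumns on BrokerRequest/Selection; the column names
// are illustrative):
//
//   BrokerRequest brokerRequest = new BrokerRequest();
//   Selection selection = new Selection();
//   selection.setSelectionColumns(Arrays.asList("foo", "bar"));
//   brokerRequest.setSelections(selection);
//   // Produces a zero-row data table whose schema has two STRING columns.
//   DataTable emptyDataTable = DataTableBuilder.buildEmptyDataTable(brokerRequest);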