/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.io.readerwriter.impl; import java.nio.ByteOrder; import java.util.ArrayList; import java.util.List; import com.linkedin.pinot.core.io.reader.impl.FixedByteSingleValueMultiColReader; import com.linkedin.pinot.core.io.readerwriter.BaseSingleColumnMultiValueReaderWriter; import com.linkedin.pinot.core.io.writer.impl.FixedByteSingleValueMultiColWriter; import com.linkedin.pinot.core.segment.memory.PinotDataBuffer; /** * This class provides expandable off-heap implementation to store a multi-valued column across a number of rows. * The maximum number of values in any row must be known while invoking the constructor. Other than that, this class * allocates additional memory as needed to accommodate any number of rows. * * Writes into the data structure are strictly sequential, but reads can be random. * * Writes are of type: * * setIntArray(int rowNumber, int[] values) * * It is expected that rowNumber starts with 0 and increments by 1 on each invocation, and that it is decided ahead of * time that the class is used to store a certain type of data structure (int arrays, or char arrays, etc.) Mix & match * is not allowed. * * Two kinds of data structures are used in this class. * * 1. A header (essentially an index into the other data structure) that has one entry per row. The entry has 3 integers * - data buffer ID * - offset in the data buffer where column values start * - length (number of values in the multi-valued column). * * New header structures are added as new rows come in. Each header class holds the same number of rows (for easy lookup) * * 2. A data buffer that has the values for the column that the header points to. Data buffers are added as needed, * whenever we reach a limitation that we cannot fit the values of a column in the current buffer. * * Note that data buffers and headers grow independently. * * Data format * <code> * HEADER SECTION 0 * bufferId startIndex length * bufferId startIndex length * bufferId startIndex length * ... * HEADER SECTION 1 * bufferId startIndex length * bufferId startIndex length * bufferId startIndex length * ... * Data BUFFER SECTION 0 * [set of values of row 0] [set of values of row 1] * ..... * [set of values of row m] * Data BUFFER SECTION 1 * [set of values of row m +1 ] [set of values of row M +2] * ..... * [set of values of row ] * Data BUFFER SECTION N * [set of values of row ... ] [set of values of row ...] * ..... * [set of values of row n] * </code> * */ public class FixedByteSingleColumnMultiValueReaderWriter extends BaseSingleColumnMultiValueReaderWriter { /** * number of columns is 1, column size is variable but less than maxNumberOfMultiValuesPerRow * @param rows */ private static final int SIZE_OF_INT = 4; private static final int NUM_COLS_IN_HEADER = 3; private static final int INCREMENT_PERCENTAGE = 100;//Increments the Initial size by 100% of initial capacity every time we runs out of capacity private PinotDataBuffer headerBuffer; private List<PinotDataBuffer> dataBuffers = new ArrayList<>(); private List<PinotDataBuffer> headerBuffers = new ArrayList<>(); private List<FixedByteSingleValueMultiColReader> headerReaders = new ArrayList<>(); private List<FixedByteSingleValueMultiColWriter> headerWriters = new ArrayList<>(); private FixedByteSingleValueMultiColWriter curHeaderWriter; private List<FixedByteSingleValueMultiColWriter> dataWriters = new ArrayList<FixedByteSingleValueMultiColWriter>(); private List<FixedByteSingleValueMultiColReader> dataReaders = new ArrayList<FixedByteSingleValueMultiColReader>(); private FixedByteSingleValueMultiColWriter currentDataWriter; private int currentDataWriterIndex = -1; private int currentCapacity = 0; private int headerSize; private int incrementalCapacity; private int columnSizeInBytes; private int maxNumberOfMultiValuesPerRow; private final int rowCountPerChunk; private int prevRowStartIndex = 0; // Offset in the databuffer for the last row added. private int prevRowLength = 0; // Number of values in the column for the last row added. public FixedByteSingleColumnMultiValueReaderWriter(int maxNumberOfMultiValuesPerRow, int avgMultiValueCount, int rowCountPerChunk, int columnSizeInBytes) { int initialCapacity = Math.max(maxNumberOfMultiValuesPerRow, rowCountPerChunk * avgMultiValueCount); int incrementalCapacity = Math.max(maxNumberOfMultiValuesPerRow, (int) (initialCapacity * 1.0f * INCREMENT_PERCENTAGE / 100)); this.columnSizeInBytes = columnSizeInBytes; this.maxNumberOfMultiValuesPerRow = maxNumberOfMultiValuesPerRow; headerSize = rowCountPerChunk * SIZE_OF_INT * NUM_COLS_IN_HEADER; this.rowCountPerChunk = rowCountPerChunk; addHeaderBuffers(); //at least create space for million entries, which for INT translates into 4mb buffer this.incrementalCapacity = incrementalCapacity; addDataBuffers(initialCapacity); //init(rowCountPerChunk, columnSizeInBytes, maxNumberOfMultiValuesPerRow, initialCapacity, incrementalCapacity); } private void addHeaderBuffers() { headerBuffer = PinotDataBuffer.allocateDirect(headerSize); // We know that these bufffers will not be copied directly into a file (or mapped from a file). // So, we can use native byte order here. headerBuffer.order(ByteOrder.nativeOrder()); //dataBufferId, startIndex, length curHeaderWriter = new FixedByteSingleValueMultiColWriter(headerBuffer, rowCountPerChunk, 3, new int[] { SIZE_OF_INT, SIZE_OF_INT, SIZE_OF_INT }); FixedByteSingleValueMultiColReader curHeaderReader = new FixedByteSingleValueMultiColReader(headerBuffer, rowCountPerChunk, new int[] { SIZE_OF_INT, SIZE_OF_INT, SIZE_OF_INT }); headerBuffers.add(headerBuffer); headerWriters.add(curHeaderWriter); headerReaders.add(curHeaderReader); } /** * This method automatically computes the space needed based on the columnSizeInBytes * @param rowCapacity Additional capacity to be added in terms of number of rows * @throws RuntimeException */ private void addDataBuffers(int rowCapacity) throws RuntimeException { PinotDataBuffer dataBuffer; try { dataBuffer = PinotDataBuffer.allocateDirect(rowCapacity * columnSizeInBytes); dataBuffer.order(ByteOrder.nativeOrder()); dataBuffers.add(dataBuffer); currentDataWriter = new FixedByteSingleValueMultiColWriter(dataBuffer, rowCapacity, 1, new int[] { columnSizeInBytes }); dataWriters.add(currentDataWriter); FixedByteSingleValueMultiColReader dataFileReader = new FixedByteSingleValueMultiColReader(dataBuffer, rowCapacity, new int[] { columnSizeInBytes }); dataReaders.add(dataFileReader); //update the capacity currentCapacity = rowCapacity; currentDataWriterIndex = currentDataWriterIndex + 1; } catch (Exception e) { throw new RuntimeException("Error while expanding the capacity by allocating additional buffer with capacity:" + rowCapacity, e); } } @Override public void close() { for (PinotDataBuffer dataBuffer : dataBuffers) { dataBuffer.close(); } dataBuffers.clear(); for (PinotDataBuffer headerBuffer : headerBuffers) { headerBuffer.close(); } headerBuffers.clear(); headerBuffer = null; for (FixedByteSingleValueMultiColReader reader : headerReaders) { reader.close(); } for (FixedByteSingleValueMultiColReader reader : dataReaders) { reader.close(); } for (FixedByteSingleValueMultiColWriter writer : headerWriters) { writer.close(); } for (FixedByteSingleValueMultiColWriter writer : dataWriters) { writer.close(); } } private void writeIntoHeader(int row, int dataWriterIndex, int startIndex, int length) { if (row >= headerBuffers.size() * rowCountPerChunk) { addHeaderBuffers(); } curHeaderWriter.setInt(getRowInCurrentHeader(row), 0, dataWriterIndex); curHeaderWriter.setInt(getRowInCurrentHeader(row), 1, startIndex); curHeaderWriter.setInt(getRowInCurrentHeader(row), 2, length); } // TODO Use powers of two for rowCountPerChunk to optimize computation for the // methods below. Or, assert that the input values to the class are powers of two. TBD. private final FixedByteSingleValueMultiColReader getCurrentReader(int row) { return headerReaders.get(row / rowCountPerChunk); } private final int getRowInCurrentHeader(int row) { return row % rowCountPerChunk; } private int updateHeader(int row, int numValues) { assert (numValues <= maxNumberOfMultiValuesPerRow); int newStartIndex = prevRowStartIndex + prevRowLength; if (newStartIndex + numValues > currentCapacity) { addDataBuffers(incrementalCapacity); prevRowStartIndex = 0; prevRowLength = 0; newStartIndex = 0; } writeIntoHeader(row, currentDataWriterIndex, newStartIndex, numValues); prevRowStartIndex = newStartIndex; prevRowLength = numValues; return newStartIndex; } @Override public void setCharArray(int row, char[] charArray) { int newStartIndex = updateHeader(row, charArray.length); for (int i = 0; i < charArray.length; i++) { currentDataWriter.setChar(newStartIndex + i, 0, charArray[i]); } } @Override public void setShortArray(int row, short[] shortsArray) { int newStartIndex = updateHeader(row, shortsArray.length); for (int i = 0; i < shortsArray.length; i++) { currentDataWriter.setShort(newStartIndex + i, 0, shortsArray[i]); } } @Override public void setIntArray(int row, int[] intArray) { int newStartIndex = updateHeader(row, intArray.length); for (int i = 0; i < intArray.length; i++) { currentDataWriter.setInt(newStartIndex + i, 0, intArray[i]); } } @Override public void setLongArray(int row, long[] longArray) { int newStartIndex = updateHeader(row, longArray.length); for (int i = 0; i < longArray.length; i++) { currentDataWriter.setLong(newStartIndex + i, 0, longArray[i]); } } @Override public void setFloatArray(int row, float[] floatArray) { int newStartIndex = updateHeader(row, floatArray.length); for (int i = 0; i < floatArray.length; i++) { currentDataWriter.setFloat(newStartIndex + i, 0, floatArray[i]); } } @Override public void setDoubleArray(int row, double[] doubleArray) { int newStartIndex = updateHeader(row, doubleArray.length); for (int i = 0; i < doubleArray.length; i++) { currentDataWriter.setDouble(newStartIndex + i, 0, doubleArray[i]); } } @Override public void setStringArray(int row, String[] stringArray) { int newStartIndex = updateHeader(row, stringArray.length); for (int i = 0; i < stringArray.length; i++) { currentDataWriter.setString(newStartIndex + i, 0, stringArray[i]); } } @Override public void setBytesArray(int row, byte[][] bytesArray) { int newStartIndex = updateHeader(row, bytesArray.length); for (int i = 0; i < bytesArray.length; i++) { currentDataWriter.setBytes(newStartIndex + i, 0, bytesArray[i]); } } @Override public int getCharArray(int row, char[] charArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { charArray[i] = dataReader.getChar(startIndex + i, 0); } return length; } @Override public int getShortArray(int row, short[] shortsArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { shortsArray[i] = dataReader.getShort(startIndex + i, 0); } return length; } @Override public int getIntArray(int row, int[] intArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { intArray[i] = dataReader.getInt(startIndex + i, 0); } return length; } @Override public int getLongArray(int row, long[] longArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { longArray[i] = dataReader.getLong(startIndex + i, 0); } return length; } @Override public int getFloatArray(int row, float[] floatArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { floatArray[i] = dataReader.getFloat(startIndex + i, 0); } return length; } @Override public int getDoubleArray(int row, double[] doubleArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { doubleArray[i] = dataReader.getDouble(startIndex + i, 0); } return length; } @Override public int getStringArray(int row, String[] stringArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { stringArray[i] = dataReader.getString(startIndex + i, 0); } return length; } @Override public int getBytesArray(int row, byte[][] bytesArray) { FixedByteSingleValueMultiColReader headerReader = getCurrentReader(row); int rowInCurrentHeader = getRowInCurrentHeader(row); int bufferIndex = headerReader.getInt(rowInCurrentHeader, 0); int startIndex = headerReader.getInt(rowInCurrentHeader, 1); int length = headerReader.getInt(rowInCurrentHeader, 2); FixedByteSingleValueMultiColReader dataReader = dataReaders.get(bufferIndex); for (int i = 0; i < length; i++) { bytesArray[i] = dataReader.getBytes(startIndex + i, 0); } return length; } }