/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.io.writer.impl.v1;

import com.linkedin.pinot.core.io.compression.ChunkCompressor;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import javax.annotation.concurrent.NotThreadSafe;


/**
 * Class to write out variable length bytes into a single column.
 *
 * The layout of the file is as follows:
 * <p> Header Section: </p>
 * <ul>
 *   <li> Integer: File format version. </li>
 *   <li> Integer: Total number of chunks. </li>
 *   <li> Integer: Number of docs per chunk. </li>
 *   <li> Integer: Length of longest entry (in bytes). </li>
 *   <li> Integer array: Integer offsets for all chunks in the data. </li>
 * </ul>
 *
 * <p> Individual Chunks: </p>
 * <ul>
 *   <li> Integer offsets to start position of rows: For partial chunks, offset values are 0 for missing rows. </li>
 *   <li> Data bytes. </li>
 * </ul>
 *
 * Only sequential writes are supported.
 */
@NotThreadSafe
public class VarByteChunkSingleValueWriter extends BaseChunkSingleValueWriter {
  private static final int INT_SIZE = Integer.SIZE / Byte.SIZE;
  private static final Charset UTF_8 = Charset.forName("UTF-8");
  private static final int VERSION = 1;

  private final int _chunkHeaderSize;
  private int _chunkHeaderOffset;
  private int _chunkDataOffSet;

  /**
   * Constructor for the class.
   *
   * @param file File to write to.
   * @param compressor Compressor for compressing individual chunks of data.
   * @param totalDocs Total number of docs to write.
   * @param numDocsPerChunk Number of documents per chunk.
   * @param lengthOfLongestEntry Length of longest entry (in bytes).
   * @throws IOException If the file cannot be opened for writing.
   */
  public VarByteChunkSingleValueWriter(File file, ChunkCompressor compressor, int totalDocs, int numDocsPerChunk,
      int lengthOfLongestEntry)
      throws IOException {
    // Each chunk holds a fixed-size header of row offsets, followed by the row data.
    super(file, compressor, totalDocs, numDocsPerChunk,
        ((numDocsPerChunk * INT_SIZE) + (lengthOfLongestEntry * numDocsPerChunk)), // chunkSize
        lengthOfLongestEntry, VERSION);

    _chunkHeaderOffset = 0;
    _chunkHeaderSize = numDocsPerChunk * INT_SIZE;
    _chunkDataOffSet = _chunkHeaderSize;
  }

  @Override
  public void setString(int row, String string) {
    byte[] bytes = string.getBytes(UTF_8);
    int length = bytes.length;

    // Record the start offset of this row in the chunk header, then append the row bytes.
    _chunkBuffer.putInt(_chunkHeaderOffset, _chunkDataOffSet);
    _chunkHeaderOffset += INT_SIZE;

    _chunkBuffer.position(_chunkDataOffSet);
    _chunkBuffer.put(bytes);
    _chunkDataOffSet += length;

    // If the buffer is full, compress the chunk and write it to the file.
    if (_chunkHeaderOffset == _chunkHeaderSize) {
      writeChunk();
    }
  }

  @Override
  public void setBytes(int row, byte[] bytes) {
    // Raw byte[] values are not supported by this writer.
    throw new UnsupportedOperationException();
  }

  @Override
  public void close()
      throws IOException {
    // Write the chunk if it is non-empty.
    if (_chunkBuffer.position() > 0) {
      writeChunk();
    }

    // Write the header and close the file.
    _header.flip();
    _dataFile.write(_header, 0);
    _dataFile.close();
  }
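
  // Illustrative example (derived from the logic above, not part of the original source):
  // with numDocsPerChunk = 3, writing "ab" and then "cde" before closing produces an
  // (uncompressed) chunk buffer laid out as:
  //
  //   Offsets: [12, 14, 0]  // One int per row; the fixed header occupies 3 * INT_SIZE = 12
  //                         // bytes, so the first row starts at byte 12. The trailing 0 is
  //                         // filled in by writeChunk() for the missing third row.
  //   Data:    "abcde"      // Row bytes, appended starting at byte 12.
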
  /**
   * Helper method to compress and write the current chunk.
   * <ul>
   *   <li> The chunk header is of fixed size, so any remaining offsets for partially filled chunks are zeroed out. </li>
   *   <li> Compresses and writes the chunk to the data file. </li>
   *   <li> Updates the header with the current chunk's offset. </li>
   *   <li> Clears the buffers so that they can be reused. </li>
   * </ul>
   */
  protected void writeChunk() {
    // For partially filled chunks, we still need to clear the offsets for the remaining rows, as we reuse this buffer.
    for (int i = _chunkHeaderOffset; i < _chunkHeaderSize; i += INT_SIZE) {
      _chunkBuffer.putInt(i, 0);
    }

    super.writeChunk();

    // Reset the chunk offsets.
    _chunkHeaderOffset = 0;
    _chunkDataOffSet = _chunkHeaderSize;
  }
}
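
// Usage sketch (illustrative only, not part of the original source): drives the writer
// sequentially over a few rows. How a ChunkCompressor instance is obtained is
// version-specific; the ChunkCompressorFactory call below is an assumption and may
// differ in the actual codebase.
//
//   File outFile = new File("stringColumn.fwd");
//   String[] values = {"foo", "bar", "baz"};
//   ChunkCompressor compressor = ChunkCompressorFactory.getCompressor("snappy"); // hypothetical factory call
//   VarByteChunkSingleValueWriter writer =
//       new VarByteChunkSingleValueWriter(outFile, compressor, values.length,
//           1000 /* numDocsPerChunk */, 3 /* lengthOfLongestEntry, in bytes */);
//   for (int i = 0; i < values.length; i++) {
//     writer.setString(i, values[i]); // Writes must be sequential by row.
//   }
//   writer.close(); // Flushes the final (partial) chunk and writes the file header.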