FixedBitMultiValueReader.java example

Explorer
pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.io.reader.impl.v1;

import com.google.common.base.Preconditions;
import com.google.common.primitives.Ints;
import com.linkedin.pinot.core.io.reader.BaseSingleColumnMultiValueReader;
import com.linkedin.pinot.core.io.reader.impl.FixedBitSingleValueMultiColReader;
import com.linkedin.pinot.core.io.reader.impl.FixedByteSingleValueMultiColReader;
import com.linkedin.pinot.core.segment.memory.PinotDataBuffer;
import com.linkedin.pinot.core.util.PinotDataCustomBitSet;
import java.io.IOException;

/**
 * Storage Layout
 * ==============
 * There will be three sections HEADER section, BITMAP and RAW DATA
 * CHUNK OFFSET HEADER will contain one line per chunk, each line corresponding to the start offset
 * and length of the chunk
 * BITMAP This will contain sequence of bits. The number of bits will be equal to the
 * totalNumberOfValues.A bit is set to 1 if its start of a new docId. The number of bits set to 1
 * will be equal to the number of docs.
 * RAWDATA This simply has the actual multi valued data stored in sequence of int's. The number of
 * ints is equal to the totalNumberOfValues
 * We divide all the documents into groups refered to as CHUNK. Each CHUNK will
 * - Have the same number of documents.
 * - Started Offset of each CHUNK in the BITMAP will stored in the HEADER section. This is to speed
 * the look up.
 * Over all each look up will take log(NUM CHUNKS) for binary search + CHUNK to linear scan on the
 * bitmap to find the right offset in the raw data section
 */
public class FixedBitMultiValueReader
    extends BaseSingleColumnMultiValueReader<MultiValueReaderContext> {

  private static final int SIZE_OF_INT = 4;
  private static final int NUM_COLS_IN_HEADER = 1;
  private static final int PREFERRED_NUM_VALUES_PER_CHUNK = 2048;

  private PinotDataBuffer indexDataBuffer;
  private PinotDataBuffer chunkOffsetsBuffer;
  private PinotDataBuffer bitsetBuffer;
  private PinotDataBuffer rawDataBuffer;
  private FixedByteSingleValueMultiColReader chunkOffsetsReader;
  private PinotDataCustomBitSet customBitSet;
  private FixedBitSingleValueMultiColReader rawDataReader;
  private int numChunks;
  private int chunkOffsetHeaderSize;
  private int bitsetSize;
  private int rawDataSize;
  private int totalSize;
  private int totalNumValues;
  private int docsPerChunk;
  private final int numDocs;

  public FixedBitMultiValueReader(PinotDataBuffer indexDataBuffer, int numDocs, int totalNumValues,
      int columnSizeInBits, boolean signed) {

    this.indexDataBuffer = indexDataBuffer;
    this.numDocs = numDocs;
    this.totalNumValues = totalNumValues;
    float averageValuesPerDoc = totalNumValues / numDocs;
    this.docsPerChunk = (int) (Math.ceil(PREFERRED_NUM_VALUES_PER_CHUNK / averageValuesPerDoc));
    this.numChunks = (numDocs + docsPerChunk - 1) / docsPerChunk;
    chunkOffsetHeaderSize = numChunks * SIZE_OF_INT * NUM_COLS_IN_HEADER;
    bitsetSize = (totalNumValues + 7) / 8;
    rawDataSize = Ints.checkedCast(((long) totalNumValues * columnSizeInBits + 7) / 8);
    totalSize = chunkOffsetHeaderSize + bitsetSize + rawDataSize;
    Preconditions.checkState(totalSize > 0 && totalSize < Integer.MAX_VALUE, "Total size should not exceed 2GB");
    chunkOffsetsBuffer = indexDataBuffer.view(0, chunkOffsetHeaderSize);
    int bitsetEndPos = chunkOffsetHeaderSize + bitsetSize;
    bitsetBuffer = indexDataBuffer.view(chunkOffsetHeaderSize, bitsetEndPos);
    rawDataBuffer = indexDataBuffer.view(bitsetEndPos, bitsetEndPos + rawDataSize);
    chunkOffsetsReader = new FixedByteSingleValueMultiColReader(chunkOffsetsBuffer, numChunks, new int[] {
        SIZE_OF_INT
    });

    customBitSet = PinotDataCustomBitSet.withDataBuffer(bitsetSize, bitsetBuffer);
    rawDataReader = new FixedBitSingleValueMultiColReader(rawDataBuffer, totalNumValues,
        1, new int[] {
            columnSizeInBits
        }, new boolean[] {
            signed
        });
  }

  public int getTotalSize() {
    return totalSize;
  }

  @Override
  public void close() throws IOException {
    customBitSet.close();
    customBitSet = null;
    rawDataReader.close();
    rawDataReader = null;
    chunkOffsetsReader.close();
    indexDataBuffer.close();
  }

  private int computeLength(int rowOffSetStart) {
    long rowOffSetEnd = customBitSet.nextSetBitAfter(rowOffSetStart);
    if (rowOffSetEnd < 0) {
      return (totalNumValues - rowOffSetStart);
    }
    return (int) (rowOffSetEnd - rowOffSetStart);
  }

  private int computeStartOffset(int row) {
    int chunkId = row / docsPerChunk;
    int chunkIdOffset = chunkOffsetsReader.getInt(chunkId, 0);
    if (row % docsPerChunk == 0) {
      return chunkIdOffset;
    }
    return customBitSet.findNthBitSetAfter(chunkIdOffset, row - chunkId * docsPerChunk);
  }

  @Override
  public int getIntArray(int row, int[] intArray) {
    int startOffset = computeStartOffset(row);
    int length = computeLength(startOffset);
    for (int i = 0; i < length; i++) {
      intArray[i] = rawDataReader.getInt(startOffset + i, 0);
    }
    return length;
  }

  @Override
  public int getIntArray(int row, int[] intArray, MultiValueReaderContext readerContext) {
    int startOffset = 0;
    int endOffset = 0;
    int chunkId = row / docsPerChunk;
    int length;
    if (readerContext.chunkId == chunkId && row > readerContext.rowId ) {
      if (row == readerContext.rowId + 1) {
        startOffset = readerContext.endPosition;
      } else  {
        startOffset = customBitSet.findNthBitSetAfter(readerContext.endPosition,
            (row - readerContext.rowId - 1));
      }
    } else {
      int chunkIdOffset = chunkOffsetsReader.getInt(chunkId, 0);
      if (row % docsPerChunk == 0) {
        startOffset = chunkIdOffset;
      } else {
        startOffset =
            customBitSet.findNthBitSetAfter(chunkIdOffset, row - chunkId * docsPerChunk);
      }
    }
    endOffset = customBitSet.findNthBitSetAfter(startOffset, 1);
    if (endOffset < 0) {
      length = totalNumValues - startOffset;
    } else {
      length = endOffset - startOffset;
    }
    rawDataReader.getInt(startOffset, length, 0, intArray);
    readerContext.rowId = row;
    readerContext.chunkId = chunkId;
    readerContext.startPosition = startOffset;
    readerContext.endPosition = endOffset;
    return length;

  }

  // for testing reader context
  private void validate(int row, int[] intArray, int length) {
    int[] temp = new int[intArray.length];
    int tempLength = getIntArray(row, temp);
    if (tempLength != length) {
      System.out.println("ERROR");
    } else {
      for (int i = 0; i < length; i++) {
        if (temp[i] != intArray[i]) {
          System.out.println("ERROR expected startOffset:" + computeStartOffset(row));
        }
      }
    }
  }

  @Override
  public MultiValueReaderContext createContext() {
    MultiValueReaderContext readerContext = new MultiValueReaderContext();
    return readerContext;
  }
}