/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.perf;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.common.Block;
import com.linkedin.pinot.core.common.BlockId;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.data.readers.TestRecordReader;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.operator.BReusableFilteredDocIdSetOperator;
import com.linkedin.pinot.core.operator.BaseOperator;
import com.linkedin.pinot.core.operator.MProjectionOperator;
import com.linkedin.pinot.core.operator.blocks.BaseFilterBlock;
import com.linkedin.pinot.core.operator.blocks.ProjectionBlock;
import com.linkedin.pinot.core.operator.docvalsets.ProjectionBlockValSet;
import com.linkedin.pinot.core.operator.filter.BaseFilterOperator;
import com.linkedin.pinot.core.plan.DocIdSetPlanNode;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.operator.ArrayBasedFilterBlock;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.commons.io.FileUtils;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;


/**
 * Class to benchmark lookups on a dictionary encoded fwd index vs. a raw index (no dictionary).
 * It can take an existing segment with two columns to compare, or it can create a segment on the fly from a
 * given input file containing strings (one string per line).
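 *
 * <p>Example invocation (the file paths below are placeholders, not part of this class):
 * <pre>
 *   // Benchmark an existing untarred segment, loading it with mmap:
 *   RawIndexBenchmark.main(new String[] {"-segmentDir", "/path/to/untarred/segment", "-loadMode", "mmap"});
 *
 *   // Or build a temporary segment from a data file first:
 *   RawIndexBenchmark.main(new String[] {"-dataFile", "/path/to/strings.txt", "-numLookups", "100000"});
 * </pre>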
 */
@SuppressWarnings({"FieldCanBeLocal", "unused"})
public class RawIndexBenchmark {
  private static final String SEGMENT_DIR_NAME =
      System.getProperty("java.io.tmpdir") + File.separator + "rawIndexPerf";
  private static final String SEGMENT_NAME = "perfTestSegment";
  private static final int NUM_COLUMNS = 2;
  private static final String DEFAULT_RAW_INDEX_COLUMN = "column_0";
  private static final String DEFAULT_FWD_INDEX_COLUMN = "column_1";
  private static final int DEFAULT_NUM_LOOKUP = 100_000;
  private static final int DEFAULT_NUM_CONSECUTIVE_LOOKUP = 50;

  @Option(name = "-segmentDir", required = false, forbids = {"-dataFile"}, usage = "Untarred segment")
  private String _segmentDir = null;

  @Option(name = "-fwdIndexColumn", required = false, usage = "Name of column with dictionary encoded index")
  private String _fwdIndexColumn = DEFAULT_FWD_INDEX_COLUMN;

  @Option(name = "-rawIndexColumn", required = false, usage = "Name of column with raw index (no-dictionary)")
  private String _rawIndexColumn = DEFAULT_RAW_INDEX_COLUMN;

  @Option(name = "-dataFile", required = false, forbids = {"-segmentDir"}, usage = "File containing input data (one string per line)")
  private String _dataFile = null;

  @Option(name = "-loadMode", required = false, usage = "Load mode for data (mmap|heap)")
  private String _loadMode = "heap";

  @Option(name = "-numLookups", required = false, usage = "Number of lookups to be performed for benchmark")
  private int _numLookups = DEFAULT_NUM_LOOKUP;

  @Option(name = "-numConsecutiveLookups", required = false, usage = "Number of consecutive docIds to lookup")
  private int _numConsecutiveLookups = DEFAULT_NUM_CONSECUTIVE_LOOKUP;

  @Option(name = "-help", required = false, help = true, aliases = {"-h"}, usage = "print this message")
  private boolean _help = false;

  private int _numRows = 0;

  public void run() throws Exception {
    if (_segmentDir == null && _dataFile == null) {
      System.out.println("Error: One of 'segmentDir' or 'dataFile' must be specified");
      return;
    }

    File segmentFile = (_segmentDir == null) ? buildSegment() : new File(_segmentDir);
    IndexSegment segment = Loaders.IndexSegment.load(segmentFile, ReadMode.valueOf(_loadMode));
    compareIndexSizes(segment, segmentFile, _fwdIndexColumn, _rawIndexColumn);
    compareLookups(segment);

    // Cleanup the temporary directory, but only if we created it (i.e. the segment was built from a data file).
    if (_segmentDir == null) {
      FileUtils.deleteQuietly(new File(SEGMENT_DIR_NAME));
    }
    segment.destroy();
  }

  /**
   * Helper method that builds a segment containing two columns, both populated with data from the input file.
   * The first column has a raw index (no dictionary), whereas the second column is dictionary encoded.
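   *
   * <p>Expected input file format (the values below are purely illustrative), one string per line:
   * <pre>
   *   value_1
   *   value_2
   *   value_3
   * </pre>
   * Each line becomes one row, with the same string written to both columns so the comparison is like-for-like.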
   *
   * @return File containing the newly built segment
   * @throws Exception
   */
  private File buildSegment() throws Exception {
    Schema schema = new Schema();
    for (int i = 0; i < NUM_COLUMNS; i++) {
      String column = "column_" + i;
      DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(column, FieldSpec.DataType.STRING, true);
      schema.addField(dimensionFieldSpec);
    }

    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setRawIndexCreationColumns(Collections.singletonList(_rawIndexColumn));
    config.setOutDir(SEGMENT_DIR_NAME);
    config.setSegmentName(SEGMENT_NAME);

    BufferedReader reader = new BufferedReader(new FileReader(_dataFile));
    String value;

    final List<GenericRow> rows = new ArrayList<>();
    System.out.println("Reading data...");
    while ((value = reader.readLine()) != null) {
      HashMap<String, Object> map = new HashMap<>();

      // Store the same value in both columns, so that the two indexes are built over identical data.
      for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
        map.put(fieldSpec.getName(), value);
      }

      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      rows.add(genericRow);
      _numRows++;
      if (_numRows % 1000000 == 0) {
        System.out.println("Read rows: " + _numRows);
      }
    }
    reader.close();

    System.out.println("Generating segment...");
    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    RecordReader recordReader = new TestRecordReader(rows, schema);
    driver.init(config, recordReader);
    driver.build();

    return new File(SEGMENT_DIR_NAME, SEGMENT_NAME);
  }

  /**
   * Compares and prints the index size for the raw and dictionary encoded columns.
   *
   * @param segment Segment to compare
   * @param segmentDir Directory containing the segment files
   * @param fwdIndexColumn Name of the dictionary encoded column
   * @param rawIndexColumn Name of the raw index (no-dictionary) column
   */
  private void compareIndexSizes(IndexSegment segment, File segmentDir, String fwdIndexColumn,
      String rawIndexColumn) {
    String filePrefix = segmentDir.getAbsolutePath() + File.separator;
    File rawIndexFile = new File(filePrefix + rawIndexColumn + V1Constants.Indexes.RAW_SV_FWD_IDX_FILE_EXTENTION);

    String extension = (segment.getDataSource(fwdIndexColumn).getDataSourceMetadata().isSorted())
        ? V1Constants.Indexes.SORTED_FWD_IDX_FILE_EXTENTION
        : V1Constants.Indexes.UN_SORTED_SV_FWD_IDX_FILE_EXTENTION;
    File fwdIndexFile = new File(filePrefix + fwdIndexColumn + extension);
    File fwdIndexDictFile = new File(filePrefix + fwdIndexColumn + V1Constants.Dict.FILE_EXTENTION);

    long rawIndexSize = rawIndexFile.length();
    long fwdIndexSize = fwdIndexFile.length() + fwdIndexDictFile.length();

    System.out.println("Raw index size: " + toMegaBytes(rawIndexSize) + " MB.");
    System.out.println("Fwd index size: " + toMegaBytes(fwdIndexSize) + " MB.");
    System.out.println("Storage space saving: " + ((fwdIndexSize - rawIndexSize) * 100.0 / fwdIndexSize) + " %");
  }

  /**
   * Compares lookup times for the two columns.
   * Performs {@link #_numLookups} lookups (in runs of {@link #_numConsecutiveLookups} consecutive docIds)
   * on randomly generated docIds.
   *
   * @param segment Segment to compare the columns for
   */
  private void compareLookups(IndexSegment segment) {
    int[] filteredDocIds = generateDocIds(segment);
    long rawIndexTime = profileLookups(segment, _rawIndexColumn, filteredDocIds);
    long fwdIndexTime = profileLookups(segment, _fwdIndexColumn, filteredDocIds);

    System.out.println("Raw index lookup time: " + rawIndexTime);
    System.out.println("Fwd index lookup time: " + fwdIndexTime);
    System.out.println("Percentage change: " + ((fwdIndexTime - rawIndexTime) * 100.0 / rawIndexTime) + " %");
  }

  /**
   * Profiles the lookup time for a given column, for the given docIds.
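   *
   * <p>The lookups are driven through the projection operator chain used below: a {@link TestFilterOperator}
   * supplies the docIds, a {@link BReusableFilteredDocIdSetOperator} batches them into docId sets, and an
   * {@link MProjectionOperator} materializes the column values block by block.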
   *
   * @param segment Segment to profile
   * @param column Column to profile
   * @param docIds DocIds to lookup on the column
   * @return Time taken in millis for the lookups
   */
  private long profileLookups(IndexSegment segment, String column, int[] docIds) {
    BaseFilterOperator filterOperator = new TestFilterOperator(docIds);
    BReusableFilteredDocIdSetOperator docIdSetOperator =
        new BReusableFilteredDocIdSetOperator(filterOperator, docIds.length, DocIdSetPlanNode.MAX_DOC_PER_CALL);
    ProjectionBlock projectionBlock;
    MProjectionOperator projectionOperator = new MProjectionOperator(buildDataSourceMap(segment), docIdSetOperator);

    long start = System.currentTimeMillis();
    while ((projectionBlock = (ProjectionBlock) projectionOperator.nextBlock()) != null) {
      ProjectionBlockValSet blockValueSet = (ProjectionBlockValSet) projectionBlock.getBlockValueSet(column);
      blockValueSet.getDoubleValuesSV();
    }
    return (System.currentTimeMillis() - start);
  }

  /**
   * Convert from bytes to mega-bytes.
   *
   * @param sizeInBytes Size to convert
   * @return Size in MB
   */
  private double toMegaBytes(long sizeInBytes) {
    // Use a double divisor, otherwise integer division truncates the result.
    return sizeInBytes / (1024 * 1024.0);
  }

  /**
   * Helper method to build a map from column to data source.
   *
   * @param segment Segment for which to build the map
   * @return Column to data source map
   */
  private Map<String, BaseOperator> buildDataSourceMap(IndexSegment segment) {
    Map<String, BaseOperator> dataSourceMap = new HashMap<>();
    for (String column : segment.getColumnNames()) {
      dataSourceMap.put(column, segment.getDataSource(column));
    }
    return dataSourceMap;
  }

  /**
   * Generates random docIds to lookup.
   * <ul>
   *   <li> A total of {@link #_numLookups} docIds are generated. </li>
   *   <li> DocIds are generated in clusters, each containing {@link #_numConsecutiveLookups} consecutive ids. </li>
   * </ul>
   *
   * @param segment Segment for which to generate the docIds
   * @return Array of docIds to lookup
   */
  private int[] generateDocIds(IndexSegment segment) {
    Random random = new Random();
    int numDocs = segment.getSegmentMetadata().getTotalDocs();
    int maxDocId = numDocs - _numConsecutiveLookups - 1;

    int[] docIdSet = new int[_numLookups];
    int j = 0;
    for (int i = 0; i < (_numLookups / _numConsecutiveLookups); i++) {
      int startDocId = random.nextInt(maxDocId);
      int endDocId = startDocId + _numConsecutiveLookups;

      for (int docId = startDocId; docId < endDocId; docId++) {
        docIdSet[j++] = docId;
      }
    }

    // Fill the remainder (when _numLookups is not a multiple of _numConsecutiveLookups) with one more run.
    int docId = random.nextInt(maxDocId);
    for (; j < _numLookups; ++j) {
      docIdSet[j] = docId++;
    }
    return docIdSet;
  }

  /**
   * Helper filter operator that emits the given docIds for the lookups.
   */
  class TestFilterOperator extends BaseFilterOperator {
    private static final String OPERATOR_NAME = "TestFilterOperator";
    private int[] _filteredDocIds;

    public TestFilterOperator(int[] filteredDocIds) {
      _filteredDocIds = filteredDocIds;
    }

    @Override
    public BaseFilterBlock nextFilterBlock(BlockId blockId) {
      return new ArrayBasedFilterBlock(_filteredDocIds);
    }

    @Override
    public boolean isResultEmpty() {
      return false;
    }

    @Override
    public boolean open() {
      return true;
    }

    @Override
    public boolean close() {
      return true;
    }

    @Override
    public String getOperatorName() {
      return OPERATOR_NAME;
    }
  }

  /**
   * Main method for the class. Parses the command line arguments, and invokes the benchmark.
   *
   * @param args Command line arguments.
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    RawIndexBenchmark benchmark = new RawIndexBenchmark();
    CmdLineParser parser = new CmdLineParser(benchmark);
    parser.parseArgument(args);
    benchmark.run();
  }
}