/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.perf;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.common.Block;
import com.linkedin.pinot.core.common.BlockId;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.data.readers.TestRecordReader;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.operator.BReusableFilteredDocIdSetOperator;
import com.linkedin.pinot.core.operator.BaseOperator;
import com.linkedin.pinot.core.operator.MProjectionOperator;
import com.linkedin.pinot.core.operator.blocks.BaseFilterBlock;
import com.linkedin.pinot.core.operator.blocks.ProjectionBlock;
import com.linkedin.pinot.core.operator.docvalsets.ProjectionBlockValSet;
import com.linkedin.pinot.core.operator.filter.BaseFilterOperator;
import com.linkedin.pinot.core.plan.DocIdSetPlanNode;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.operator.ArrayBasedFilterBlock;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.commons.io.FileUtils;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;


/**
 * Class to benchmark lookups on a dictionary encoded fwd index vs. a raw index (no dictionary).
 * It can take an existing segment with two columns to compare, or it can create a segment on the fly from a
 * given input file containing strings (one string per line).
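 *
 * <p>Example invocation (the file paths below are placeholders, not part of this class):
 * <pre>
 *   // Benchmark an existing untarred segment, loading it with mmap:
 *   RawIndexBenchmark.main(new String[] {"-segmentDir", "/path/to/untarred/segment", "-loadMode", "mmap"});
 *
 *   // Or build a temporary segment from a data file first:
 *   RawIndexBenchmark.main(new String[] {"-dataFile", "/path/to/strings.txt", "-numLookups", "100000"});
 * </pre>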
 */
@SuppressWarnings({"FieldCanBeLocal", "unused"})
public class RawIndexBenchmark {
  private static final String SEGMENT_DIR_NAME =
      System.getProperty("java.io.tmpdir") + File.separator + "rawIndexPerf";
  private static final String SEGMENT_NAME = "perfTestSegment";
  private static final int NUM_COLUMNS = 2;
  private static final String DEFAULT_RAW_INDEX_COLUMN = "column_0";
  private static final String DEFAULT_FWD_INDEX_COLUMN = "column_1";
  private static final int DEFAULT_NUM_LOOKUP = 100_000;
  private static final int DEFAULT_NUM_CONSECUTIVE_LOOKUP = 50;

  @Option(name = "-segmentDir", required = false, forbids = {"-dataFile"}, usage = "Untarred segment")
  private String _segmentDir = null;

  @Option(name = "-fwdIndexColumn", required = false, usage = "Name of column with dictionary encoded index")
  private String _fwdIndexColumn = DEFAULT_FWD_INDEX_COLUMN;

  @Option(name = "-rawIndexColumn", required = false, usage = "Name of column with raw index (no-dictionary)")
  private String _rawIndexColumn = DEFAULT_RAW_INDEX_COLUMN;

  @Option(name = "-dataFile", required = false, forbids = {"-segmentDir"}, usage = "File containing input data (one string per line)")
  private String _dataFile = null;

  @Option(name = "-loadMode", required = false, usage = "Load mode for data (mmap|heap)")
  private String _loadMode = "heap";

  @Option(name = "-numLookups", required = false, usage = "Number of lookups to be performed for benchmark")
  private int _numLookups = DEFAULT_NUM_LOOKUP;

  @Option(name = "-numConsecutiveLookups", required = false, usage = "Number of consecutive docIds to lookup")
  private int _numConsecutiveLookups = DEFAULT_NUM_CONSECUTIVE_LOOKUP;

  @Option(name = "-help", required = false, help = true, aliases = {"-h"}, usage = "print this message")
  private boolean _help = false;

  private int _numRows = 0;

  public void run() throws Exception {
    if (_segmentDir == null && _dataFile == null) {
      System.out.println("Error: One of 'segmentDir' or 'dataFile' must be specified");
      return;
    }

    File segmentFile = (_segmentDir == null) ? buildSegment() : new File(_segmentDir);
    IndexSegment segment = Loaders.IndexSegment.load(segmentFile, ReadMode.valueOf(_loadMode));
    compareIndexSizes(segment, segmentFile, _fwdIndexColumn, _rawIndexColumn);
    compareLookups(segment);

    // Cleanup the temporary directory, but only if we created it (i.e. the segment was built from a data file).
    if (_segmentDir == null) {
      FileUtils.deleteQuietly(new File(SEGMENT_DIR_NAME));
    }
    segment.destroy();
  }

  /**
   * Helper method that builds a segment containing two columns, both populated with data from the input file.
   * The first column has a raw index (no dictionary), whereas the second column is dictionary encoded.
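   *
   * <p>Expected input file format (the values below are purely illustrative), one string per line:
   * <pre>
   *   value_1
   *   value_2
   *   value_3
   * </pre>
   * Each line becomes one row, with the same string written to both columns so the comparison is like-for-like.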
   *
   * @return File containing the newly built segment
   * @throws Exception
   */
  private File buildSegment() throws Exception {
    Schema schema = new Schema();
    for (int i = 0; i < NUM_COLUMNS; i++) {
      String column = "column_" + i;
      DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(column, FieldSpec.DataType.STRING, true);
      schema.addField(dimensionFieldSpec);
    }

    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setRawIndexCreationColumns(Collections.singletonList(_rawIndexColumn));
    config.setOutDir(SEGMENT_DIR_NAME);
    config.setSegmentName(SEGMENT_NAME);

    BufferedReader reader = new BufferedReader(new FileReader(_dataFile));
    String value;

    final List<GenericRow> rows = new ArrayList<>();
    System.out.println("Reading data...");
    while ((value = reader.readLine()) != null) {
      HashMap<String, Object> map = new HashMap<>();

      // Store the same value in both columns, so that the two indexes are built over identical data.
      for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
        map.put(fieldSpec.getName(), value);
      }

      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      rows.add(genericRow);
      _numRows++;
      if (_numRows % 1000000 == 0) {
        System.out.println("Read rows: " + _numRows);
      }
    }
    reader.close();

    System.out.println("Generating segment...");
    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    RecordReader recordReader = new TestRecordReader(rows, schema);
    driver.init(config, recordReader);
    driver.build();

    return new File(SEGMENT_DIR_NAME, SEGMENT_NAME);
  }

  /**
   * Compares and prints the index size for the raw and dictionary encoded columns.
   *
   * @param segment Segment to compare
   * @param segmentDir Directory containing the segment files
   * @param fwdIndexColumn Name of the dictionary encoded column
   * @param rawIndexColumn Name of the raw index (no-dictionary) column
   */
  private void compareIndexSizes(IndexSegment segment, File segmentDir, String fwdIndexColumn,
      String rawIndexColumn) {
    String filePrefix = segmentDir.getAbsolutePath() + File.separator;
    File rawIndexFile = new File(filePrefix + rawIndexColumn + V1Constants.Indexes.RAW_SV_FWD_IDX_FILE_EXTENTION);

    String extension = (segment.getDataSource(fwdIndexColumn).getDataSourceMetadata().isSorted())
        ? V1Constants.Indexes.SORTED_FWD_IDX_FILE_EXTENTION
        : V1Constants.Indexes.UN_SORTED_SV_FWD_IDX_FILE_EXTENTION;
    File fwdIndexFile = new File(filePrefix + fwdIndexColumn + extension);
    File fwdIndexDictFile = new File(filePrefix + fwdIndexColumn + V1Constants.Dict.FILE_EXTENTION);

    long rawIndexSize = rawIndexFile.length();
    long fwdIndexSize = fwdIndexFile.length() + fwdIndexDictFile.length();

    System.out.println("Raw index size: " + toMegaBytes(rawIndexSize) + " MB.");
    System.out.println("Fwd index size: " + toMegaBytes(fwdIndexSize) + " MB.");
    System.out.println("Storage space saving: " + ((fwdIndexSize - rawIndexSize) * 100.0 / fwdIndexSize) + " %");
  }

  /**
   * Compares lookup times for the two columns.
   * Performs {@link #_numLookups} lookups (in runs of {@link #_numConsecutiveLookups} consecutive docIds)
   * on randomly generated docIds.
   *
   * @param segment Segment to compare the columns for
   */
  private void compareLookups(IndexSegment segment) {
    int[] filteredDocIds = generateDocIds(segment);
    long rawIndexTime = profileLookups(segment, _rawIndexColumn, filteredDocIds);
    long fwdIndexTime = profileLookups(segment, _fwdIndexColumn, filteredDocIds);

    System.out.println("Raw index lookup time: " + rawIndexTime);
    System.out.println("Fwd index lookup time: " + fwdIndexTime);
    System.out.println("Percentage change: " + ((fwdIndexTime - rawIndexTime) * 100.0 / rawIndexTime) + " %");
  }

  /**
   * Profiles the lookup time for a given column, for the given docIds.
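   *
   * <p>The lookups are driven through the projection operator chain used below: a {@link TestFilterOperator}
   * supplies the docIds, a {@link BReusableFilteredDocIdSetOperator} batches them into docId sets, and an
   * {@link MProjectionOperator} materializes the column values block by block.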
   *
   * @param segment Segment to profile
   * @param column Column to profile
   * @param docIds DocIds to lookup on the column
   * @return Time taken in millis for the lookups
   */
  private long profileLookups(IndexSegment segment, String column, int[] docIds) {
    BaseFilterOperator filterOperator = new TestFilterOperator(docIds);
    BReusableFilteredDocIdSetOperator docIdSetOperator =
        new BReusableFilteredDocIdSetOperator(filterOperator, docIds.length, DocIdSetPlanNode.MAX_DOC_PER_CALL);
    ProjectionBlock projectionBlock;
    MProjectionOperator projectionOperator = new MProjectionOperator(buildDataSourceMap(segment), docIdSetOperator);

    long start = System.currentTimeMillis();
    while ((projectionBlock = (ProjectionBlock) projectionOperator.nextBlock()) != null) {
      ProjectionBlockValSet blockValueSet = (ProjectionBlockValSet) projectionBlock.getBlockValueSet(column);
      blockValueSet.getDoubleValuesSV();
    }
    return (System.currentTimeMillis() - start);
  }

  /**
   * Convert from bytes to mega-bytes.
   *
   * @param sizeInBytes Size to convert
   * @return Size in MB
   */
  private double toMegaBytes(long sizeInBytes) {
    // Use a double divisor, otherwise integer division truncates the result.
    return sizeInBytes / (1024 * 1024.0);
  }

  /**
   * Helper method to build a map from column to data source.
   *
   * @param segment Segment for which to build the map
   * @return Column to data source map
   */
  private Map<String, BaseOperator> buildDataSourceMap(IndexSegment segment) {
    Map<String, BaseOperator> dataSourceMap = new HashMap<>();
    for (String column : segment.getColumnNames()) {
      dataSourceMap.put(column, segment.getDataSource(column));
    }
    return dataSourceMap;
  }

  /**
   * Generates random docIds to lookup.
   * <ul>
   *   <li> A total of {@link #_numLookups} docIds are generated. </li>
   *   <li> DocIds are generated in clusters, each containing {@link #_numConsecutiveLookups} consecutive ids. </li>
   * </ul>
   *
   * @param segment Segment for which to generate the docIds
   * @return Array of docIds to lookup
   */
  private int[] generateDocIds(IndexSegment segment) {
    Random random = new Random();
    int numDocs = segment.getSegmentMetadata().getTotalDocs();
    int maxDocId = numDocs - _numConsecutiveLookups - 1;

    int[] docIdSet = new int[_numLookups];
    int j = 0;
    for (int i = 0; i < (_numLookups / _numConsecutiveLookups); i++) {
      int startDocId = random.nextInt(maxDocId);
      int endDocId = startDocId + _numConsecutiveLookups;

      for (int docId = startDocId; docId < endDocId; docId++) {
        docIdSet[j++] = docId;
      }
    }

    // Fill the remainder (when _numLookups is not a multiple of _numConsecutiveLookups) with one more run.
    int docId = random.nextInt(maxDocId);
    for (; j < _numLookups; ++j) {
      docIdSet[j] = docId++;
    }
    return docIdSet;
  }

  /**
   * Helper filter operator that emits the given docIds for the lookups.
   */
  class TestFilterOperator extends BaseFilterOperator {
    private static final String OPERATOR_NAME = "TestFilterOperator";
    private int[] _filteredDocIds;

    public TestFilterOperator(int[] filteredDocIds) {
      _filteredDocIds = filteredDocIds;
    }

    @Override
    public BaseFilterBlock nextFilterBlock(BlockId blockId) {
      return new ArrayBasedFilterBlock(_filteredDocIds);
    }

    @Override
    public boolean isResultEmpty() {
      return false;
    }

    @Override
    public boolean open() {
      return true;
    }

    @Override
    public boolean close() {
      return true;
    }

    @Override
    public String getOperatorName() {
      return OPERATOR_NAME;
    }
  }

  /**
   * Main method for the class. Parses the command line arguments, and invokes the benchmark.
   *
   * @param args Command line arguments.
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    RawIndexBenchmark benchmark = new RawIndexBenchmark();
    CmdLineParser parser = new CmdLineParser(benchmark);
    parser.parseArgument(args);
    benchmark.run();
  }
}