StringDictionaryPerfTest.java example

Explorer
pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.perf;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.index.IndexSegmentImpl;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.index.readers.ImmutableDictionaryReader;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.mutable.MutableLong;


/**
 * Performance test for lookup in string dictionary.
 *
 * <p>Builds a segment with a single string column of unique random values, loads it, and times
 * a number of {@code indexOf} lookups against that column's dictionary. Run it through
 * {@link #main(String[])}, passing the dictionary length and the number of lookups.
 */
public class StringDictionaryPerfTest {
  private static final int MAX_STRING_LENGTH = 100;
  private static final String TMP_DIR = System.getProperty("java.io.tmpdir");
  private static final String COLUMN_NAME = "test";

  // Unique random strings written to the segment; lookups pick their keys from this array.
  String[] _inputStrings;
  // Directory of the segment created by buildSegment(); removed again by perfTestLookups().
  private File _indexDir;
  // Number of entries in _inputStrings.
  private int _dictLength;

  /**
   * Helper method to build a segment:
   * <ul>
   *   <li> Segment contains one string column </li>
   *   <li> Row values for the column are unique, randomly generated alphanumeric strings of
   *        length 1 to 100 </li>
   * </ul>
   *
   * @param dictLength Length of the dictionary (number of unique strings to generate)
   * @throws Exception if the segment creation driver fails
   */
  public void buildSegment(int dictLength)
      throws Exception {
    Schema schema = new Schema();
    String segmentName = "perfTestSegment" + System.currentTimeMillis();
    _indexDir = new File(TMP_DIR + File.separator + segmentName);
    // Best effort only: java.io.File can delete a directory only when it is empty, so the
    // explicit cleanup in perfTestLookups() is what actually removes the segment.
    _indexDir.deleteOnExit();

    FieldSpec fieldSpec = new DimensionFieldSpec(COLUMN_NAME, FieldSpec.DataType.STRING, true);
    schema.addField(fieldSpec);

    _dictLength = dictLength;
    _inputStrings = new String[dictLength];

    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setOutDir(_indexDir.getParent());
    config.setFormat(FileFormat.AVRO);
    config.setSegmentName(segmentName);

    Random random = new Random(System.nanoTime());
    final List<GenericRow> data = new ArrayList<>();
    Set<String> uniqueStrings = new HashSet<>(dictLength);

    int i = 0;
    while (i < dictLength) {
      String randomString = RandomStringUtils.randomAlphanumeric(1 + random.nextInt(MAX_STRING_LENGTH));

      // Set.add() returns false for a duplicate; retry so that every stored string is unique.
      if (!uniqueStrings.add(randomString)) {
        continue;
      }

      _inputStrings[i++] = randomString;

      Map<String, Object> map = new HashMap<>();
      map.put(COLUMN_NAME, randomString);

      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      data.add(genericRow);
    }

    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    RecordReader reader = getGenericRowRecordReader(schema, data);
    driver.init(config, reader);
    driver.build();
  }

  /**
   * Measures the performance of string dictionary lookups by performing the provided
   * number of lookups to random indices, then prints the elapsed time in milliseconds.
   *
   * <p>Only the lookup loop is timed; loading the segment and deleting the segment directory
   * are excluded. The segment directory is deleted before this method returns, even when
   * loading or a lookup fails. {@link #buildSegment(int)} must have been called first.
   *
   * @param numLookups Number of lookups to perform
   * @throws Exception if the segment cannot be loaded
   */
  public void perfTestLookups(int numLookups)
      throws Exception {
    try {
      IndexSegmentImpl segment = (IndexSegmentImpl) Loaders.IndexSegment.load(_indexDir, ReadMode.heap);
      ImmutableDictionaryReader dictionary = segment.getDictionaryFor(COLUMN_NAME);

      Random random = new Random(System.nanoTime());
      long startNanos = System.nanoTime();

      for (int i = 0; i < numLookups; i++) {
        // nextInt(bound) is uniform over [0, bound), which is exactly the valid index range.
        int index = random.nextInt(_dictLength);
        dictionary.indexOf(_inputStrings[index]);
      }

      // Stop the clock before any cleanup so that file deletion is not counted as lookup time.
      long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
      System.out.println("Total time for " + numLookups + " lookups: " + elapsedMillis);
    } finally {
      FileUtils.deleteQuietly(_indexDir);
    }
  }

  /**
   * Returns an implementation of GenericRow record reader that serves the given rows in
   * list order.
   *
   * @param schema Schema for the data
   * @param data Data
   * @return GenericRow record reader
   */
  private static RecordReader getGenericRowRecordReader(final Schema schema, final List<GenericRow> data) {
    return new RecordReader() {
      // Position of the next row to hand out.
      int _counter = 0;

      @Override
      public void rewind()
          throws Exception {
        _counter = 0;
      }

      @Override
      public GenericRow next() {
        return data.get(_counter++);
      }

      @Override
      public GenericRow next(GenericRow row) {
        // The supplied row is not reused; the pre-built row from the list is returned instead.
        return next();
      }

      @Override
      public void init()
          throws Exception {
      }

      @Override
      public boolean hasNext() {
        return _counter < data.size();
      }

      @Override
      public Schema getSchema() {
        return schema;
      }

      @Override
      public Map<String, MutableLong> getNullCountMap() {
        return null;
      }

      @Override
      public void close()
          throws Exception {
      }
    };
  }

  /**
   * Command line entry point.
   *
   * @param args Exactly two arguments: the dictionary length and the number of lookups
   * @throws Exception if building the segment or running the lookups fails
   */
  public static void main(String[] args)
      throws Exception {
    if (args.length != 2) {
      System.out.println("Usage: StringDictionaryPerfRunner <dictionary_length> <num_lookups> ");
      // Stop here: reading args[0]/args[1] below would fail or run with unintended input.
      System.exit(1);
    }

    int dictLength = Integer.parseInt(args[0]);
    int numLookups = Integer.parseInt(args[1]);

    StringDictionaryPerfTest test = new StringDictionaryPerfTest();
    test.buildSegment(dictLength);
    test.perfTestLookups(numLookups);
  }
}