/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.perf;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.FileFormat;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.index.IndexSegmentImpl;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import com.linkedin.pinot.core.segment.index.readers.ImmutableDictionaryReader;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.mutable.MutableLong;


/**
 * Performance test for lookup in string dictionary.
 *
 * <p>Typical usage: call {@link #buildSegment(int)} once to create a single-column segment of
 * unique random strings, then call {@link #perfTestLookups(int)} to time dictionary lookups of
 * those strings. {@link #main(String[])} wires the two together from the command line.
 */
public class StringDictionaryPerfTest {
  private static final int MAX_STRING_LENGTH = 100;
  private static final String TMP_DIR = System.getProperty("java.io.tmpdir");
  private static final String COLUMN_NAME = "test";
  private static final long NANOS_PER_MILLI = 1_000_000L;

  // The unique strings written to the segment, in insertion order; populated by buildSegment.
  String[] _inputStrings;
  // Directory of the generated segment; null until buildSegment has been called.
  private File _indexDir;
  // Number of entries in _inputStrings.
  private int _dictLength;

  /**
   * Helper method to build a segment:
   * <ul>
   *   <li> Segment contains one string column </li>
   *   <li> Row values for the column are unique, randomly generated alphanumeric strings
   *        of length 1 to {@value #MAX_STRING_LENGTH} </li>
   * </ul>
   *
   * @param dictLength Length of the dictionary (number of unique strings); must be positive
   * @throws IllegalArgumentException if {@code dictLength} is not positive
   * @throws Exception if the segment creation driver fails to initialize or build
   */
  public void buildSegment(int dictLength)
      throws Exception {
    if (dictLength <= 0) {
      throw new IllegalArgumentException("Dictionary length must be positive, got: " + dictLength);
    }

    Schema schema = new Schema();
    String segmentName = "perfTestSegment" + System.currentTimeMillis();

    _indexDir = new File(TMP_DIR + File.separator + segmentName);
    // Best-effort only: perfTestLookups is what actually removes the populated directory.
    _indexDir.deleteOnExit();

    FieldSpec fieldSpec = new DimensionFieldSpec(COLUMN_NAME, FieldSpec.DataType.STRING, true);
    schema.addField(fieldSpec);

    _dictLength = dictLength;
    _inputStrings = new String[dictLength];

    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setOutDir(_indexDir.getParent());
    config.setFormat(FileFormat.AVRO);
    config.setSegmentName(segmentName);

    Random random = new Random(System.nanoTime());
    final List<GenericRow> data = new ArrayList<>(dictLength);
    Set<String> uniqueStrings = new HashSet<>(dictLength);

    int i = 0;
    while (i < dictLength) {
      String randomString = RandomStringUtils.randomAlphanumeric(1 + random.nextInt(MAX_STRING_LENGTH));

      // Set.add returns false for duplicates; retry until a new string is generated.
      if (!uniqueStrings.add(randomString)) {
        continue;
      }

      _inputStrings[i++] = randomString;

      HashMap<String, Object> map = new HashMap<>();
      map.put(COLUMN_NAME, randomString);

      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      data.add(genericRow);
    }

    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    RecordReader reader = getGenericRowRecordReader(schema, data);
    driver.init(config, reader);
    driver.build();
  }

  /**
   * Measures the performance of string dictionary lookups by performing the provided
   * number of lookups to random indices, and prints the elapsed time in milliseconds.
   *
   * <p>The segment directory created by {@link #buildSegment(int)} is deleted when this method
   * finishes, whether it succeeds or fails; the deletion is not included in the reported time.
   *
   * @param numLookups Number of lookups to perform
   * @throws IllegalStateException if {@link #buildSegment(int)} has not been called first
   * @throws Exception if the segment cannot be loaded
   */
  public void perfTestLookups(int numLookups)
      throws Exception {
    if (_indexDir == null || _inputStrings == null) {
      throw new IllegalStateException("buildSegment must be called before perfTestLookups");
    }

    try {
      // NOTE(review): the loaded segment is never explicitly released here — confirm whether
      // IndexSegmentImpl needs an explicit teardown call for heap-loaded segments.
      IndexSegmentImpl segment = (IndexSegmentImpl) Loaders.IndexSegment.load(_indexDir, ReadMode.heap);
      ImmutableDictionaryReader dictionary = segment.getDictionaryFor(COLUMN_NAME);

      Random random = new Random(System.nanoTime());
      // nanoTime is monotonic, unlike currentTimeMillis, so it is safe for measuring elapsed time.
      long start = System.nanoTime();

      for (int i = 0; i < numLookups; i++) {
        // nextInt(bound) is already in [0, bound), which is exactly the valid index range.
        int index = random.nextInt(_dictLength);
        dictionary.indexOf(_inputStrings[index]);
      }

      // Stop the clock before any cleanup so that only the lookups are measured.
      long elapsedMillis = (System.nanoTime() - start) / NANOS_PER_MILLI;
      System.out.println("Total time for " + numLookups + " lookups: " + elapsedMillis);
    } finally {
      FileUtils.deleteQuietly(_indexDir);
    }
  }

  /**
   * Returns an implementation of GenericRow record reader that serves the given rows in list order.
   *
   * @param schema Schema for the data
   * @param data Data
   * @return GenericRow record reader
   */
  private static RecordReader getGenericRowRecordReader(final Schema schema, final List<GenericRow> data) {
    return new RecordReader() {
      // Index of the next row to hand out.
      int _counter = 0;

      @Override
      public void rewind()
          throws Exception {
        _counter = 0;
      }

      @Override
      public GenericRow next() {
        return data.get(_counter++);
      }

      @Override
      public GenericRow next(GenericRow row) {
        // The passed-in row is not reused; the pre-built row from the list is returned instead.
        return next();
      }

      @Override
      public void init()
          throws Exception {
      }

      @Override
      public boolean hasNext() {
        return _counter < data.size();
      }

      @Override
      public Schema getSchema() {
        return schema;
      }

      @Override
      public Map<String, MutableLong> getNullCountMap() {
        return null;
      }

      @Override
      public void close()
          throws Exception {
      }
    };
  }

  /**
   * Command line entry point: builds a segment with the requested dictionary length and then
   * times the requested number of lookups against it.
   *
   * @param args Exactly two arguments: {@code <dictionary_length> <num_lookups>}
   * @throws Exception if building the segment or running the lookups fails
   */
  public static void main(String[] args)
      throws Exception {
    if (args.length != 2) {
      System.err.println("Usage: StringDictionaryPerfTest <dictionary_length> <num_lookups> ");
      // Exit instead of falling through to args[0]/args[1], which would throw.
      System.exit(1);
    }

    int dictLength = Integer.parseInt(args[0]);
    int numLookups = Integer.parseInt(args[1]);

    StringDictionaryPerfTest test = new StringDictionaryPerfTest();
    test.buildSegment(dictLength);
    test.perfTestLookups(numLookups);
  }
}