/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.query.aggregation.groupby;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.request.transform.TransformExpressionTree;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.data.readers.RecordReader;
import com.linkedin.pinot.core.data.readers.TestRecordReader;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.operator.BReusableFilteredDocIdSetOperator;
import com.linkedin.pinot.core.operator.BaseOperator;
import com.linkedin.pinot.core.operator.MProjectionOperator;
import com.linkedin.pinot.core.operator.blocks.TransformBlock;
import com.linkedin.pinot.core.operator.filter.MatchEntireSegmentOperator;
import com.linkedin.pinot.core.operator.transform.TransformExpressionOperator;
import com.linkedin.pinot.core.plan.DocIdSetPlanNode;
import com.linkedin.pinot.core.query.aggregation.groupby.AggregationGroupByTrimmingService;
import com.linkedin.pinot.core.query.aggregation.groupby.GroupKeyGenerator;
import com.linkedin.pinot.core.query.aggregation.groupby.NoDictionaryMultiColumnGroupKeyGenerator;
import com.linkedin.pinot.core.query.aggregation.groupby.NoDictionarySingleColumnGroupKeyGenerator;
import com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl;
import com.linkedin.pinot.core.segment.index.loader.Loaders;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;


/**
 * Unit test for {@link NoDictionarySingleColumnGroupKeyGenerator} and
 * {@link NoDictionaryMultiColumnGroupKeyGenerator}
 */
public class NoDictionaryGroupKeyGeneratorTest {
  private static final String SEGMENT_DIR_NAME =
      System.getProperty("java.io.tmpdir") + File.separator + "rawIndexPerf";
  private static final String SEGMENT_NAME = "perfTestSegment";

  private static final String STRING_DICT_COLUMN = "string_dict_column";
  private static final String[] COLUMN_NAMES =
      {"int_column", "long_column", "float_column", "double_column", "string_column", STRING_DICT_COLUMN};
  private static final String[] NO_DICT_COLUMN_NAMES =
      {"int_column", "long_column", "float_column", "double_column", "string_column"};
  private static final FieldSpec.DataType[] DATA_TYPES =
      {FieldSpec.DataType.INT, FieldSpec.DataType.LONG, FieldSpec.DataType.FLOAT, FieldSpec.DataType.DOUBLE,
          FieldSpec.DataType.STRING, FieldSpec.DataType.STRING};
  private static final int NUM_COLUMNS = DATA_TYPES.length;
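
  // Note: COLUMN_NAMES and DATA_TYPES are position-aligned, so index i in one array describes
  // index i in the other. STRING_DICT_COLUMN is the only dictionary-encoded column; every column
  // in NO_DICT_COLUMN_NAMES is created with a raw (no-dictionary) index in buildSegment() below.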
  private static final int NUM_ROWS = 1000;

  private TestRecordReader _recordReader;
  private Map<String, BaseOperator> _dataSourceMap;
  private IndexSegment _indexSegment;

  @BeforeClass
  public void setup()
      throws Exception {
    buildSegment();

    // Load the segment.
    File segment = new File(SEGMENT_DIR_NAME, SEGMENT_NAME);
    _indexSegment = Loaders.IndexSegment.load(segment, ReadMode.heap);

    // Build the data source map.
    _dataSourceMap = new HashMap<>();
    for (String column : _indexSegment.getColumnNames()) {
      DataSource dataSource = _indexSegment.getDataSource(column);
      _dataSourceMap.put(column, dataSource);
    }
  }

  /**
   * Unit test for {@link NoDictionarySingleColumnGroupKeyGenerator}
   * @throws Exception
   */
  @Test
  public void testSingleColumnGroupKeyGenerator()
      throws Exception {
    for (int i = 0; i < COLUMN_NAMES.length; i++) {
      testGroupKeyGenerator(new String[]{COLUMN_NAMES[i]}, new FieldSpec.DataType[]{DATA_TYPES[i]});
    }
  }

  /**
   * Unit test for {@link NoDictionaryMultiColumnGroupKeyGenerator}
   * @throws Exception
   */
  @Test
  public void testMultiColumnGroupKeyGenerator()
      throws Exception {
    testGroupKeyGenerator(COLUMN_NAMES, DATA_TYPES);
  }

  /**
   * Tests the multi-column group key generator when at least one column has a dictionary and the others don't.
   */
  @Test
  public void testMultiColumnHybridGroupKeyGenerator()
      throws Exception {
    for (int i = 0; i < NO_DICT_COLUMN_NAMES.length; i++) {
      testGroupKeyGenerator(new String[]{NO_DICT_COLUMN_NAMES[i], STRING_DICT_COLUMN},
          new FieldSpec.DataType[]{DATA_TYPES[i], FieldSpec.DataType.STRING});
    }
  }
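
  // The helper below wires up the standard single-segment execution stack by hand:
  // MatchEntireSegmentOperator (selects all docs) -> BReusableFilteredDocIdSetOperator (doc id blocks)
  // -> MProjectionOperator (column projection) -> TransformExpressionOperator (a pass-through here,
  // since the expression list is empty). The group keys produced from the resulting transform blocks
  // are then checked against keys computed independently from the raw records.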
  private void testGroupKeyGenerator(String[] groupByColumns, FieldSpec.DataType[] dataTypes)
      throws Exception {
    // Build the projection operator.
    MatchEntireSegmentOperator matchEntireSegmentOperator = new MatchEntireSegmentOperator(NUM_ROWS);
    BReusableFilteredDocIdSetOperator docIdSetOperator =
        new BReusableFilteredDocIdSetOperator(matchEntireSegmentOperator, NUM_ROWS, DocIdSetPlanNode.MAX_DOC_PER_CALL);
    MProjectionOperator projectionOperator = new MProjectionOperator(_dataSourceMap, docIdSetOperator);
    TransformExpressionOperator transformOperator =
        new TransformExpressionOperator(projectionOperator, new ArrayList<TransformExpressionTree>());

    // Iterate over all transform blocks and generate group keys.
    TransformBlock transformBlock;
    int[] docIdToGroupKeys = new int[DocIdSetPlanNode.MAX_DOC_PER_CALL];
    GroupKeyGenerator groupKeyGenerator = null;
    while ((transformBlock = (TransformBlock) transformOperator.nextBlock()) != null) {
      if (groupKeyGenerator == null) {
        // Build the group key generator.
        groupKeyGenerator = (groupByColumns.length == 1)
            ? new NoDictionarySingleColumnGroupKeyGenerator(groupByColumns[0], dataTypes[0])
            : new NoDictionaryMultiColumnGroupKeyGenerator(transformBlock, groupByColumns);
      }
      groupKeyGenerator.generateKeysForBlock(transformBlock, docIdToGroupKeys);
    }

    // Assert that the total number of group keys is as expected.
    Assert.assertNotNull(groupKeyGenerator);
    Set<String> expectedGroupKeys = getExpectedGroupKeys(_recordReader, groupByColumns);
    Assert.assertEquals(groupKeyGenerator.getCurrentGroupKeyUpperBound(), expectedGroupKeys.size(),
        "Number of group keys mismatch.");

    // Assert that all group key values are as expected.
    Iterator<GroupKeyGenerator.GroupKey> uniqueGroupKeys = groupKeyGenerator.getUniqueGroupKeys();
    while (uniqueGroupKeys.hasNext()) {
      GroupKeyGenerator.GroupKey groupKey = uniqueGroupKeys.next();
      String actual = groupKey.getStringKey();
      Assert.assertTrue(expectedGroupKeys.contains(actual), "Unexpected group key: " + actual);
    }
  }

  /**
   * Helper method to build the expected group keys for a given array of group-by columns
   * by scanning the raw records directly.
   *
   * @param recordReader Record reader for the rows that were indexed into the segment.
   * @param groupByColumns Group-by columns for which to generate the group keys.
   * @return Set of unique group keys.
   * @throws Exception
   */
  private Set<String> getExpectedGroupKeys(RecordReader recordReader, String[] groupByColumns)
      throws Exception {
    Set<String> groupKeys = new HashSet<>();
    StringBuilder stringBuilder = new StringBuilder();

    recordReader.rewind();
    while (recordReader.hasNext()) {
      GenericRow row = recordReader.next();

      stringBuilder.setLength(0);
      for (int i = 0; i < groupByColumns.length; i++) {
        stringBuilder.append(row.getValue(groupByColumns[i]));
        if (i < groupByColumns.length - 1) {
          stringBuilder.append(AggregationGroupByTrimmingService.GROUP_KEY_DELIMITER);
        }
      }
      groupKeys.add(stringBuilder.toString());
    }
    return groupKeys;
  }
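
  // For reference, a multi-column group key is the string concatenation of the per-column values
  // joined by AggregationGroupByTrimmingService.GROUP_KEY_DELIMITER. For example, grouping on
  // {"int_column", "string_dict_column"} yields keys of the form "<intValue><delimiter><stringValue>",
  // which is exactly what getExpectedGroupKeys() above builds from the raw rows.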
  /**
   * Helper method to build a segment as follows:
   * <ul>
   *   <li> Five columns (int, long, float, double, string) without dictionary. </li>
   *   <li> One string column with dictionary. </li>
   * </ul>
   *
   * It also keeps a handle to the record reader used, so that expected group keys can later be
   * re-computed from the raw rows.
   *
   * @return Record reader over the rows that were indexed into the segment.
   * @throws Exception
   */
  private TestRecordReader buildSegment()
      throws Exception {
    Schema schema = new Schema();
    for (int i = 0; i < COLUMN_NAMES.length; i++) {
      DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(COLUMN_NAMES[i], DATA_TYPES[i], true);
      schema.addField(dimensionFieldSpec);
    }

    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setRawIndexCreationColumns(Arrays.asList(NO_DICT_COLUMN_NAMES));
    config.setOutDir(SEGMENT_DIR_NAME);
    config.setSegmentName(SEGMENT_NAME);

    Random random = new Random();
    List<GenericRow> rows = new ArrayList<>(NUM_ROWS);
    for (int i = 0; i < NUM_ROWS; i++) {
      Map<String, Object> map = new HashMap<>(NUM_COLUMNS);

      for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
        String column = fieldSpec.getName();
        FieldSpec.DataType dataType = fieldSpec.getDataType();

        switch (dataType) {
          case INT:
            map.put(column, random.nextInt());
            break;

          case LONG:
            map.put(column, random.nextLong());
            break;

          case FLOAT:
            map.put(column, random.nextFloat());
            break;

          case DOUBLE:
            map.put(column, random.nextDouble());
            break;

          case STRING:
            // String values are derived from the row index, so each row gets a distinct value.
            map.put(column, "value_" + i);
            break;

          default:
            throw new IllegalArgumentException("Illegal data type specified: " + dataType);
        }
      }
      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      rows.add(genericRow);
    }

    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    _recordReader = new TestRecordReader(rows, schema);
    driver.init(config, _recordReader);
    driver.build();

    return _recordReader;
  }
}