/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.query.aggregation.groupby;

import com.google.common.base.Preconditions;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.request.GroupBy;
import com.linkedin.pinot.core.common.BlockMetadata;
import com.linkedin.pinot.core.common.BlockValSet;
import com.linkedin.pinot.core.operator.blocks.TransformBlock;
import com.linkedin.pinot.core.plan.DocIdSetPlanNode;
import com.linkedin.pinot.core.query.aggregation.AggregationFunctionContext;
import com.linkedin.pinot.core.query.aggregation.function.AggregationFunction;
import com.linkedin.pinot.core.query.aggregation.function.AggregationFunctionFactory;
import java.util.List;
import javax.annotation.Nonnull;


/**
 * This class implements group by aggregation.
 * It is optimized for performance, and uses the best possible algorithm/data-structure
 * for a given query based on the following parameters:
 * - Maximum number of group keys possible.
 * - Single/Multi valued columns.
 *
 * Lifecycle: {@link #init()} -> one or more {@link #process(TransformBlock)} calls ->
 * {@link #finish()} -> {@link #getResult()}. Most initialization is lazy (inside
 * {@code process}) because a transform block is required to build the group-key generator.
 *
 * NOTE(review): instances hold on to thread-local buffers sized to
 * {@link DocIdSetPlanNode#MAX_DOC_PER_CALL}, so a single instance must not be shared
 * across threads — presumably one executor per query segment; verify against callers.
 */
public class DefaultGroupByExecutor implements GroupByExecutor {
  public static final int MAX_INITIAL_RESULT_HOLDER_CAPACITY = 10_000;

  // Thread local (reusable) array for dict id to group key mapping.
  private static final ThreadLocal<int[]> THREAD_LOCAL_DICT_ID_TO_GROUP_KEY = new ThreadLocal<int[]>() {
    @Override
    protected int[] initialValue() {
      return new int[DocIdSetPlanNode.MAX_DOC_PER_CALL];
    }
  };

  // Thread local (reusable) array for dict id to MV group key mapping.
  private static final ThreadLocal<int[][]> THREAD_LOCAL_DICT_ID_TO_MV_GROUP_KEY = new ThreadLocal<int[][]>() {
    @Override
    protected int[][] initialValue() {
      return new int[DocIdSetPlanNode.MAX_DOC_PER_CALL][];
    }
  };

  // When results are trimmed, the bottom 10% of groups (per sort order) are dropped.
  private static final double GROUP_BY_TRIM_FACTOR = 0.9;

  private final int _numAggrFunc;
  private final int _numGroupsLimit;
  private final AggregationFunctionContext[] _aggrFunctionContexts;
  private final AggregationFunction[] _aggregationFunctions;

  private GroupKeyGenerator _groupKeyGenerator;
  private GroupByResultHolder[] _resultHolderArray;
  private final String[] _groupByColumns;

  // Per-block docId -> group key mapping; exactly one of these is populated,
  // depending on whether any group-by column is multi-valued.
  private int[] _docIdToSVGroupKey;
  private int[][] _docIdToMVGroupKey;

  private boolean _hasMVGroupByColumns = false;
  private boolean _inited = false; // boolean to ensure init() has been called.
  private boolean _finished = false; // boolean to ensure that finish() has been called.
  private boolean _groupByInited = false; // boolean for lazy creation of group-key generator etc.
  private boolean _hasColumnsWithoutDictionary = false;

  /**
   * Constructor for the class.
   *
   * @param aggrFunctionContexts Array of aggregation function contexts, must be non-empty
   * @param groupBy Group by from broker request
   * @param numGroupsLimit Limit on number of aggregation groups returned in the result
   */
  public DefaultGroupByExecutor(@Nonnull AggregationFunctionContext[] aggrFunctionContexts, GroupBy groupBy,
      int numGroupsLimit) {
    // BUG FIX: the original called Preconditions.checkNotNull(aggrFunctionContexts.length > 0),
    // which null-checks an autoboxed Boolean and therefore can never fail. Use checkArgument
    // so an empty aggregation-function array is actually rejected.
    Preconditions.checkNotNull(aggrFunctionContexts);
    Preconditions.checkArgument(aggrFunctionContexts.length > 0, "Expected at least one aggregation function");
    Preconditions.checkNotNull(groupBy);

    List<String> groupByColumns = groupBy.getColumns();
    List<String> groupByExpressions = groupBy.getExpressions();

    // Expressions contain simple group by columns (ie without any transform) as well,
    // so prefer expressions when present.
    if (groupByExpressions != null && !groupByExpressions.isEmpty()) {
      _groupByColumns = groupByExpressions.toArray(new String[groupByExpressions.size()]);
    } else {
      _groupByColumns = groupByColumns.toArray(new String[groupByColumns.size()]);
    }

    _numAggrFunc = aggrFunctionContexts.length;

    // TODO: revisit the trim factor. Usually the factor should be 5-10, and based on the 'TOP' limit.
    // When results are trimmed, drop bottom 10% of groups.
    _numGroupsLimit = (int) (GROUP_BY_TRIM_FACTOR * numGroupsLimit);

    _aggrFunctionContexts = aggrFunctionContexts;
    _aggregationFunctions = new AggregationFunction[_numAggrFunc];
    for (int i = 0; i < _numAggrFunc; i++) {
      _aggregationFunctions[i] = aggrFunctionContexts[i].getAggregationFunction();
    }
  }

  /**
   * {@inheritDoc}
   * No-op for this implementation of GroupKeyGenerator. Most initialization happens lazily
   * in process(), as a transform is required to initialize group key generator, etc.
   */
  @Override
  public void init() {
    // Returned if already initialized.
    if (_inited) {
      return;
    }
    _inited = true;
  }

  /**
   * Process the provided set of docId's to perform the requested aggregation-group-by-operation.
   *
   * @param transformBlock Transform block to process
   */
  @Override
  public void process(TransformBlock transformBlock) {
    Preconditions.checkState(_inited,
        "Method 'process' cannot be called before 'init' for class " + getClass().getName());

    initGroupBy(transformBlock);
    generateGroupKeysForBlock(transformBlock);

    int capacityNeeded = _groupKeyGenerator.getCurrentGroupKeyUpperBound();
    for (int i = 0; i < _numAggrFunc; i++) {
      _resultHolderArray[i].ensureCapacity(capacityNeeded);
      aggregateColumn(transformBlock, _aggrFunctionContexts[i], _resultHolderArray[i]);

      // Result holder limits the max number of group keys (default 100k), if the number of groups
      // exceeds beyond that limit, groups with lower values (as per sort order) are trimmed.
      // Once result holder trims those groups, the group key generator needs to purge them.
      if (!_hasColumnsWithoutDictionary) {
        int[] trimmedKeys = _resultHolderArray[i].trimResults();
        _groupKeyGenerator.purgeKeys(trimmedKeys);
      }
    }
  }

  /**
   * Helper method to perform aggregation for a given column.
   *
   * @param transformBlock Transform block to aggregate
   * @param aggrFuncContext Aggregation function context
   * @param resultHolder Holder for results of aggregation
   */
  @SuppressWarnings("ConstantConditions")
  private void aggregateColumn(TransformBlock transformBlock, AggregationFunctionContext aggrFuncContext,
      GroupByResultHolder resultHolder) {
    AggregationFunction aggregationFunction = aggrFuncContext.getAggregationFunction();
    String[] aggregationColumns = aggrFuncContext.getAggregationColumns();
    Preconditions.checkState(aggregationColumns.length == 1);
    int length = transformBlock.getNumDocs();

    // COUNT does not need the column values, only the group keys; all other functions
    // aggregate over the block value set of the single aggregation column.
    if (!aggregationFunction.getName().equals(AggregationFunctionFactory.AggregationFunctionType.COUNT.getName())) {
      BlockValSet blockValueSet = transformBlock.getBlockValueSet(aggregationColumns[0]);
      if (_hasMVGroupByColumns) {
        aggregationFunction.aggregateGroupByMV(length, _docIdToMVGroupKey, resultHolder, blockValueSet);
      } else {
        aggregationFunction.aggregateGroupBySV(length, _docIdToSVGroupKey, resultHolder, blockValueSet);
      }
    } else {
      if (_hasMVGroupByColumns) {
        aggregationFunction.aggregateGroupByMV(length, _docIdToMVGroupKey, resultHolder);
      } else {
        aggregationFunction.aggregateGroupBySV(length, _docIdToSVGroupKey, resultHolder);
      }
    }
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void finish() {
    Preconditions.checkState(_inited,
        "Method 'finish' cannot be called before 'init' for class " + getClass().getName());
    _finished = true;
  }

  /**
   * Return the final result of the aggregation-group-by operation.
   * This method should be called after all docIdSets have been 'processed'.
   *
   * @return Results of aggregation group by, or {@code null} if no transform block was processed.
   */
  @Override
  public AggregationGroupByResult getResult() {
    Preconditions.checkState(_finished,
        "Method 'getResult' cannot be called before 'finish' for class " + getClass().getName());

    // If group by was not initialized (in case of no transform blocks), return null.
    if (!_groupByInited) {
      return null;
    }

    return new AggregationGroupByResult(_groupKeyGenerator, _aggregationFunctions, _resultHolderArray);
  }

  /**
   * Generate group keys for the given docIdSet. For single valued columns, each docId has one group key,
   * but for multi-valued columns, each docId could have more than one group key.
   *
   * For SV keys: _docIdToSVGroupKey mapping is updated.
   * For MV keys: _docIdToMVGroupKey mapping is updated.
   *
   * @param transformBlock Transform block for which to generate group keys
   */
  private void generateGroupKeysForBlock(TransformBlock transformBlock) {
    if (_hasMVGroupByColumns) {
      _groupKeyGenerator.generateKeysForBlock(transformBlock, _docIdToMVGroupKey);
    } else {
      _groupKeyGenerator.generateKeysForBlock(transformBlock, _docIdToSVGroupKey);
    }
  }

  /**
   * Helper method to initialize result holder array.
   *
   * @param trimSize Trim size for group by keys
   * @param maxNumResults Maximum number of groups possible
   */
  private void initResultHolderArray(int trimSize, int maxNumResults) {
    _resultHolderArray = new GroupByResultHolder[_numAggrFunc];
    // Start small and grow on demand (see ensureCapacity in process()) to avoid
    // over-allocating when the actual number of groups is far below the upper bound.
    int initialCapacity = Math.min(maxNumResults, MAX_INITIAL_RESULT_HOLDER_CAPACITY);
    for (int i = 0; i < _numAggrFunc; i++) {
      _resultHolderArray[i] = _aggrFunctionContexts[i].getAggregationFunction()
          .createGroupByResultHolder(initialCapacity, maxNumResults, trimSize);
    }
  }

  /**
   * Allocate storage for docId to group keys mapping.
   */
  private void initDocIdToGroupKeyMap() {
    if (_hasMVGroupByColumns) {
      // TODO: Revisit block fetching of multi-valued columns
      _docIdToMVGroupKey = THREAD_LOCAL_DICT_ID_TO_MV_GROUP_KEY.get();
    } else {
      _docIdToSVGroupKey = THREAD_LOCAL_DICT_ID_TO_GROUP_KEY.get();
    }
  }

  /**
   * Initializes the following:
   * <p> - Group key generator. </p>
   * <p> - Result holders </p>
   * <p> - Re-usable storage (eg docId to group key mapping) </p>
   *
   * This is separate from init(), as this can only happen within process as transform block is
   * required to create group key generator.
   *
   * @param transformBlock Transform block to group by.
   */
  private void initGroupBy(TransformBlock transformBlock) {
    if (_groupByInited) {
      return;
    }

    FieldSpec.DataType dataType = null;
    for (String groupByColumn : _groupByColumns) {
      BlockMetadata metadata = transformBlock.getBlockMetadata(groupByColumn);
      if (!metadata.isSingleValue()) {
        _hasMVGroupByColumns = true;
      }
      if (!metadata.hasDictionary()) {
        _hasColumnsWithoutDictionary = true;
      }
      // Used only for single group-by case, so ok to overwrite.
      dataType = metadata.getDataType();
    }

    // Pick the key generator best suited to the columns: dictionary-encoded columns use the
    // default generator; raw (no-dictionary) columns need value-based key generation.
    if (_hasColumnsWithoutDictionary) {
      if (_groupByColumns.length == 1) {
        _groupKeyGenerator = new NoDictionarySingleColumnGroupKeyGenerator(_groupByColumns[0], dataType);
      } else {
        _groupKeyGenerator = new NoDictionaryMultiColumnGroupKeyGenerator(transformBlock, _groupByColumns);
      }
    } else {
      _groupKeyGenerator = new DefaultGroupKeyGenerator(transformBlock, _groupByColumns);
    }

    int maxNumResults = _groupKeyGenerator.getGlobalGroupKeyUpperBound();
    initResultHolderArray(_numGroupsLimit, maxNumResults);
    initDocIdToGroupKeyMap();
    _groupByInited = true;
  }
}