/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector;

import java.lang.management.ManagementFactory;
import java.lang.management.MemoryMXBean;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.KeyWrapper;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.vector.expressions.ConstantVectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc;
import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc.ProcessingMode;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.DataOutputBuffer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javolution.util.FastBitSet;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

/**
 * Vectorized GROUP BY operator implementation. Consumes the vectorized input and
 * stores the aggregate operators' intermediate states. Emits row-mode output
 * (or vectorized output when isVectorOutput is set).
 */
public class VectorGroupByOperator extends Operator<GroupByDesc>
    implements VectorizationContextRegion {

  private static final Logger LOG = LoggerFactory.getLogger(
      VectorGroupByOperator.class.getName());

  private VectorGroupByDesc vectorDesc;

  /**
   * This is the vector of aggregators. They are stateless and only implement
   * the algorithm of how to compute the aggregation.
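   * (A SUM aggregator, for example, carries no running total of its own.)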
   * The state is kept in the aggregation buffers, and it is our responsibility
   * to match the proper state for each key.
   */
  private VectorAggregateExpression[] aggregators;

  /**
   * Key vector expressions.
   */
  private VectorExpression[] keyExpressions;
  private int outputKeyLength;

  private boolean isVectorOutput;

  // Create a new outgoing vectorization context because the column name map will change.
  private VectorizationContext vOutContext = null;

  // The above members are initialized by the constructor and must not be
  // transient.
  //---------------------------------------------------------------------------

  private transient VectorExpressionWriter[] keyOutputWriters;

  /**
   * The aggregation buffers to use for the current batch.
   */
  private transient VectorAggregationBufferBatch aggregationBatchInfo;

  /**
   * The current batch key wrappers.
   * The very same instance gets reused for all batches.
   */
  private transient VectorHashKeyWrapperBatch keyWrappersBatch;

  private transient Object[] forwardCache;

  private transient VectorizedRowBatch outputBatch;
  private transient VectorizedRowBatchCtx vrbCtx;

  private transient VectorAssignRow vectorAssignRow;

  /*
   * Grouping sets members.
   */
  private transient boolean groupingSetsPresent;

  // The field bits (i.e. which fields to include) or "id" for each grouping set.
  private transient int[] groupingSets;

  // The position in the column keys of the dummy grouping set id column.
  private transient int groupingSetsPosition;

  // The planner puts a constant field in for the dummy grouping set id. We will overwrite it
  // as we process the grouping sets.
  private transient ConstantVectorExpression groupingSetsDummyVectorExpression;

  // We translate the grouping set bit field into boolean arrays.
  private transient boolean[][] allGroupingSetsOverrideIsNulls;

  private transient int numEntriesHashTable;

  private transient long maxHashTblMemory;

  private transient long maxMemory;

  private float memoryThreshold;

  /**
   * Interface for processing mode: global, hash, unsorted streaming, or group batch.
   */
  private static interface IProcessingMode {
    public void initialize(Configuration hconf) throws HiveException;
    public void startGroup() throws HiveException;
    public void endGroup() throws HiveException;
    public void processBatch(VectorizedRowBatch batch) throws HiveException;
    public void close(boolean aborted) throws HiveException;
  }

  /**
   * Base class for all processing modes.
   */
  private abstract class ProcessingModeBase implements IProcessingMode {

    // Overridden and used in the sorted reduce group batch processing mode.
    @Override
    public void startGroup() throws HiveException {
      // Do nothing.
    }

    @Override
    public void endGroup() throws HiveException {
      // Do nothing.
    }

    protected abstract void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet,
        boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException;

    @Override
    public void processBatch(VectorizedRowBatch batch) throws HiveException {

      if (!groupingSetsPresent) {
        doProcessBatch(batch, false, null);
        return;
      }

      // We drive the doProcessBatch logic with the same batch but a different
      // grouping set id and null variation each time.
      // PERFORMANCE NOTE: We do not try to reuse columns and generate the KeyWrappers anew...

      final int size = groupingSets.length;
      for (int i = 0; i < size; i++) {

        // NOTE: We are overwriting the constant vector value...
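        // Each iteration writes one grouping set id into the dummy constant key column and
        // re-processes the same batch, with that set's null overrides masking the unused keys.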
        groupingSetsDummyVectorExpression.setLongValue(groupingSets[i]);
        groupingSetsDummyVectorExpression.evaluate(batch);

        doProcessBatch(batch, (i == 0), allGroupingSetsOverrideIsNulls[i]);
      }
    }

    /**
     * Evaluates the aggregators on the current batch.
     * The aggregationBatchInfo must have been prepared
     * by calling {@link #prepareBatchAggregationBufferSets} first.
     */
    protected void processAggregators(VectorizedRowBatch batch) throws HiveException {
      // We now have a vector of aggregation buffer sets to use for each row,
      // so we can start computing the aggregates.
      // If the number of distinct keys in the batch is 1 we can
      // use the optimized code path of aggregateInput.
      VectorAggregationBufferRow[] aggregationBufferSets =
          aggregationBatchInfo.getAggregationBuffers();
      if (aggregationBatchInfo.getDistinctBufferSetCount() == 1) {
        VectorAggregateExpression.AggregationBuffer[] aggregationBuffers =
            aggregationBufferSets[0].getAggregationBuffers();
        for (int i = 0; i < aggregators.length; ++i) {
          aggregators[i].aggregateInput(aggregationBuffers[i], batch);
        }
      } else {
        for (int i = 0; i < aggregators.length; ++i) {
          aggregators[i].aggregateInputSelection(
              aggregationBufferSets,
              i,
              batch);
        }
      }
    }

    /**
     * Allocates a new aggregation buffer set.
     */
    protected VectorAggregationBufferRow allocateAggregationBuffer() throws HiveException {
      VectorAggregateExpression.AggregationBuffer[] aggregationBuffers =
          new VectorAggregateExpression.AggregationBuffer[aggregators.length];
      for (int i = 0; i < aggregators.length; ++i) {
        aggregationBuffers[i] = aggregators[i].getNewAggregationBuffer();
        aggregators[i].reset(aggregationBuffers[i]);
      }
      VectorAggregationBufferRow bufferSet =
          new VectorAggregationBufferRow(aggregationBuffers);
      return bufferSet;
    }
  }

  /**
   * Global aggregates (no GROUP BY clause, no keys).
   * This mode is very simple: there are no keys to consider, and it only flushes one row,
   * at closing. That one row must be flushed even if no input was seen (NULLs).
   */
  private class ProcessingModeGlobalAggregate extends ProcessingModeBase {

    /**
     * In global processing mode there is only one set of aggregation buffers.
     */
    private VectorAggregationBufferRow aggregationBuffers;

    @Override
    public void initialize(Configuration hconf) throws HiveException {
      aggregationBuffers = allocateAggregationBuffer();
      LOG.info("using global aggregation processing mode");
    }

    @Override
    public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet,
        boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException {
      for (int i = 0; i < aggregators.length; ++i) {
        aggregators[i].aggregateInput(aggregationBuffers.getAggregationBuffer(i), batch);
      }
    }

    @Override
    public void close(boolean aborted) throws HiveException {
      if (!aborted) {
        writeSingleRow(null, aggregationBuffers);
      }
    }
  }

  /**
   * Hash aggregate processing mode.
   */
  private class ProcessingModeHashAggregate extends ProcessingModeBase {

    /**
     * The global key-aggregation hash map.
     */
    private Map<KeyWrapper, VectorAggregationBufferRow> mapKeysAggregationBuffers;

    /**
     * Total per-hashtable-entry fixed memory (does not depend on key/agg values).
     */
    private long fixedHashEntrySize;

    /**
     * Average per-hashtable-entry variable size memory (depends on key/agg values).
     */
    private int avgVariableSize;

    /**
     * Number of entries added to the hashtable since the last check if it should flush.
     */
    private int numEntriesSinceCheck;

    /**
     * Sum of batch sizes processed (i.e. rows).
     */
    private long sumBatchSize;

    /**
     * Max number of entries in the vector group by aggregation hashtables.
     * Exceeding this will trigger a flush regardless of the memory pressure condition.
     */
    private int maxHtEntries = 1000000;

    /**
     * The number of new entries that must be added to the hashtable before a memory size check.
     */
    private int checkInterval = 10000;

    /**
     * Percent of entries to flush when the memory threshold is exceeded.
     */
    private float percentEntriesToFlush = 0.1f;

    /**
     * A soft reference used to detect memory pressure.
     */
    private SoftReference<Object> gcCanary = new SoftReference<Object>(new Object());

    /**
     * Counts the number of times the gcCanary died and was resurrected.
     */
    private long gcCanaryFlushes = 0L;

    /**
     * Count of rows since the last check for changing from aggregate to streaming mode.
     */
    private long lastModeCheckRowCount = 0;

    /**
     * Minimum factor by which the hash table must reduce the number of entries.
     * If this is not met, the processing switches to streaming mode.
     */
    private float minReductionHashAggr;

    /**
     * Number of rows processed between checks for the minReductionHashAggr factor.
     * TODO: there is overlap between numRowsCompareHashAggr and checkInterval
     */
    private long numRowsCompareHashAggr;

    @Override
    public void initialize(Configuration hconf) throws HiveException {
      // hconf is null in unit testing.
      if (null != hconf) {
        this.percentEntriesToFlush = HiveConf.getFloatVar(hconf,
            HiveConf.ConfVars.HIVE_VECTORIZATION_GROUPBY_FLUSH_PERCENT);
        this.checkInterval = HiveConf.getIntVar(hconf,
            HiveConf.ConfVars.HIVE_VECTORIZATION_GROUPBY_CHECKINTERVAL);
        this.maxHtEntries = HiveConf.getIntVar(hconf,
            HiveConf.ConfVars.HIVE_VECTORIZATION_GROUPBY_MAXENTRIES);
        this.minReductionHashAggr = HiveConf.getFloatVar(hconf,
            HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
        this.numRowsCompareHashAggr = HiveConf.getIntVar(hconf,
            HiveConf.ConfVars.HIVEGROUPBYMAPINTERVAL);
      } else {
        this.percentEntriesToFlush =
            HiveConf.ConfVars.HIVE_VECTORIZATION_GROUPBY_FLUSH_PERCENT.defaultFloatVal;
        this.checkInterval =
            HiveConf.ConfVars.HIVE_VECTORIZATION_GROUPBY_CHECKINTERVAL.defaultIntVal;
        this.maxHtEntries =
            HiveConf.ConfVars.HIVE_VECTORIZATION_GROUPBY_MAXENTRIES.defaultIntVal;
        this.minReductionHashAggr =
            HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION.defaultFloatVal;
        this.numRowsCompareHashAggr =
            HiveConf.ConfVars.HIVEGROUPBYMAPINTERVAL.defaultIntVal;
      }

      sumBatchSize = 0;

      mapKeysAggregationBuffers = new HashMap<KeyWrapper, VectorAggregationBufferRow>();
      computeMemoryLimits();
      LOG.debug("using hash aggregation processing mode");
    }

    @Override
    public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet,
        boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException {

      if (!groupingSetsPresent || isFirstGroupingSet) {

        // Evaluate the key expressions once.
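        // Only the first pass needs this: later grouping set passes reuse the same key
        // columns, and processBatch rewrites just the dummy grouping set id column.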
        for (int i = 0; i < keyExpressions.length; ++i) {
          keyExpressions[i].evaluate(batch);
        }
      }

      // First we traverse the batch to evaluate and prepare the KeyWrappers.
      // After this the KeyWrappers are properly set and the hash code is computed.
      if (!groupingSetsPresent) {
        keyWrappersBatch.evaluateBatch(batch);
      } else {
        keyWrappersBatch.evaluateBatchGroupingSets(batch, currentGroupingSetsOverrideIsNulls);
      }

      // Next we locate the aggregation buffer set for each key.
      prepareBatchAggregationBufferSets(batch);

      // Finally, evaluate the aggregators.
      processAggregators(batch);

      // Flush if memory limits were reached.
      // We keep flushing until the memory is under the threshold.
      int preFlushEntriesCount = numEntriesHashTable;
      while (shouldFlush(batch)) {
        flush(false);

        if (gcCanary.get() == null) {
          gcCanaryFlushes++;
          gcCanary = new SoftReference<Object>(new Object());
        }

        // Validate that some progress is being made.
        if (!(numEntriesHashTable < preFlushEntriesCount)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Flush did not progress: %d entries before, %d entries after",
                preFlushEntriesCount,
                numEntriesHashTable));
          }
          break;
        }
        preFlushEntriesCount = numEntriesHashTable;
      }

      if (sumBatchSize == 0 && 0 != batch.size) {
        // Sample the first batch processed for variable sizes.
        updateAvgVariableSize(batch);
      }

      sumBatchSize += batch.size;
      lastModeCheckRowCount += batch.size;

      // Check if we should switch to streaming mode.
      checkHashModeEfficiency();
    }

    @Override
    public void close(boolean aborted) throws HiveException {
      if (!aborted) {
        flush(true);
      }
    }

    /**
     * Locates the aggregation buffer sets to use for each key in the current batch.
     * The keyWrappersBatch must have evaluated the current batch first.
     */
    private void prepareBatchAggregationBufferSets(VectorizedRowBatch batch) throws HiveException {
      // The aggregation batch vector needs to know when we start a new batch
      // to bump its internal version.
      aggregationBatchInfo.startBatch();

      if (batch.size == 0) {
        return;
      }

      // We now have to probe the global hash and find-or-allocate
      // the aggregation buffers to use for each key present in the batch.
      VectorHashKeyWrapper[] keyWrappers = keyWrappersBatch.getVectorHashKeyWrappers();

      final int n = keyExpressions.length == 0 ? 1 : batch.size;
      // Note: the row mapping is not relevant when
      // aggregationBatchInfo::getDistinctBufferSetCount() == 1.

      for (int i = 0; i < n; ++i) {
        VectorHashKeyWrapper kw = keyWrappers[i];
        VectorAggregationBufferRow aggregationBuffer = mapKeysAggregationBuffers.get(kw);
        if (null == aggregationBuffer) {
          // The probe failed, so we must allocate a set of aggregation buffers
          // and push the (keywrapper, buffers) pair into the hash.
          // It is very important to clone the keywrapper: the one we have from our
          // keyWrappersBatch is going to be reset/reused on the next batch.
          aggregationBuffer = allocateAggregationBuffer();
          mapKeysAggregationBuffers.put(kw.copyKey(), aggregationBuffer);
          numEntriesHashTable++;
          numEntriesSinceCheck++;
        }
        aggregationBatchInfo.mapAggregationBufferSet(aggregationBuffer, i);
      }
    }

    /**
     * Computes the memory limits for hash table flush (spill).
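     * The limit is maxHashTblMemory = maxMemory * memoryThreshold; shouldFlush() compares it
     * against the estimate numEntriesHashTable * (fixedHashEntrySize + avgVariableSize).
     * (For example, a threshold of 0.5 on a 4 GB heap yields a 2 GB hash table budget.)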
     */
    private void computeMemoryLimits() {
      JavaDataModel model = JavaDataModel.get();

      fixedHashEntrySize =
          model.hashMapEntry() +
          keyWrappersBatch.getKeysFixedSize() +
          aggregationBatchInfo.getAggregatorsFixedSize();

      MemoryMXBean memoryMXBean = ManagementFactory.getMemoryMXBean();
      maxMemory = memoryMXBean.getHeapMemoryUsage().getMax();
      memoryThreshold = conf.getMemoryThreshold();
      // Tests may leave this uninitialized, so better set it to 1.
      if (memoryThreshold == 0.0f) {
        memoryThreshold = 1.0f;
      }

      // Note: cast to long, not int; an int cast overflows for heaps larger than 2GB.
      maxHashTblMemory = (long)(maxMemory * memoryThreshold);

      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("maxMemory:%dMb (%d * %f) fixSize:%d (key:%d agg:%d)",
            maxHashTblMemory/1024/1024,
            maxMemory/1024/1024,
            memoryThreshold,
            fixedHashEntrySize,
            keyWrappersBatch.getKeysFixedSize(),
            aggregationBatchInfo.getAggregatorsFixedSize()));
      }
    }

    /**
     * Flushes the entries in the hash table by emitting output (forward).
     * When the parameter 'all' is true, all the entries are flushed.
     * @param all
     * @throws HiveException
     */
    private void flush(boolean all) throws HiveException {

      int entriesToFlush = all ? numEntriesHashTable :
          (int)(numEntriesHashTable * this.percentEntriesToFlush);
      int entriesFlushed = 0;

      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format(
            "Flush %d %s entries:%d fixed:%d variable:%d (used:%dMb max:%dMb) gcCanary:%s",
            entriesToFlush, all ? "(all)" : "",
            numEntriesHashTable, fixedHashEntrySize, avgVariableSize,
            numEntriesHashTable * (fixedHashEntrySize + avgVariableSize)/1024/1024,
            maxHashTblMemory/1024/1024,
            gcCanary.get() == null ? "dead" : "alive"));
      }

      // Iterate the global (keywrapper, aggregationbuffers) map and emit a row for each key.
      Iterator<Map.Entry<KeyWrapper, VectorAggregationBufferRow>> iter =
          mapKeysAggregationBuffers.entrySet().iterator();
      while (iter.hasNext()) {
        Map.Entry<KeyWrapper, VectorAggregationBufferRow> pair = iter.next();

        writeSingleRow((VectorHashKeyWrapper) pair.getKey(), pair.getValue());

        if (!all) {
          iter.remove();
          --numEntriesHashTable;
          if (++entriesFlushed >= entriesToFlush) {
            break;
          }
        }
      }

      if (all) {
        mapKeysAggregationBuffers.clear();
        numEntriesHashTable = 0;
      }

      if (all && LOG.isDebugEnabled()) {
        LOG.debug(String.format("GC canary caused %d flushes", gcCanaryFlushes));
      }
    }

    /**
     * Returns true if the memory threshold for the hash table was reached.
     */
    private boolean shouldFlush(VectorizedRowBatch batch) {
      if (batch.size == 0) {
        return false;
      }
      // numEntriesSinceCheck is the number of entries added to the hash table
      // since the last time we checked the average variable size.
      if (numEntriesSinceCheck >= this.checkInterval) {
        // We're going to update the average variable row size by sampling the current batch.
        updateAvgVariableSize(batch);
        numEntriesSinceCheck = 0;
      }
      if (numEntriesHashTable > this.maxHtEntries ||
          numEntriesHashTable * (fixedHashEntrySize + avgVariableSize) > maxHashTblMemory) {
        return true;
      }
      if (gcCanary.get() == null) {
        return true;
      }

      return false;
    }

    /**
     * Updates the average variable size of the hash table entries.
     * The average is only updated by probing the batch that added the entry to the hash
     * table that caused the check threshold to be reached.
     */
    private void updateAvgVariableSize(VectorizedRowBatch batch) {
      int keyVariableSize = keyWrappersBatch.getVariableSize(batch.size);
      int aggVariableSize = aggregationBatchInfo.getVariableSize(batch.size);

      // This assumes the distribution of variable size keys/aggregates in the input
      // is the same as the distribution of variable sizes in the hash entries.
      avgVariableSize = (int)((avgVariableSize * sumBatchSize + keyVariableSize + aggVariableSize) /
          (sumBatchSize + batch.size));
    }

    /**
     * Checks if the hash table reduces the number of entries by at least the
     * minReductionHashAggr factor.
     * @throws HiveException
     */
    private void checkHashModeEfficiency() throws HiveException {
      if (lastModeCheckRowCount > numRowsCompareHashAggr) {
        lastModeCheckRowCount = 0;
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("checkHashModeEfficiency: HT:%d RC:%d MIN:%d",
              numEntriesHashTable, sumBatchSize,
              (long)(sumBatchSize * minReductionHashAggr)));
        }
        if (numEntriesHashTable > sumBatchSize * minReductionHashAggr) {
          flush(true);

          changeToStreamingMode();
        }
      }
    }
  }

  /**
   * Streaming processing mode on ALREADY GROUPED data. Each input VectorizedRowBatch may
   * have a mix of different keys. Intermediate values are flushed each time the key changes.
   */
  private class ProcessingModeStreaming extends ProcessingModeBase {

    /**
     * The aggregation buffers used in streaming mode.
     */
    private VectorAggregationBufferRow currentStreamingAggregators;

    /**
     * The current key, used in streaming mode.
     */
    private VectorHashKeyWrapper streamingKey;

    /**
     * The keys that need to be flushed at the end of the current batch.
     */
    private final VectorHashKeyWrapper[] keysToFlush =
        new VectorHashKeyWrapper[VectorizedRowBatch.DEFAULT_SIZE];

    /**
     * The aggregates that need to be flushed at the end of the current batch.
     */
    private final VectorAggregationBufferRow[] rowsToFlush =
        new VectorAggregationBufferRow[VectorizedRowBatch.DEFAULT_SIZE];

    /**
     * A pool of VectorAggregationBufferRow to avoid repeated allocations.
     */
    private VectorUtilBatchObjectPool<VectorAggregationBufferRow>
        streamAggregationBufferRowPool;

    @Override
    public void initialize(Configuration hconf) throws HiveException {
      streamAggregationBufferRowPool = new VectorUtilBatchObjectPool<VectorAggregationBufferRow>(
          VectorizedRowBatch.DEFAULT_SIZE,
          new VectorUtilBatchObjectPool.IAllocator<VectorAggregationBufferRow>() {

            @Override
            public VectorAggregationBufferRow alloc() throws HiveException {
              return allocateAggregationBuffer();
            }

            @Override
            public void free(VectorAggregationBufferRow t) {
              // Nothing to do
            }
          });
      LOG.info("using unsorted streaming aggregation processing mode");
    }

    @Override
    public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet,
        boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException {

      if (!groupingSetsPresent || isFirstGroupingSet) {

        // Evaluate the key expressions once.
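        // As in hash mode, only the first grouping set pass evaluates the keys; later
        // passes change just the dummy grouping set id column.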
        for (int i = 0; i < keyExpressions.length; ++i) {
          keyExpressions[i].evaluate(batch);
        }
      }

      // First we traverse the batch to evaluate and prepare the KeyWrappers.
      // After this the KeyWrappers are properly set and the hash code is computed.
      if (!groupingSetsPresent) {
        keyWrappersBatch.evaluateBatch(batch);
      } else {
        keyWrappersBatch.evaluateBatchGroupingSets(batch, currentGroupingSetsOverrideIsNulls);
      }

      VectorHashKeyWrapper[] batchKeys = keyWrappersBatch.getVectorHashKeyWrappers();

      if (streamingKey == null) {
        // This is the first batch we process after switching from hash mode.
        currentStreamingAggregators = streamAggregationBufferRowPool.getFromPool();
        streamingKey = (VectorHashKeyWrapper) batchKeys[0].copyKey();
      }

      aggregationBatchInfo.startBatch();
      int flushMark = 0;

      for (int i = 0; i < batch.size; ++i) {
        if (!batchKeys[i].equals(streamingKey)) {
          // We've encountered a new key, so we must save the current one.
          // We can't forward yet; the aggregators have not been evaluated.
          rowsToFlush[flushMark] = currentStreamingAggregators;
          if (keysToFlush[flushMark] == null) {
            keysToFlush[flushMark] = (VectorHashKeyWrapper) streamingKey.copyKey();
          } else {
            streamingKey.duplicateTo(keysToFlush[flushMark]);
          }

          currentStreamingAggregators = streamAggregationBufferRowPool.getFromPool();
          batchKeys[i].duplicateTo(streamingKey);
          ++flushMark;
        }
        aggregationBatchInfo.mapAggregationBufferSet(currentStreamingAggregators, i);
      }

      // Evaluate the aggregators.
      processAggregators(batch);

      // Now flush/forward all keys/rows, except the last (current) one.
      for (int i = 0; i < flushMark; ++i) {
        writeSingleRow(keysToFlush[i], rowsToFlush[i]);
        rowsToFlush[i].reset();
        streamAggregationBufferRowPool.putInPool(rowsToFlush[i]);
      }
    }

    @Override
    public void close(boolean aborted) throws HiveException {
      if (!aborted && null != streamingKey) {
        writeSingleRow(streamingKey, currentStreamingAggregators);
      }
    }
  }

  /**
   * Sorted reduce group batch processing mode. Each input VectorizedRowBatch will have the
   * same key. On endGroup (or close), the intermediate values are flushed.
   *
   * We build the output rows one-at-a-time in the output vectorized row batch (outputBatch)
   * in 2 steps:
   *
   *   1) Just after startGroup, we copy the group key to the next position in the output batch,
   *      but don't increment the size in the batch (yet). This is done with the copyGroupKey
   *      method of VectorGroupKeyHelper. The next position is outputBatch.size
   *
   *      We know the same key is used for the whole batch (i.e. repeating) since that is how
   *      vectorized reduce-shuffle feeds the batches to us.
   *
   *   2) Later at endGroup after reduce-shuffle has fed us all the input batches for the group,
   *      we fill in the aggregation columns in outputBatch at outputBatch.size. Our method
   *      writeGroupRow does this and finally increments outputBatch.size.
   *
   */
  private class ProcessingModeReduceMergePartial extends ProcessingModeBase {

    private boolean inGroup;
    private boolean first;

    /**
     * The group vector key helper.
     */
    VectorGroupKeyHelper groupKeyHelper;

    /**
     * The group vector aggregation buffers.
     */
    private VectorAggregationBufferRow groupAggregators;

    /**
     * Buffer to hold string values.
     */
    private DataOutputBuffer buffer;

    @Override
    public void initialize(Configuration hconf) throws HiveException {
      inGroup = false;

      // We do not include the dummy grouping set column in the output.
      // So we pass outputKeyLength instead of keyExpressions.length.
      groupKeyHelper = new VectorGroupKeyHelper(outputKeyLength);
      groupKeyHelper.init(keyExpressions);
      groupAggregators = allocateAggregationBuffer();
      buffer = new DataOutputBuffer();
      LOG.info("using sorted group batch aggregation processing mode");
    }

    @Override
    public void startGroup() throws HiveException {
      inGroup = true;
      first = true;
    }

    @Override
    public void endGroup() throws HiveException {
      if (inGroup && !first) {
        writeGroupRow(groupAggregators, buffer);
        groupAggregators.reset();
      }
      inGroup = false;
    }

    @Override
    public void doProcessBatch(VectorizedRowBatch batch, boolean isFirstGroupingSet,
        boolean[] currentGroupingSetsOverrideIsNulls) throws HiveException {
      assert(inGroup);
      if (first) {
        // Copy the group key to the output batch now. We'll copy in the aggregates
        // at the end of the group.
        first = false;

        // Evaluate the key expressions of just this first batch to get the correct key.
        for (int i = 0; i < outputKeyLength; i++) {
          keyExpressions[i].evaluate(batch);
        }

        groupKeyHelper.copyGroupKey(batch, outputBatch, buffer);
      }

      // Aggregate this batch.
      for (int i = 0; i < aggregators.length; ++i) {
        aggregators[i].aggregateInput(groupAggregators.getAggregationBuffer(i), batch);
      }
    }

    @Override
    public void close(boolean aborted) throws HiveException {
      if (!aborted && inGroup && !first) {
        writeGroupRow(groupAggregators, buffer);
      }
    }
  }

  /**
   * Current processing mode. Processing mode can change (e.g. hash -> streaming).
   */
  private transient IProcessingMode processingMode;

  private static final long serialVersionUID = 1L;

  public VectorGroupByOperator(CompilationOpContext ctx, VectorizationContext vContext,
      OperatorDesc conf) throws HiveException {
    this(ctx);
    GroupByDesc desc = (GroupByDesc) conf;
    this.conf = desc;
    vectorDesc = (VectorGroupByDesc) desc.getVectorDesc();
    keyExpressions = vectorDesc.getKeyExpressions();
    aggregators = vectorDesc.getAggregators();
    isVectorOutput = vectorDesc.isVectorOutput();
    vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames(),
        /* vContextEnvironment */ vContext);
  }

  /** Kryo ctor. */
  @VisibleForTesting
  public VectorGroupByOperator() {
    super();
  }

  public VectorGroupByOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  private void setupGroupingSets() {

    groupingSetsPresent = conf.isGroupingSetsPresent();
    if (!groupingSetsPresent) {
      groupingSets = null;
      groupingSetsPosition = -1;
      groupingSetsDummyVectorExpression = null;
      allGroupingSetsOverrideIsNulls = null;
      return;
    }

    groupingSets = ArrayUtils.toPrimitive(conf.getListGroupingSets().toArray(new Integer[0]));
    groupingSetsPosition = conf.getGroupingSetPosition();

    allGroupingSetsOverrideIsNulls = new boolean[groupingSets.length][];

    int pos = 0;
    for (int groupingSet: groupingSets) {

      // Create the mapping corresponding to the grouping set.

      // Assume all columns are null, except the dummy column, which is always non-null.
      boolean[] groupingSetsOverrideIsNull = new boolean[keyExpressions.length];
      Arrays.fill(groupingSetsOverrideIsNull, true);
      groupingSetsOverrideIsNull[groupingSetsPosition] = false;

      // Add the keys of this grouping set.
      FastBitSet bitset = GroupByOperator.groupingSet2BitSet(groupingSet, groupingSetsPosition);
      for (int keyPos = bitset.nextClearBit(0); keyPos < groupingSetsPosition;
          keyPos = bitset.nextClearBit(keyPos + 1)) {
        groupingSetsOverrideIsNull[keyPos] = false;
      }

      allGroupingSetsOverrideIsNulls[pos] = groupingSetsOverrideIsNull;
      pos++;
    }

    // The last key column is the dummy grouping set id.
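    // (It was vectorized as a ConstantVectorExpression, which is what lets processBatch
    // overwrite its value cheaply on each grouping set pass.)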
    //
    // Figure out which (scratch) column was used so we can overwrite the dummy id.

    groupingSetsDummyVectorExpression =
        (ConstantVectorExpression) keyExpressions[groupingSetsPosition];
  }

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);

    List<ObjectInspector> objectInspectors = new ArrayList<ObjectInspector>();

    List<ExprNodeDesc> keysDesc = conf.getKeys();
    try {

      List<String> outputFieldNames = conf.getOutputColumnNames();

      // The grouping id should be pruned; it is the last of the key columns.
      // See ColumnPrunerGroupByProc.
      outputKeyLength =
          conf.pruneGroupingSetId() ? keyExpressions.length - 1 : keyExpressions.length;

      keyOutputWriters = new VectorExpressionWriter[outputKeyLength];

      for (int i = 0; i < outputKeyLength; ++i) {
        keyOutputWriters[i] = VectorExpressionWriterFactory.
            genVectorExpressionWritable(keysDesc.get(i));
        objectInspectors.add(keyOutputWriters[i].getObjectInspector());
      }

      for (int i = 0; i < aggregators.length; ++i) {
        aggregators[i].init(conf.getAggregators().get(i));
        objectInspectors.add(aggregators[i].getOutputObjectInspector());
      }

      keyWrappersBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions);
      aggregationBatchInfo = new VectorAggregationBufferBatch();
      aggregationBatchInfo.compileAggregationBatchInfo(aggregators);

      LOG.info("VectorGroupByOperator is vector output {}", isVectorOutput);
      outputObjInspector = ObjectInspectorFactory.getStandardStructObjectInspector(
          outputFieldNames, objectInspectors);
      if (isVectorOutput) {
        vrbCtx = new VectorizedRowBatchCtx();
        vrbCtx.init((StructObjectInspector) outputObjInspector,
            vOutContext.getScratchColumnTypeNames());
        outputBatch = vrbCtx.createVectorizedRowBatch();
        vectorAssignRow = new VectorAssignRow();
        vectorAssignRow.init((StructObjectInspector) outputObjInspector,
            vOutContext.getProjectedColumns());
      }

    } catch (HiveException he) {
      throw he;
    } catch (Throwable e) {
      throw new HiveException(e);
    }

    forwardCache = new Object[outputKeyLength + aggregators.length];

    setupGroupingSets();

    switch (vectorDesc.getProcessingMode()) {
    case GLOBAL:
      Preconditions.checkState(outputKeyLength == 0);
      Preconditions.checkState(!groupingSetsPresent);
      processingMode = this.new ProcessingModeGlobalAggregate();
      break;
    case HASH:
      processingMode = this.new ProcessingModeHashAggregate();
      break;
    case MERGE_PARTIAL:
      Preconditions.checkState(!groupingSetsPresent);
      processingMode = this.new ProcessingModeReduceMergePartial();
      break;
    case STREAMING:
      processingMode = this.new ProcessingModeStreaming();
      break;
    default:
      throw new RuntimeException("Unsupported vector GROUP BY processing mode " +
          vectorDesc.getProcessingMode().name());
    }
    processingMode.initialize(hconf);
  }

  /**
   * Changes the processing mode to streaming.
   * This is done at the request of the hash agg mode, if the number of keys
   * exceeds the minReductionHashAggr factor.
   * @throws HiveException
   */
  private void changeToStreamingMode() throws HiveException {
    processingMode = this.new ProcessingModeStreaming();
    processingMode.initialize(null);
    LOG.trace("switched to streaming mode");
  }

  @Override
  public void startGroup() throws HiveException {
    processingMode.startGroup();

    // We do not call startGroup on operators below because we are batching rows in
    // an output batch and the semantics will not work.
    // super.startGroup();
  }

  @Override
  public void endGroup() throws HiveException {
    processingMode.endGroup();

    // We do not call endGroup on operators below because we are batching rows in
    // an output batch and the semantics will not work.
    // super.endGroup();
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    VectorizedRowBatch batch = (VectorizedRowBatch) row;
    if (batch.size > 0) {
      processingMode.processBatch(batch);
    }
  }

  /**
   * Emits a single row, made from the key and the row aggregation buffers' values;
   * kw is null if keyExpressions.length is 0.
   * @param kw
   * @param agg
   * @throws HiveException
   */
  private void writeSingleRow(VectorHashKeyWrapper kw, VectorAggregationBufferRow agg)
      throws HiveException {
    int fi = 0;
    if (!isVectorOutput) {
      // Output row.
      for (int i = 0; i < outputKeyLength; ++i) {
        forwardCache[fi++] = keyWrappersBatch.getWritableKeyValue(
            kw, i, keyOutputWriters[i]);
      }
      for (int i = 0; i < aggregators.length; ++i) {
        forwardCache[fi++] = aggregators[i].evaluateOutput(agg.getAggregationBuffer(i));
      }
      forward(forwardCache, outputObjInspector);
    } else {
      // Output keys and aggregates into the output batch.
      for (int i = 0; i < outputKeyLength; ++i) {
        vectorAssignRow.assignRowColumn(outputBatch, outputBatch.size, fi++,
            keyWrappersBatch.getWritableKeyValue(kw, i, keyOutputWriters[i]));
      }
      for (int i = 0; i < aggregators.length; ++i) {
        vectorAssignRow.assignRowColumn(outputBatch, outputBatch.size, fi++,
            aggregators[i].evaluateOutput(agg.getAggregationBuffer(i)));
      }
      ++outputBatch.size;
      if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
        flushOutput();
      }
    }
  }

  /**
   * Emits a (reduce) group row, made from the key (copied in at the beginning of the group)
   * and the row aggregation buffers' values.
   * @param agg
   * @param buffer
   * @throws HiveException
   */
  private void writeGroupRow(VectorAggregationBufferRow agg, DataOutputBuffer buffer)
      throws HiveException {
    int fi = outputKeyLength;   // Start after the group keys.
    for (int i = 0; i < aggregators.length; ++i) {
      vectorAssignRow.assignRowColumn(outputBatch, outputBatch.size, fi++,
          aggregators[i].evaluateOutput(agg.getAggregationBuffer(i)));
    }
    ++outputBatch.size;
    if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
      flushOutput();
      buffer.reset();
    }
  }

  private void flushOutput() throws HiveException {
    forward(outputBatch, null);
    outputBatch.reset();
  }

  @Override
  public void closeOp(boolean aborted) throws HiveException {
    processingMode.close(aborted);
    if (!aborted && isVectorOutput && outputBatch.size > 0) {
      flushOutput();
    }
  }

  public VectorExpression[] getKeyExpressions() {
    return keyExpressions;
  }

  public void setKeyExpressions(VectorExpression[] keyExpressions) {
    this.keyExpressions = keyExpressions;
  }

  public VectorAggregateExpression[] getAggregators() {
    return aggregators;
  }

  public void setAggregators(VectorAggregateExpression[] aggregators) {
    this.aggregators = aggregators;
  }

  @Override
  public VectorizationContext getOuputVectorizationContext() {
    return vOutContext;
  }

  @Override
  public OperatorType getType() {
    return OperatorType.GROUPBY;
  }

  @Override
  public String getName() {
    return getOperatorName();
  }

  public static String getOperatorName() {
    return "GBY";
  }

}