/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.vector.reducesink;

import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator.Counter;
import org.apache.hadoop.hive.ql.exec.TerminalOperator;
import org.apache.hadoop.hive.ql.exec.TopNHash;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorSerializeRow;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.keyseries.VectorKeySeriesSerialized;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hive.common.util.HashCodeUtil;

import com.google.common.base.Preconditions;

/**
 * This class is the common operator class for the native vectorized reduce sink operators.
 */
public abstract class VectorReduceSinkCommonOperator extends TerminalOperator<ReduceSinkDesc>
    implements Serializable, TopNHash.BinaryCollector, VectorizationContextRegion {

  private static final long serialVersionUID = 1L;
  private static final String CLASS_NAME = VectorReduceSinkCommonOperator.class.getName();
  private static final Log LOG = LogFactory.getLog(CLASS_NAME);

  protected VectorReduceSinkDesc vectorDesc;

  /**
   * Information about our native vectorized reduce sink, created by the Vectorizer class during
   * its decision process and used during execution.
   */
  protected VectorReduceSinkInfo vectorReduceSinkInfo;

  protected VectorizationContext vContext;

  /**
   * Reduce sink key vector expressions.
   */

  // This is a map of which vectorized row batch columns are the key columns, and their types.
  protected boolean isEmptyKey;
  protected int[] reduceSinkKeyColumnMap;
  protected TypeInfo[] reduceSinkKeyTypeInfos;

  // Optional vectorized key expressions that need to be run on each batch.
  protected VectorExpression[] reduceSinkKeyExpressions;

  // This is a map of which vectorized row batch columns are the value columns, and their types.
  protected boolean isEmptyValue;
  protected int[] reduceSinkValueColumnMap;
  protected TypeInfo[] reduceSinkValueTypeInfos;

  // Optional vectorized value expressions that need to be run on each batch.
  protected VectorExpression[] reduceSinkValueExpressions;

  // The above members are initialized by the constructor and must not be transient.

  //---------------------------------------------------------------------------

  // Whether a tag is to be added to the end of each key, and the tag value.
  protected transient boolean reduceSkipTag;
  protected transient byte reduceTagByte;

  // Binary sortable key serializer.
  protected transient BinarySortableSerializeWrite keyBinarySortableSerializeWrite;

  // Lazy binary value serializer.
  protected transient LazyBinarySerializeWrite valueLazyBinarySerializeWrite;

  // This helper object serializes LazyBinary format reducer values from columns of a row
  // in a vectorized row batch.
  protected transient VectorSerializeRow<LazyBinarySerializeWrite> valueVectorSerializeRow;

  // The output buffer used to serialize a value into.
  protected transient Output valueOutput;

  // The HiveKey and BytesWritable value needed to pass the key and value to the collector.
  protected transient HiveKey keyWritable;
  protected transient BytesWritable valueBytesWritable;

  // Picks top-N key/value pairs from the input.
  protected transient TopNHash reducerHash;

  // Where to write our key and value pairs.
  private transient OutputCollector out;

  private transient long numRows = 0;
  private transient long cntr = 1;
  private transient long logEveryNRows = 0;
  private final transient LongWritable recordCounter = new LongWritable();

  // For debug tracing: the name of the map or reduce task.
  protected transient String taskName;

  // Debug display.
  protected transient long batchCounter;

  //---------------------------------------------------------------------------
  /** Kryo ctor. */
  protected VectorReduceSinkCommonOperator() {
    super();
  }

  public VectorReduceSinkCommonOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  public VectorReduceSinkCommonOperator(CompilationOpContext ctx, VectorizationContext vContext,
      OperatorDesc conf) throws HiveException {
    this(ctx);

    LOG.info("VectorReduceSinkCommonOperator constructor");

    ReduceSinkDesc desc = (ReduceSinkDesc) conf;
    this.conf = desc;
    vectorDesc = (VectorReduceSinkDesc) desc.getVectorDesc();
    vectorReduceSinkInfo = vectorDesc.getVectorReduceSinkInfo();
    this.vContext = vContext;

    isEmptyKey = vectorDesc.getIsEmptyKey();
    if (!isEmptyKey) {

      // Since a key expression can be a calculation and the key will go into a scratch column,
      // we need the mapping and type information.
      reduceSinkKeyColumnMap = vectorReduceSinkInfo.getReduceSinkKeyColumnMap();
      reduceSinkKeyTypeInfos = vectorReduceSinkInfo.getReduceSinkKeyTypeInfos();
      reduceSinkKeyExpressions = vectorReduceSinkInfo.getReduceSinkKeyExpressions();
    }

    isEmptyValue = vectorDesc.getIsEmptyValue();
    if (!isEmptyValue) {
      reduceSinkValueColumnMap = vectorReduceSinkInfo.getReduceSinkValueColumnMap();
      reduceSinkValueTypeInfos = vectorReduceSinkInfo.getReduceSinkValueTypeInfos();
      reduceSinkValueExpressions = vectorReduceSinkInfo.getReduceSinkValueExpressions();
    }
  }

  // Get the per-column sort order ('+' for ascending, '-' for descending; ascending when the
  // property is absent).
  private boolean[] getColumnSortOrder(Properties properties, int columnCount) {
    String columnSortOrder = properties.getProperty(serdeConstants.SERIALIZATION_SORT_ORDER);
    boolean[] columnSortOrderIsDesc = new boolean[columnCount];
    if (columnSortOrder == null) {
      Arrays.fill(columnSortOrderIsDesc, false);
    } else {
      for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
        columnSortOrderIsDesc[i] = (columnSortOrder.charAt(i) == '-');
      }
    }
    return columnSortOrderIsDesc;
  }

  private byte[] getColumnNullMarker(Properties properties, int columnCount,
      boolean[] columnSortOrder) {
    String columnNullOrder = properties.getProperty(serdeConstants.SERIALIZATION_NULL_SORT_ORDER);
    byte[] columnNullMarker = new byte[columnCount];
    for (int i = 0; i < columnNullMarker.length; i++) {
      if (columnSortOrder[i]) {
        // Descending.
        if (columnNullOrder != null && columnNullOrder.charAt(i) == 'a') {
          // Null first.
          columnNullMarker[i] = BinarySortableSerDe.ONE;
        } else {
          // Null last (default for descending order).
          columnNullMarker[i] = BinarySortableSerDe.ZERO;
        }
      } else {
        // Ascending.
        if (columnNullOrder != null && columnNullOrder.charAt(i) == 'z') {
          // Null last.
          columnNullMarker[i] = BinarySortableSerDe.ONE;
        } else {
          // Null first (default for ascending order).
          columnNullMarker[i] = BinarySortableSerDe.ZERO;
        }
      }
    }
    return columnNullMarker;
  }

  private byte[] getColumnNotNullMarker(Properties properties, int columnCount,
      boolean[] columnSortOrder) {
    String columnNullOrder = properties.getProperty(serdeConstants.SERIALIZATION_NULL_SORT_ORDER);
    byte[] columnNotNullMarker = new byte[columnCount];
    for (int i = 0; i < columnNotNullMarker.length; i++) {
      if (columnSortOrder[i]) {
        // Descending.
        if (columnNullOrder != null && columnNullOrder.charAt(i) == 'a') {
          // Null first.
          columnNotNullMarker[i] = BinarySortableSerDe.ZERO;
        } else {
          // Null last (default for descending order).
          columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        }
      } else {
        // Ascending.
        if (columnNullOrder != null && columnNullOrder.charAt(i) == 'z') {
          // Null last.
          columnNotNullMarker[i] = BinarySortableSerDe.ZERO;
        } else {
          // Null first (default for ascending order).
          columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        }
      }
    }
    return columnNotNullMarker;
  }
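
  // Illustrative note (not from the original source): a worked example of the three helpers
  // above. For a two-column key serialized with SERIALIZATION_SORT_ORDER = "+-" and
  // SERIALIZATION_NULL_SORT_ORDER = "az", they would compute:
  //
  //   columnSortOrderIsDesc = { false, true }  // '+' ascending, '-' descending
  //   columnNullMarker      = { ZERO, ZERO }   // col 0: nulls first (ascending, 'a');
  //                                            // col 1: nulls last (descending default)
  //   columnNotNullMarker   = { ONE, ONE }
  //
  // BinarySortableSerializeWrite emits the null/not-null marker byte ahead of each column's
  // bytes, so the raw byte comparison done during the shuffle sort realizes the requested
  // column order and null placement.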
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);

    if (isLogDebugEnabled) {
      LOG.debug("useUniformHash " + vectorReduceSinkInfo.getUseUniformHash());

      LOG.debug("reduceSinkKeyColumnMap " +
          (vectorReduceSinkInfo.getReduceSinkKeyColumnMap() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkKeyColumnMap())));
      LOG.debug("reduceSinkKeyTypeInfos " +
          (vectorReduceSinkInfo.getReduceSinkKeyTypeInfos() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkKeyTypeInfos())));
      LOG.debug("reduceSinkKeyColumnVectorTypes " +
          (vectorReduceSinkInfo.getReduceSinkKeyColumnVectorTypes() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkKeyColumnVectorTypes())));
      LOG.debug("reduceSinkKeyExpressions " +
          (vectorReduceSinkInfo.getReduceSinkKeyExpressions() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkKeyExpressions())));

      LOG.debug("reduceSinkValueColumnMap " +
          (vectorReduceSinkInfo.getReduceSinkValueColumnMap() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkValueColumnMap())));
      LOG.debug("reduceSinkValueTypeInfos " +
          (vectorReduceSinkInfo.getReduceSinkValueTypeInfos() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkValueTypeInfos())));
      LOG.debug("reduceSinkValueColumnVectorTypes " +
          (vectorReduceSinkInfo.getReduceSinkValueColumnVectorTypes() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkValueColumnVectorTypes())));
      LOG.debug("reduceSinkValueExpressions " +
          (vectorReduceSinkInfo.getReduceSinkValueExpressions() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkValueExpressions())));

      LOG.debug("reduceSinkBucketColumnMap " +
          (vectorReduceSinkInfo.getReduceSinkBucketColumnMap() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkBucketColumnMap())));
      LOG.debug("reduceSinkBucketTypeInfos " +
          (vectorReduceSinkInfo.getReduceSinkBucketTypeInfos() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkBucketTypeInfos())));
      LOG.debug("reduceSinkBucketColumnVectorTypes " +
          (vectorReduceSinkInfo.getReduceSinkBucketColumnVectorTypes() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkBucketColumnVectorTypes())));
      LOG.debug("reduceSinkBucketExpressions " +
          (vectorReduceSinkInfo.getReduceSinkBucketExpressions() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkBucketExpressions())));

      LOG.debug("reduceSinkPartitionColumnMap " +
          (vectorReduceSinkInfo.getReduceSinkPartitionColumnMap() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkPartitionColumnMap())));
      LOG.debug("reduceSinkPartitionTypeInfos " +
          (vectorReduceSinkInfo.getReduceSinkPartitionTypeInfos() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkPartitionTypeInfos())));
      LOG.debug("reduceSinkPartitionColumnVectorTypes " +
          (vectorReduceSinkInfo.getReduceSinkPartitionColumnVectorTypes() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkPartitionColumnVectorTypes())));
      LOG.debug("reduceSinkPartitionExpressions " +
          (vectorReduceSinkInfo.getReduceSinkPartitionExpressions() == null ? "NULL" :
              Arrays.toString(vectorReduceSinkInfo.getReduceSinkPartitionExpressions())));
    }

    if (LOG.isDebugEnabled()) {
      // Determine the name of our map or reduce task for debug tracing.
      BaseWork work = Utilities.getMapWork(hconf);
      if (work == null) {
        work = Utilities.getReduceWork(hconf);
      }
      taskName = work.getName();
    }

    String context = hconf.get(Operator.CONTEXT_NAME_KEY, "");
    if (context != null && !context.isEmpty()) {
      context = "_" + context.replace(" ", "_");
    }
    statsMap.put(Counter.RECORDS_OUT_INTERMEDIATE + context, recordCounter);

    reduceSkipTag = conf.getSkipTag();
    reduceTagByte = (byte) conf.getTag();

    if (isLogInfoEnabled) {
      LOG.info("Using tag = " + (int) reduceTagByte);
    }

    if (!isEmptyKey) {
      TableDesc keyTableDesc = conf.getKeySerializeInfo();
      boolean[] columnSortOrder =
          getColumnSortOrder(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length);
      byte[] columnNullMarker =
          getColumnNullMarker(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length,
              columnSortOrder);
      byte[] columnNotNullMarker =
          getColumnNotNullMarker(keyTableDesc.getProperties(), reduceSinkKeyColumnMap.length,
              columnSortOrder);

      keyBinarySortableSerializeWrite =
          new BinarySortableSerializeWrite(columnSortOrder, columnNullMarker, columnNotNullMarker);
    }

    if (!isEmptyValue) {
      valueLazyBinarySerializeWrite = new LazyBinarySerializeWrite(reduceSinkValueColumnMap.length);

      valueVectorSerializeRow =
          new VectorSerializeRow<LazyBinarySerializeWrite>(valueLazyBinarySerializeWrite);
      valueVectorSerializeRow.init(reduceSinkValueTypeInfos, reduceSinkValueColumnMap);

      valueOutput = new Output();
      valueVectorSerializeRow.setOutput(valueOutput);
    }

    keyWritable = new HiveKey();
    valueBytesWritable = new BytesWritable();

    int limit = conf.getTopN();
    float memUsage = conf.getTopNMemoryUsage();

    if (limit >= 0 && memUsage > 0) {
      reducerHash = new TopNHash();
      reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
    }

    batchCounter = 0;
  }
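
  // Illustrative note (not from the original source): when the reduce key is empty, the key
  // bytes carry at most the operator tag. For example, with reduceSkipTag == false and
  // reduceTagByte == 2, initializeEmptyKey() below produces the one-byte key { 0x02 }, which
  // lets the reducer distinguish rows arriving from different parent operators; with
  // reduceSkipTag == true (or tag == -1) the key is zero length.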
  protected void initializeEmptyKey(int tag) {

    // Use the same logic as ReduceSinkOperator.toHiveKey.
    //
    if (tag == -1 || reduceSkipTag) {
      keyWritable.setSize(0);
    } else {
      keyWritable.setSize(1);
      keyWritable.get()[0] = reduceTagByte;
    }
    keyWritable.setDistKeyLength(0);
    keyWritable.setHashCode(0);
  }

  // The collect method override for TopNHash.BinaryCollector.
  @Override
  public void collect(byte[] key, byte[] value, int hash) throws IOException {
    HiveKey keyWritable = new HiveKey(key, hash);
    BytesWritable valueWritable = new BytesWritable(value);
    doCollect(keyWritable, valueWritable);
  }

  protected void collect(HiveKey keyWritable, BytesWritable valueWritable)
      throws HiveException, IOException {
    if (reducerHash != null) {

      // NOTE: partColsIsNull is only used for PTF, which isn't supported yet.
      final int firstIndex = reducerHash.tryStoreKey(keyWritable, /* partColsIsNull */ false);

      if (firstIndex == TopNHash.EXCLUDE) {
        // Nothing to do.
        return;
      }

      if (firstIndex == TopNHash.FORWARD) {
        doCollect(keyWritable, valueWritable);
      } else {
        Preconditions.checkState(firstIndex >= 0);
        reducerHash.storeValue(firstIndex, keyWritable.hashCode(), valueWritable, false);
      }
    } else {
      doCollect(keyWritable, valueWritable);
    }
  }
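
  // Illustrative note (not from the original source): a rough sketch of the TopNHash flow in
  // collect() above, e.g. for a top-3 (LIMIT 3) query. tryStoreKey() returns FORWARD when the
  // hash is not filtering and the pair should be emitted immediately, EXCLUDE when the key falls
  // outside the current top 3 (the pair is dropped without reaching the output), or a slot
  // index >= 0 when the key was retained; in that last case the value is parked in the hash via
  // storeValue() and the surviving pairs are emitted later through the BinaryCollector
  // collect(byte[], byte[], int) callback when reducerHash.flush() runs in closeOp().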
  private void doCollect(HiveKey keyWritable, BytesWritable valueWritable) throws IOException {

    // Since this is a terminal operator, update counters explicitly -
    // forward is not called.
    if (null != out) {
      numRows++;
      if (isLogInfoEnabled) {
        if (numRows == cntr) {
          cntr = logEveryNRows == 0 ? cntr * 10 : numRows + logEveryNRows;
          if (cntr < 0 || numRows < 0) {
            cntr = 0;
            numRows = 1;
          }
          LOG.info(toString() + ": records written - " + numRows);
        }
      }

      out.collect(keyWritable, valueWritable);
    }
  }

  @Override
  protected void closeOp(boolean abort) throws HiveException {
    if (!abort && reducerHash != null) {
      reducerHash.flush();
    }
    super.closeOp(abort);
    out = null;
    reducerHash = null;
    if (isLogInfoEnabled) {
      LOG.info(toString() + ": records written - " + numRows);
    }
    recordCounter.set(numRows);
  }

  /**
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return getOperatorName();
  }

  static public String getOperatorName() {
    return "RS";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.REDUCESINK;
  }

  @Override
  public VectorizationContext getOuputVectorizationContext() {
    return vContext;
  }

  @Override
  public boolean getIsReduceSink() {
    return true;
  }

  @Override
  public String getReduceOutputName() {
    return conf.getOutputName();
  }

  @Override
  public void setOutputCollector(OutputCollector _out) {
    this.out = _out;
  }
}