/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.vector;

import java.io.IOException;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.Explain;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hive.common.util.DateUtils;

import com.google.common.base.Preconditions;

/**
 * Context for Vectorized row batch. This class does eager deserialization of row data using
 * a serde in the RecordReader layer.
 * It supports partitions in this layer so that the vectorized batch is populated correctly
 * with the partition columns.
 */
public class VectorizedRowBatchCtx {

  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(VectorizedRowBatchCtx.class.getName());

  // The following information is for creating VectorizedRowBatch and for helping with
  // knowing how the table is partitioned.
  //
  // It will be stored in MapWork and ReduceWork.
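  //
  // As an illustration only (a hypothetical table, not taken from any real query plan):
  // for data columns (a INT, b STRING), one partition column (ds STRING) and one scratch
  // column of type double, the batch produced by createVectorizedRowBatch() lays out its
  // columns as
  //
  //   cols[0] = a   (data)
  //   cols[1] = b   (data)
  //   cols[2] = ds  (partition)
  //   cols[3] =     (scratch, double)
  //
  // i.e. data columns first, then partition columns, then scratch columns.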
  private String[] rowColumnNames;
  private TypeInfo[] rowColumnTypeInfos;
  private int[] dataColumnNums;
  private int dataColumnCount;
  private int partitionColumnCount;

  private String[] scratchColumnTypeNames;

  /**
   * Constructor for VectorizedRowBatchCtx
   */
  public VectorizedRowBatchCtx() {
  }

  public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeInfos,
      int[] dataColumnNums, int partitionColumnCount, String[] scratchColumnTypeNames) {
    this.rowColumnNames = rowColumnNames;
    this.rowColumnTypeInfos = rowColumnTypeInfos;
    this.dataColumnNums = dataColumnNums;
    this.partitionColumnCount = partitionColumnCount;
    this.scratchColumnTypeNames = scratchColumnTypeNames;

    dataColumnCount = rowColumnTypeInfos.length - partitionColumnCount;
  }

  public String[] getRowColumnNames() {
    return rowColumnNames;
  }

  public TypeInfo[] getRowColumnTypeInfos() {
    return rowColumnTypeInfos;
  }

  public int[] getDataColumnNums() {
    return dataColumnNums;
  }

  public int getDataColumnCount() {
    return dataColumnCount;
  }

  public int getPartitionColumnCount() {
    return partitionColumnCount;
  }

  public String[] getScratchColumnTypeNames() {
    return scratchColumnTypeNames;
  }

  /**
   * Initializes the VectorizedRowBatch context based on an object inspector and the
   * scratch column type names.
   * @param structObjectInspector
   *          Object inspector that shapes the column types
   * @param scratchColumnTypeNames
   *          Type names of the scratch columns
   * @throws HiveException
   */
  public void init(StructObjectInspector structObjectInspector, String[] scratchColumnTypeNames)
          throws HiveException {

    // Row column information.
    rowColumnNames = VectorizedBatchUtil.columnNamesFromStructObjectInspector(structObjectInspector);
    rowColumnTypeInfos = VectorizedBatchUtil.typeInfosFromStructObjectInspector(structObjectInspector);
    dataColumnNums = null;
    partitionColumnCount = 0;
    dataColumnCount = rowColumnTypeInfos.length;

    // Scratch column information.
    this.scratchColumnTypeNames = scratchColumnTypeNames;
  }

  public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, Configuration hiveConf,
      FileSplit split, Object[] partitionValues) throws IOException {

    Map<Path, PartitionDesc> pathToPartitionInfo = Utilities
        .getMapWork(hiveConf).getPathToPartitionInfo();

    PartitionDesc partDesc = HiveFileFormatUtils
        .getPartitionDescFromPathRecursively(pathToPartitionInfo,
            split.getPath(), IOPrepareCache.get().getPartitionDescMap());

    getPartitionValues(vrbCtx, partDesc, partitionValues);
  }

  public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, PartitionDesc partDesc,
      Object[] partitionValues) {

    LinkedHashMap<String, String> partSpec = partDesc.getPartSpec();

    for (int i = 0; i < vrbCtx.partitionColumnCount; i++) {
      Object objectValue;
      if (partSpec == null) {
        // For a partition-less table, initialize partValue to null.
        // We can have a partition-less table even if we have partition keys
        // when there is only one partition selected and the partition key is not
        // part of the projection/include list.
        objectValue = null;
      } else {
        String key = vrbCtx.rowColumnNames[vrbCtx.dataColumnCount + i];

        // Create a standard Java ObjectInspector for the partition column type.
        TypeInfo partColTypeInfo = vrbCtx.rowColumnTypeInfos[vrbCtx.dataColumnCount + i];
        ObjectInspector objectInspector =
            TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(partColTypeInfo);
        objectValue = ObjectInspectorConverters.
            getConverter(PrimitiveObjectInspectorFactory.
                javaStringObjectInspector, objectInspector).
                    convert(partSpec.get(key));
        if (partColTypeInfo instanceof CharTypeInfo) {
          objectValue = ((HiveChar) objectValue).getStrippedValue();
        }
      }
      partitionValues[i] = objectValue;
    }
  }
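  /*
   * A minimal usage sketch (hypothetical; "conf" and "split" are assumed to come from the
   * surrounding record-reader code) showing how the helpers above and below are typically
   * combined:
   *
   *   Object[] partitionValues = new Object[vrbCtx.getPartitionColumnCount()];
   *   VectorizedRowBatchCtx.getPartitionValues(vrbCtx, conf, split, partitionValues);
   *   VectorizedRowBatch batch = vrbCtx.createVectorizedRowBatch();
   *   vrbCtx.addPartitionColsToBatch(batch, partitionValues);
   */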
  /**
   * Creates a Vectorized row batch and the column vectors.
   *
   * @return VectorizedRowBatch
   * @throws HiveException
   */
  public VectorizedRowBatch createVectorizedRowBatch() {
    final int dataAndPartColumnCount = rowColumnTypeInfos.length;
    final int totalColumnCount = dataAndPartColumnCount + scratchColumnTypeNames.length;
    VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount);

    if (dataColumnNums == null) {
      // All data and partition columns.
      for (int i = 0; i < dataAndPartColumnCount; i++) {
        TypeInfo typeInfo = rowColumnTypeInfos[i];
        result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo);
      }
    } else {
      // Create only the needed/included data columns.
      for (int i = 0; i < dataColumnNums.length; i++) {
        int columnNum = dataColumnNums[i];
        Preconditions.checkState(columnNum < dataAndPartColumnCount);
        TypeInfo typeInfo = rowColumnTypeInfos[columnNum];
        result.cols[columnNum] = VectorizedBatchUtil.createColumnVector(typeInfo);
      }

      // Always create partition columns.
      final int endColumnNum = dataColumnCount + partitionColumnCount;
      for (int partitionColumnNum = dataColumnCount;
          partitionColumnNum < endColumnNum; partitionColumnNum++) {
        TypeInfo typeInfo = rowColumnTypeInfos[partitionColumnNum];
        result.cols[partitionColumnNum] = VectorizedBatchUtil.createColumnVector(typeInfo);
      }
    }

    for (int i = 0; i < scratchColumnTypeNames.length; i++) {
      String typeName = scratchColumnTypeNames[i];
      result.cols[rowColumnTypeInfos.length + i] = VectorizedBatchUtil.createColumnVector(typeName);
    }

    result.setPartitionInfo(dataColumnCount, partitionColumnCount);

    result.reset();
    return result;
  }
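  /*
   * For example (a hypothetical include list, not from any real plan): with
   * rowColumnTypeInfos covering data columns {0, 1} plus one partition column {2},
   * dataColumnNums = {0} and a single scratch column, createVectorizedRowBatch() above
   * allocates cols[0] (included data), cols[2] (partition, always allocated) and
   * cols[3] (scratch); cols[1] is left null because it is not in the include list.
   */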
  /**
   * Add the partition values to the batch
   *
   * @param batch
   * @param partitionValues
   * @throws HiveException
   */
  public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partitionValues) {
    if (partitionValues != null) {
      for (int i = 0; i < partitionColumnCount; i++) {
        Object value = partitionValues[i];

        int colIndex = dataColumnCount + i;
        String partitionColumnName = rowColumnNames[colIndex];
        PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) rowColumnTypeInfos[colIndex];
        switch (primitiveTypeInfo.getPrimitiveCategory()) {
        case BOOLEAN: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill((Boolean) value == true ? 1 : 0);
            lcv.isNull[0] = false;
          }
        }
        break;

        case BYTE: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill((Byte) value);
            lcv.isNull[0] = false;
          }
        }
        break;

        case SHORT: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill((Short) value);
            lcv.isNull[0] = false;
          }
        }
        break;

        case INT: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill((Integer) value);
            lcv.isNull[0] = false;
          }
        }
        break;

        case LONG: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill((Long) value);
            lcv.isNull[0] = false;
          }
        }
        break;

        case DATE: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill(DateWritable.dateToDays((Date) value));
            lcv.isNull[0] = false;
          }
        }
        break;

        case TIMESTAMP: {
          TimestampColumnVector lcv = (TimestampColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill((Timestamp) value);
            lcv.isNull[0] = false;
          }
        }
        break;

        case INTERVAL_YEAR_MONTH: {
          LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
          if (value == null) {
            lcv.noNulls = false;
            lcv.isNull[0] = true;
            lcv.isRepeating = true;
          } else {
            lcv.fill(((HiveIntervalYearMonth) value).getTotalMonths());
            lcv.isNull[0] = false;
          }
        }
        break;

        case INTERVAL_DAY_TIME: {
          IntervalDayTimeColumnVector icv = (IntervalDayTimeColumnVector) batch.cols[colIndex];
          if (value == null) {
            icv.noNulls = false;
            icv.isNull[0] = true;
            icv.isRepeating = true;
          } else {
            icv.fill(((HiveIntervalDayTime) value));
            icv.isNull[0] = false;
          }
        }
        break;

        case FLOAT: {
          DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
          if (value == null) {
            dcv.noNulls = false;
            dcv.isNull[0] = true;
            dcv.isRepeating = true;
          } else {
            dcv.fill((Float) value);
            dcv.isNull[0] = false;
          }
        }
        break;

        case DOUBLE: {
          DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
          if (value == null) {
            dcv.noNulls = false;
            dcv.isNull[0] = true;
            dcv.isRepeating = true;
          } else {
            dcv.fill((Double) value);
            dcv.isNull[0] = false;
          }
        }
        break;

        case DECIMAL: {
          DecimalColumnVector dv = (DecimalColumnVector) batch.cols[colIndex];
          if (value == null) {
            dv.noNulls = false;
            dv.isNull[0] = true;
            dv.isRepeating = true;
          } else {
            HiveDecimal hd = (HiveDecimal) value;
            dv.set(0, hd);
            dv.isRepeating = true;
            dv.isNull[0] = false;
          }
        }
        break;

        case BINARY: {
          BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
          byte[] bytes = (byte[]) value;
          if (bytes == null) {
            bcv.noNulls = false;
            bcv.isNull[0] = true;
            bcv.isRepeating = true;
          } else {
            bcv.fill(bytes);
            bcv.isNull[0] = false;
          }
        }
        break;

        case STRING:
        case CHAR:
        case VARCHAR: {
          BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
          if (value == null) {
            bcv.noNulls = false;
            bcv.isNull[0] = true;
            bcv.isRepeating = true;
          } else {
            bcv.setVal(0, value.toString().getBytes());
            bcv.isRepeating = true;
          }
        }
        break;

        default:
          throw new RuntimeException("Unable to recognize the partition type " +
              primitiveTypeInfo.getPrimitiveCategory() + " for column " + partitionColumnName);
        }
      }
    }
  }
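  /*
   * Note on the switch above: the ColumnVector fill(...) calls populate element 0 and mark
   * the vector as repeating, so a single partition value logically stands for every row in
   * the batch. For example (made-up value), a partition column ds='2016-01-01' on a
   * 1024-row batch stores the bytes once and relies on isRepeating for all other rows.
   */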
  /**
   * Determine whether a given column is a partition column.
   * @param colNum column number in
   * {@link org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch}s created by this context.
   * @return true if it is a partition column, false otherwise.
   */
  public final boolean isPartitionCol(int colNum) {
    return colNum >= dataColumnCount && colNum < rowColumnTypeInfos.length;
  }

}