/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.vector;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Charsets;

/**
 * This class extracts specified VectorizedRowBatch row columns into writables.
 *
 * The caller provides the data types and projection column numbers of a subset of the columns
 * to extract.
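 *
 * Example usage, as a minimal sketch (the typeInfos, batch, and batchIndex variables here are
 * hypothetical placeholders, not members of this class):
 * <pre>
 *   VectorExtractRow vectorExtractRow = new VectorExtractRow();
 *   vectorExtractRow.init(typeInfos);    // no projection: columns 0 .. typeInfos.length - 1
 *   Object[] row = new Object[vectorExtractRow.getCount()];
 *   vectorExtractRow.extractRow(batch, batchIndex, row);
 * </pre>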
 */
public class VectorExtractRow {

  private static final long serialVersionUID = 1L;

  private static final Logger LOG = LoggerFactory.getLogger(VectorExtractRow.class);

  /*
   * These members have information for extracting a row's column objects from VectorizedRowBatch
   * columns.
   */
  int[] projectionColumnNums;
              // Extraction can be a subset of columns, so this is the projection --
              // the batch column numbers.

  TypeInfo[] typeInfos;

  ObjectInspector[] objectInspectors;

  /*
   * Allocate the various arrays.
   */
  private void allocateArrays(int count) {
    projectionColumnNums = new int[count];
    typeInfos = new TypeInfo[count];
    objectInspectors = new ObjectInspector[count];
  }

  /*
   * Initialize one column's array entries.
   */
  private void initEntry(int logicalColumnIndex, int projectionColumnNum, TypeInfo typeInfo) {
    projectionColumnNums[logicalColumnIndex] = projectionColumnNum;
    typeInfos[logicalColumnIndex] = typeInfo;
    objectInspectors[logicalColumnIndex] =
        TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo);
  }

  /*
   * Initialize using a StructObjectInspector and a column projection list.
   */
  public void init(StructObjectInspector structObjectInspector, List<Integer> projectedColumns)
      throws HiveException {

    List<? extends StructField> fields = structObjectInspector.getAllStructFieldRefs();
    final int count = fields.size();
    allocateArrays(count);

    for (int i = 0; i < count; i++) {

      int projectionColumnNum = projectedColumns.get(i);

      StructField field = fields.get(i);
      ObjectInspector fieldInspector = field.getFieldObjectInspector();
      TypeInfo typeInfo =
          TypeInfoUtils.getTypeInfoFromTypeString(fieldInspector.getTypeName());

      initEntry(i, projectionColumnNum, typeInfo);
    }
  }

  /*
   * Initialize using a TypeInfo array and a column projection array.
   */
  public void init(TypeInfo[] typeInfos, int[] projectedColumns)
      throws HiveException {

    final int count = typeInfos.length;
    allocateArrays(count);

    for (int i = 0; i < count; i++) {
      initEntry(i, projectedColumns[i], typeInfos[i]);
    }
  }

  /*
   * Initialize using data type names.
   * No projection -- the column range 0 .. types.size() - 1.
   */
  public void init(List<String> typeNames) throws HiveException {

    final int count = typeNames.size();
    allocateArrays(count);

    for (int i = 0; i < count; i++) {
      initEntry(i, i, TypeInfoUtils.getTypeInfoFromTypeString(typeNames.get(i)));
    }
  }

  public void init(TypeInfo[] typeInfos) throws HiveException {

    final int count = typeInfos.length;
    allocateArrays(count);

    for (int i = 0; i < count; i++) {
      initEntry(i, i, typeInfos[i]);
    }
  }

  public int getCount() {
    return projectionColumnNums.length;
  }

  /**
   * Extract a row's column object from the ColumnVector at batchIndex in the VectorizedRowBatch.
   *
   * @param batch the batch to extract from
   * @param batchIndex the index of the row within the batch
   * @param logicalColumnIndex the index into the projection
   * @return the extracted column object, or null
   */
  private Object extractRowColumn(VectorizedRowBatch batch, int batchIndex,
      int logicalColumnIndex) {

    final int projectionColumnNum = projectionColumnNums[logicalColumnIndex];
    final ColumnVector colVector = batch.cols[projectionColumnNum];
    return extractRowColumn(
        colVector, typeInfos[logicalColumnIndex], objectInspectors[logicalColumnIndex],
        batchIndex);
  }

  Object extractRowColumn(
      ColumnVector colVector, TypeInfo typeInfo, ObjectInspector objectInspector, int batchIndex) {

    if (colVector == null) {
      // The planner will not include unneeded columns for reading but other parts of execution
      // may ask for them.
      return null;
    }
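    // A repeating vector stores a single value in element 0 that logically applies to every row,
    // so read element 0 instead of batchIndex in that case.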
    final int adjustedIndex = (colVector.isRepeating ? 0 : batchIndex);

    if (!colVector.noNulls && colVector.isNull[adjustedIndex]) {
      return null;
    }

    final Category category = typeInfo.getCategory();
    switch (category) {
    case PRIMITIVE:
      {
        final PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo;
        final PrimitiveCategory primitiveCategory = primitiveTypeInfo.getPrimitiveCategory();
        final Writable primitiveWritable =
            VectorizedBatchUtil.getPrimitiveWritable(primitiveCategory);
        switch (primitiveCategory) {
        case VOID:
          return null;
        case BOOLEAN:
          ((BooleanWritable) primitiveWritable).set(
              ((LongColumnVector) colVector).vector[adjustedIndex] != 0);
          return primitiveWritable;
        case BYTE:
          ((ByteWritable) primitiveWritable).set(
              (byte) ((LongColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case SHORT:
          ((ShortWritable) primitiveWritable).set(
              (short) ((LongColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case INT:
          ((IntWritable) primitiveWritable).set(
              (int) ((LongColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case LONG:
          ((LongWritable) primitiveWritable).set(
              ((LongColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case TIMESTAMP:
          ((TimestampWritable) primitiveWritable).set(
              ((TimestampColumnVector) colVector).asScratchTimestamp(adjustedIndex));
          return primitiveWritable;
        case DATE:
          ((DateWritable) primitiveWritable).set(
              (int) ((LongColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case FLOAT:
          ((FloatWritable) primitiveWritable).set(
              (float) ((DoubleColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case DOUBLE:
          ((DoubleWritable) primitiveWritable).set(
              ((DoubleColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case BINARY:
          {
            final BytesColumnVector bytesColVector = ((BytesColumnVector) colVector);
            final byte[] bytes = bytesColVector.vector[adjustedIndex];
            final int start = bytesColVector.start[adjustedIndex];
            final int length = bytesColVector.length[adjustedIndex];

            if (bytes == null) {
              LOG.info("null binary entry: batchIndex " + batchIndex);
            }

            BytesWritable bytesWritable = (BytesWritable) primitiveWritable;
            bytesWritable.set(bytes, start, length);
            return primitiveWritable;
          }
        case STRING:
          {
            final BytesColumnVector bytesColVector = ((BytesColumnVector) colVector);
            final byte[] bytes = bytesColVector.vector[adjustedIndex];
            final int start = bytesColVector.start[adjustedIndex];
            final int length = bytesColVector.length[adjustedIndex];

            if (bytes == null) {
              nullBytesReadError(primitiveCategory, batchIndex);
            }

            // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
            ((Text) primitiveWritable).set(bytes, start, length);
            return primitiveWritable;
          }
        case VARCHAR:
          {
            final BytesColumnVector bytesColVector = ((BytesColumnVector) colVector);
            final byte[] bytes = bytesColVector.vector[adjustedIndex];
            final int start = bytesColVector.start[adjustedIndex];
            final int length = bytesColVector.length[adjustedIndex];

            if (bytes == null) {
              nullBytesReadError(primitiveCategory, batchIndex);
            }

            final int adjustedLength = StringExpr.truncate(bytes, start, length,
                ((VarcharTypeInfo) primitiveTypeInfo).getLength());

            final HiveVarcharWritable hiveVarcharWritable =
                (HiveVarcharWritable) primitiveWritable;
            hiveVarcharWritable.set(new String(bytes, start, adjustedLength, Charsets.UTF_8), -1);
            return primitiveWritable;
          }
        case CHAR:
          {
            final BytesColumnVector bytesColVector = ((BytesColumnVector) colVector);
            final byte[] bytes = bytesColVector.vector[adjustedIndex];
            final int start = bytesColVector.start[adjustedIndex];
            final int length = bytesColVector.length[adjustedIndex];

            if (bytes == null) {
              nullBytesReadError(primitiveCategory, batchIndex);
            }

            final int adjustedLength = StringExpr.rightTrimAndTruncate(bytes, start, length,
                ((CharTypeInfo) primitiveTypeInfo).getLength());

            final HiveCharWritable hiveCharWritable = (HiveCharWritable) primitiveWritable;
            hiveCharWritable.set(new String(bytes, start, adjustedLength, Charsets.UTF_8),
                ((CharTypeInfo) primitiveTypeInfo).getLength());
            return primitiveWritable;
          }
        case DECIMAL:
          // The HiveDecimalWritable set method will quickly copy the deserialized decimal
          // writable fields.
          ((HiveDecimalWritable) primitiveWritable).set(
              ((DecimalColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case INTERVAL_YEAR_MONTH:
          ((HiveIntervalYearMonthWritable) primitiveWritable).set(
              (int) ((LongColumnVector) colVector).vector[adjustedIndex]);
          return primitiveWritable;
        case INTERVAL_DAY_TIME:
          ((HiveIntervalDayTimeWritable) primitiveWritable).set(
              ((IntervalDayTimeColumnVector) colVector).asScratchIntervalDayTime(adjustedIndex));
          return primitiveWritable;
        default:
          throw new RuntimeException("Primitive category " + primitiveCategory.name() +
              " not supported");
        }
      }
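    // Complex types are extracted by recursing on extractRowColumn for each child element,
    // building standard Java objects (List, Map, struct, union) from the child vectors.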
    case LIST:
      {
        final ListColumnVector listColumnVector = (ListColumnVector) colVector;
        final ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
        final ListObjectInspector listObjectInspector = (ListObjectInspector) objectInspector;
        final int offset = (int) listColumnVector.offsets[adjustedIndex];
        final int size = (int) listColumnVector.lengths[adjustedIndex];

        final List<Object> list = new ArrayList<Object>();
        for (int i = 0; i < size; i++) {
          list.add(
              extractRowColumn(
                  listColumnVector.child,
                  listTypeInfo.getListElementTypeInfo(),
                  listObjectInspector.getListElementObjectInspector(),
                  offset + i));
        }
        return list;
      }
    case MAP:
      {
        final MapColumnVector mapColumnVector = (MapColumnVector) colVector;
        final MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
        final MapObjectInspector mapObjectInspector = (MapObjectInspector) objectInspector;
        final int offset = (int) mapColumnVector.offsets[adjustedIndex];
        final int size = (int) mapColumnVector.lengths[adjustedIndex];

        final Map<Object, Object> map = new HashMap<Object, Object>();
        for (int i = 0; i < size; i++) {
          final Object key = extractRowColumn(
              mapColumnVector.keys,
              mapTypeInfo.getMapKeyTypeInfo(),
              mapObjectInspector.getMapKeyObjectInspector(),
              offset + i);
          final Object value = extractRowColumn(
              mapColumnVector.values,
              mapTypeInfo.getMapValueTypeInfo(),
              mapObjectInspector.getMapValueObjectInspector(),
              offset + i);
          map.put(key, value);
        }
        return map;
      }
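    // For structs, each field has its own child vector; extract every field at the same
    // adjusted index and assemble them with the standard struct object inspector.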
    case STRUCT:
      {
        final StructColumnVector structColumnVector = (StructColumnVector) colVector;
        final StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
        final StandardStructObjectInspector structInspector =
            (StandardStructObjectInspector) objectInspector;
        final List<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
        final int size = fieldTypeInfos.size();
        final List<? extends StructField> structFields = structInspector.getAllStructFieldRefs();

        final Object struct = structInspector.create();
        for (int i = 0; i < size; i++) {
          final StructField structField = structFields.get(i);
          final TypeInfo fieldTypeInfo = fieldTypeInfos.get(i);
          final Object value = extractRowColumn(
              structColumnVector.fields[i], fieldTypeInfo, structField.getFieldObjectInspector(),
              adjustedIndex);
          structInspector.setStructFieldData(struct, structField, value);
        }
        return struct;
      }
    case UNION:
      {
        final UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
        final List<TypeInfo> objectTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
        final UnionObjectInspector unionInspector = (UnionObjectInspector) objectInspector;
        final List<ObjectInspector> unionInspectors = unionInspector.getObjectInspectors();
        final UnionColumnVector unionColumnVector = (UnionColumnVector) colVector;
        final byte tag = (byte) unionColumnVector.tags[adjustedIndex];
        final Object object = extractRowColumn(
            unionColumnVector.fields[tag], objectTypeInfos.get(tag), unionInspectors.get(tag),
            adjustedIndex);

        final StandardUnion standardUnion = new StandardUnion();
        standardUnion.setTag(tag);
        standardUnion.setObject(object);
        return standardUnion;
      }
    default:
      throw new RuntimeException("Category " + category.name() + " not supported");
    }
  }

  /**
   * Extract a row object from a VectorizedRowBatch at batchIndex.
   *
   * @param batch the batch to extract from
   * @param batchIndex the index of the row within the batch
   * @param objects the output array; receives one object per projected column
   */
  public void extractRow(VectorizedRowBatch batch, int batchIndex, Object[] objects) {
    for (int i = 0; i < projectionColumnNums.length; i++) {
      objects[i] = extractRowColumn(batch, batchIndex, i);
    }
  }

  private void nullBytesReadError(PrimitiveCategory primitiveCategory, int batchIndex) {
    throw new RuntimeException("null " + primitiveCategory.name() +
        " entry: batchIndex " + batchIndex);
  }
}