/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector;
import java.io.IOException;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.FileSplit;
import com.google.common.base.Preconditions;
/**
* Context for a vectorized row batch. This class does eager deserialization of row data using
* the serde in the RecordReader layer.
* It supports partitions at this layer so that the vectorized batch is populated correctly
* with the partition columns.
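*
* <p>A sketch of typical use (illustrative only: reader-side variables such as
* rowStructObjectInspector and partitionDesc are assumptions, while the methods shown are
* this class's own API):
* <pre>{@code
* VectorizedRowBatchCtx vrbCtx = new VectorizedRowBatchCtx();
* vrbCtx.init(rowStructObjectInspector, scratchColumnTypeNames);
* VectorizedRowBatch batch = vrbCtx.createVectorizedRowBatch();
* Object[] partitionValues = new Object[vrbCtx.getPartitionColumnCount()];
* VectorizedRowBatchCtx.getPartitionValues(vrbCtx, partitionDesc, partitionValues);
* vrbCtx.addPartitionColsToBatch(batch, partitionValues);
* }</pre>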
*/
public class VectorizedRowBatchCtx {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(VectorizedRowBatchCtx.class.getName());
// The following information is for creating VectorizedRowBatch and for helping with
// knowing how the table is partitioned.
//
// It will be stored in MapWork and ReduceWork.
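//
// Batches created by this context lay out their column vectors as:
//   data columns      [0, dataColumnCount)
//   partition columns [dataColumnCount, dataColumnCount + partitionColumnCount)
//   scratch columns   the scratchColumnTypeNames.length columns after those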
private String[] rowColumnNames;
private TypeInfo[] rowColumnTypeInfos;
private int[] dataColumnNums;
private int dataColumnCount;
private int partitionColumnCount;
private String[] scratchColumnTypeNames;
/**
* Constructor for VectorizedRowBatchCtx
*/
public VectorizedRowBatchCtx() {
}
public VectorizedRowBatchCtx(String[] rowColumnNames, TypeInfo[] rowColumnTypeInfos,
int[] dataColumnNums, int partitionColumnCount, String[] scratchColumnTypeNames) {
this.rowColumnNames = rowColumnNames;
this.rowColumnTypeInfos = rowColumnTypeInfos;
this.dataColumnNums = dataColumnNums;
this.partitionColumnCount = partitionColumnCount;
this.scratchColumnTypeNames = scratchColumnTypeNames;
dataColumnCount = rowColumnTypeInfos.length - partitionColumnCount;
}
public String[] getRowColumnNames() {
return rowColumnNames;
}
public TypeInfo[] getRowColumnTypeInfos() {
return rowColumnTypeInfos;
}
public int[] getDataColumnNums() {
return dataColumnNums;
}
public int getDataColumnCount() {
return dataColumnCount;
}
public int getPartitionColumnCount() {
return partitionColumnCount;
}
public String[] getScratchColumnTypeNames() {
return scratchColumnTypeNames;
}
/**
* Initializes the VectorizedRowBatchCtx from a struct object inspector and the scratch
* column type names.
*
* @param structObjectInspector object inspector that shapes the row column names and types
* @param scratchColumnTypeNames type names of the scratch columns
* @throws HiveException
*/
public void init(StructObjectInspector structObjectInspector, String[] scratchColumnTypeNames)
throws HiveException {
// Row column information.
rowColumnNames = VectorizedBatchUtil.columnNamesFromStructObjectInspector(structObjectInspector);
rowColumnTypeInfos = VectorizedBatchUtil.typeInfosFromStructObjectInspector(structObjectInspector);
dataColumnNums = null;
partitionColumnCount = 0;
dataColumnCount = rowColumnTypeInfos.length;
// Scratch column information.
this.scratchColumnTypeNames = scratchColumnTypeNames;
}
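/**
* Looks up the PartitionDesc for the split's path in the current MapWork and extracts the
* typed partition values for this context's partition columns.
*/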
public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, Configuration hiveConf,
FileSplit split, Object[] partitionValues) throws IOException {
Map<Path, PartitionDesc> pathToPartitionInfo = Utilities
.getMapWork(hiveConf).getPathToPartitionInfo();
PartitionDesc partDesc = HiveFileFormatUtils
.getPartitionDescFromPathRecursively(pathToPartitionInfo,
split.getPath(), IOPrepareCache.get().getPartitionDescMap());
getPartitionValues(vrbCtx, partDesc, partitionValues);
}
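/**
* Converts the string values in the partition spec into typed objects, one per partition
* column, in partition column order. A null partition spec yields NULL partition values.
*/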
public static void getPartitionValues(VectorizedRowBatchCtx vrbCtx, PartitionDesc partDesc,
Object[] partitionValues) {
LinkedHashMap<String, String> partSpec = partDesc.getPartSpec();
for (int i = 0; i < vrbCtx.partitionColumnCount; i++) {
Object objectValue;
if (partSpec == null) {
// For a partition-less table, use a NULL partition value.
// We can have a partition-less table even if we have partition keys,
// when there is only one partition selected and the partition key is not
// part of the projection/include list.
objectValue = null;
} else {
String key = vrbCtx.rowColumnNames[vrbCtx.dataColumnCount + i];
// Create a Standard java object Inspector
TypeInfo partColTypeInfo = vrbCtx.rowColumnTypeInfos[vrbCtx.dataColumnCount + i];
ObjectInspector objectInspector =
TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(partColTypeInfo);
objectValue =
ObjectInspectorConverters.
getConverter(PrimitiveObjectInspectorFactory.
javaStringObjectInspector, objectInspector).
convert(partSpec.get(key));
if (partColTypeInfo instanceof CharTypeInfo) {
objectValue = ((HiveChar) objectValue).getStrippedValue();
}
}
partitionValues[i] = objectValue;
}
}
/**
* Creates a vectorized row batch and its column vectors. When {@code dataColumnNums} is
* non-null, column vectors are created only for the included data columns; partition and
* scratch columns always get column vectors.
*
* @return VectorizedRowBatch
*/
public VectorizedRowBatch createVectorizedRowBatch() {
final int dataAndPartColumnCount = rowColumnTypeInfos.length;
final int totalColumnCount = dataAndPartColumnCount + scratchColumnTypeNames.length;
VectorizedRowBatch result = new VectorizedRowBatch(totalColumnCount);
if (dataColumnNums == null) {
// All data and partition columns.
for (int i = 0; i < dataAndPartColumnCount; i++) {
TypeInfo typeInfo = rowColumnTypeInfos[i];
result.cols[i] = VectorizedBatchUtil.createColumnVector(typeInfo);
}
} else {
// Create only needed/included columns data columns.
for (int i = 0; i < dataColumnNums.length; i++) {
int columnNum = dataColumnNums[i];
Preconditions.checkState(columnNum < dataAndPartColumnCount);
TypeInfo typeInfo = rowColumnTypeInfos[columnNum];
result.cols[columnNum] = VectorizedBatchUtil.createColumnVector(typeInfo);
}
// Always create partition columns.
final int endColumnNum = dataColumnCount + partitionColumnCount;
for (int partitionColumnNum = dataColumnCount; partitionColumnNum < endColumnNum; partitionColumnNum++) {
TypeInfo typeInfo = rowColumnTypeInfos[partitionColumnNum];
result.cols[partitionColumnNum] = VectorizedBatchUtil.createColumnVector(typeInfo);
}
}
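// Scratch columns always follow the data and partition columns.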
for (int i = 0; i < scratchColumnTypeNames.length; i++) {
String typeName = scratchColumnTypeNames[i];
result.cols[rowColumnTypeInfos.length + i] =
VectorizedBatchUtil.createColumnVector(typeName);
}
result.setPartitionInfo(dataColumnCount, partitionColumnCount);
result.reset();
return result;
}
/**
* Adds the partition values to the batch. Each partition column vector is filled with a
* single repeating value (or a repeating NULL), since every row in a batch comes from the
* same partition.
*
* @param batch the batch whose partition columns are populated
* @param partitionValues one typed value per partition column, in partition column order
*/
public void addPartitionColsToBatch(VectorizedRowBatch batch, Object[] partitionValues) {
if (partitionValues != null) {
for (int i = 0; i < partitionColumnCount; i++) {
Object value = partitionValues[i];
int colIndex = dataColumnCount + i;
String partitionColumnName = rowColumnNames[colIndex];
PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) rowColumnTypeInfos[colIndex];
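// Each case below fills index 0 and marks the vector repeating, so the single partition
// value (or NULL) applies to every row in the batch.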
switch (primitiveTypeInfo.getPrimitiveCategory()) {
case BOOLEAN: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Boolean) value ? 1 : 0);
lcv.isNull[0] = false;
}
}
break;
case BYTE: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Byte) value);
lcv.isNull[0] = false;
}
}
break;
case SHORT: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Short) value);
lcv.isNull[0] = false;
}
}
break;
case INT: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Integer) value);
lcv.isNull[0] = false;
}
}
break;
case LONG: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Long) value);
lcv.isNull[0] = false;
}
}
break;
case DATE: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(DateWritable.dateToDays((Date) value));
lcv.isNull[0] = false;
}
}
break;
case TIMESTAMP: {
TimestampColumnVector lcv = (TimestampColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Timestamp) value);
lcv.isNull[0] = false;
}
}
break;
case INTERVAL_YEAR_MONTH: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(((HiveIntervalYearMonth) value).getTotalMonths());
lcv.isNull[0] = false;
}
}
break;
case INTERVAL_DAY_TIME: {
IntervalDayTimeColumnVector icv = (IntervalDayTimeColumnVector) batch.cols[colIndex];
if (value == null) {
icv.noNulls = false;
icv.isNull[0] = true;
icv.isRepeating = true;
} else {
icv.fill(((HiveIntervalDayTime) value));
icv.isNull[0] = false;
}
}
break;
case FLOAT: {
DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
if (value == null) {
dcv.noNulls = false;
dcv.isNull[0] = true;
dcv.isRepeating = true;
} else {
dcv.fill((Float) value);
dcv.isNull[0] = false;
}
}
break;
case DOUBLE: {
DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
if (value == null) {
dcv.noNulls = false;
dcv.isNull[0] = true;
dcv.isRepeating = true;
} else {
dcv.fill((Double) value);
dcv.isNull[0] = false;
}
}
break;
case DECIMAL: {
DecimalColumnVector dv = (DecimalColumnVector) batch.cols[colIndex];
if (value == null) {
dv.noNulls = false;
dv.isNull[0] = true;
dv.isRepeating = true;
} else {
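// Set index 0 and mark the vector repeating explicitly rather than using fill().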
HiveDecimal hd = (HiveDecimal) value;
dv.set(0, hd);
dv.isRepeating = true;
dv.isNull[0] = false;
}
}
break;
case BINARY: {
BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
byte[] bytes = (byte[]) value;
if (bytes == null) {
bcv.noNulls = false;
bcv.isNull[0] = true;
bcv.isRepeating = true;
} else {
bcv.fill(bytes);
bcv.isNull[0] = false;
}
}
break;
case STRING:
case CHAR:
case VARCHAR: {
BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
// Check for a NULL partition value before calling toString() to avoid an NPE.
if (value == null) {
bcv.noNulls = false;
bcv.isNull[0] = true;
bcv.isRepeating = true;
} else {
bcv.setVal(0, value.toString().getBytes());
bcv.isNull[0] = false;
bcv.isRepeating = true;
}
}
break;
default:
throw new RuntimeException("Unable to recognize the partition type " + primitiveTypeInfo.getPrimitiveCategory() +
" for column " + partitionColumnName);
}
}
}
}
/**
* Determine whether a given column is a partition column
* @param colNum column number in
* {@link org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch}s created by this context.
* @return true if it is a partition column, false otherwise
*/
public final boolean isPartitionCol(int colNum) {
return colNum >= dataColumnCount && colNum < rowColumnTypeInfos.length;
}
}