/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.VectorPartitionConversion;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
/**
* This class deserializes a serialization format into a row of a VectorizedRowBatch.
*
* The caller provides the Hive type names and output column numbers in the order they are to be
* deserialized.
*
* This class uses a provided DeserializeRead object to deserialize directly, reading
* field-by-field from a serialization format into the primitive values of the VectorizedRowBatch.
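*
* An illustrative usage sketch (the concrete DeserializeRead implementation and the surrounding
* batch handling shown here are assumptions for the example, not the only way to drive this class):
* <pre>{@code
* TypeInfo[] typeInfos = ...;   // source schema of the serialized rows
*
* // Any DeserializeRead implementation works; LazyBinaryDeserializeRead is one example.
* // Constructing it with useExternalBuffer = true avoids buffer copying for more efficient reading.
* LazyBinaryDeserializeRead deserializeRead = new LazyBinaryDeserializeRead(typeInfos, true);
*
* VectorDeserializeRow<LazyBinaryDeserializeRead> rowDeserializer =
*     new VectorDeserializeRow<LazyBinaryDeserializeRead>(deserializeRead);
* rowDeserializer.init();   // deserialize into batch columns 0 .. typeInfos.length - 1
*
* // For each serialized row:
* rowDeserializer.setBytes(rowBytes, rowOffset, rowLength);
* rowDeserializer.deserialize(batch, batch.size++);
* }</pre>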
*/
public final class VectorDeserializeRow<T extends DeserializeRead> {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(VectorDeserializeRow.class);
private T deserializeRead;
private TypeInfo[] sourceTypeInfos;
private byte[] inputBytes;
/**
* @param deserializeRead the DeserializeRead object to read fields from. Construct it with
*                        useExternalBuffer set to true to avoid buffer copying and to get
*                        more efficient reading.
*/
public VectorDeserializeRow(T deserializeRead) {
this();
this.deserializeRead = deserializeRead;
sourceTypeInfos = deserializeRead.typeInfos();
}
// Not public since we must have the deserialize read object.
private VectorDeserializeRow() {
}
private static class Field {
private Category category;
private PrimitiveCategory primitiveCategory;
// The data type primitive category of the column being deserialized.
private int maxLength;
// For the CHAR and VARCHAR data types, the maximum character length of
// the column. Otherwise, 0.
private boolean isConvert;
/*
* This member has information for data type conversion.
* Not defined if there is no conversion.
*/
Writable conversionWritable;
// Conversion requires the source value to be placed in a Writable so we can call on
// VectorAssignRow to convert and assign the row column.
private ComplexTypeHelper complexTypeHelper;
// For a complex type, a helper object that describes elements, key/value pairs,
// or fields.
public Field(PrimitiveCategory primitiveCategory, int maxLength) {
this.category = Category.PRIMITIVE;
this.primitiveCategory = primitiveCategory;
this.maxLength = maxLength;
this.isConvert = false;
this.conversionWritable = null;
this.complexTypeHelper = null;
}
public Field(Category category, ComplexTypeHelper complexTypeHelper) {
this.category = category;
this.primitiveCategory = null;
this.maxLength = 0;
this.isConvert = false;
this.conversionWritable = null;
this.complexTypeHelper = complexTypeHelper;
}
public Category getCategory() {
return category;
}
public PrimitiveCategory getPrimitiveCategory() {
return primitiveCategory;
}
public int getMaxLength() {
return maxLength;
}
public void setIsConvert(boolean isConvert) {
this.isConvert = isConvert;
}
public boolean getIsConvert() {
return isConvert;
}
public void setConversionWritable(Writable conversionWritable) {
this.conversionWritable = conversionWritable;
}
public Writable getConversionWritable() {
return conversionWritable;
}
public ComplexTypeHelper getComplexHelper() {
return complexTypeHelper;
}
}
/*
* These members have information for deserializing a row into the VectorizedRowBatch
* columns.
*
* We say "source" because when there is conversion we are converting th deserialized source into
* a target data type.
*/
private boolean useReadField;
// True when the (random access) readField method of DeserializeRead is being used.
private int[] readFieldLogicalIndices;
// The logical indices for reading with readField.
private int[] projectionColumnNums;
// Assigning can be a subset of columns, so this is the projection --
// the batch column numbers.
private Field[] topLevelFields;
VectorAssignRow convertVectorAssignRow;
// Use its conversion ability.
/*
* Allocate the source deserialization related arrays.
*/
private void allocateArrays(int count) {
projectionColumnNums = new int[count];
Arrays.fill(projectionColumnNums, -1);
topLevelFields = new Field[count];
}
private Field allocatePrimitiveField(TypeInfo sourceTypeInfo) {
final PrimitiveTypeInfo sourcePrimitiveTypeInfo = (PrimitiveTypeInfo) sourceTypeInfo;
final PrimitiveCategory sourcePrimitiveCategory = sourcePrimitiveTypeInfo.getPrimitiveCategory();
final int maxLength;
switch (sourcePrimitiveCategory) {
case CHAR:
maxLength = ((CharTypeInfo) sourcePrimitiveTypeInfo).getLength();
break;
case VARCHAR:
maxLength = ((VarcharTypeInfo) sourcePrimitiveTypeInfo).getLength();
break;
default:
// No additional data type specific setting.
maxLength = 0;
break;
}
return new Field(sourcePrimitiveCategory, maxLength);
}
private Field allocateComplexField(TypeInfo sourceTypeInfo) {
final Category category = sourceTypeInfo.getCategory();
switch (category) {
case LIST:
{
final ListTypeInfo listTypeInfo = (ListTypeInfo) sourceTypeInfo;
final ListComplexTypeHelper listHelper =
new ListComplexTypeHelper(
allocateField(listTypeInfo.getListElementTypeInfo()));
return new Field(category, listHelper);
}
case MAP:
{
final MapTypeInfo mapTypeInfo = (MapTypeInfo) sourceTypeInfo;
final MapComplexTypeHelper mapHelper =
new MapComplexTypeHelper(
allocateField(mapTypeInfo.getMapKeyTypeInfo()),
allocateField(mapTypeInfo.getMapValueTypeInfo()));
return new Field(category, mapHelper);
}
case STRUCT:
{
final StructTypeInfo structTypeInfo = (StructTypeInfo) sourceTypeInfo;
final ArrayList<TypeInfo> fieldTypeInfoList = structTypeInfo.getAllStructFieldTypeInfos();
final int count = fieldTypeInfoList.size();
final Field[] fields = new Field[count];
for (int i = 0; i < count; i++) {
fields[i] = allocateField(fieldTypeInfoList.get(i));
}
final StructComplexTypeHelper structHelper =
new StructComplexTypeHelper(fields);
return new Field(category, structHelper);
}
case UNION:
{
final UnionTypeInfo unionTypeInfo = (UnionTypeInfo) sourceTypeInfo;
final List<TypeInfo> fieldTypeInfoList = unionTypeInfo.getAllUnionObjectTypeInfos();
final int count = fieldTypeInfoList.size();
final Field[] fields = new Field[count];
for (int i = 0; i < count; i++) {
fields[i] = allocateField(fieldTypeInfoList.get(i));
}
final UnionComplexTypeHelper unionHelper =
new UnionComplexTypeHelper(fields);
return new Field(category, unionHelper);
}
default:
throw new RuntimeException("Category " + category + " not supported");
}
}
private Field allocateField(TypeInfo sourceTypeInfo) {
switch (sourceTypeInfo.getCategory()) {
case PRIMITIVE:
return allocatePrimitiveField(sourceTypeInfo);
case LIST:
case MAP:
case STRUCT:
case UNION:
return allocateComplexField(sourceTypeInfo);
default:
throw new RuntimeException("Category " + sourceTypeInfo.getCategory() + " not supported");
}
}
/*
* Initialize one column's source deserialization information.
*/
private void initTopLevelField(int logicalColumnIndex, int projectionColumnNum, TypeInfo sourceTypeInfo) {
projectionColumnNums[logicalColumnIndex] = projectionColumnNum;
topLevelFields[logicalColumnIndex] = allocateField(sourceTypeInfo);
}
/*
* Initialize the conversion related arrays. Assumes initTopLevelField has already been called.
*/
private void addTopLevelConversion(int logicalColumnIndex) {
final Field field = topLevelFields[logicalColumnIndex];
field.setIsConvert(true);
if (field.getCategory() == Category.PRIMITIVE) {
field.setConversionWritable(
VectorizedBatchUtil.getPrimitiveWritable(field.getPrimitiveCategory()));
}
}
/*
* Specify the columns to deserialize into as an array.
*/
public void init(int[] outputColumns) throws HiveException {
final int count = sourceTypeInfos.length;
allocateArrays(count);
for (int i = 0; i < count; i++) {
int outputColumn = outputColumns[i];
initTopLevelField(i, outputColumn, sourceTypeInfos[i]);
}
}
/*
* Specify the columns to deserialize into as a list.
*/
public void init(List<Integer> outputColumns) throws HiveException {
final int count = sourceTypeInfos.length;
allocateArrays(count);
for (int i = 0; i < count; i++) {
int outputColumn = outputColumns.get(i);
initTopLevelField(i, outputColumn, sourceTypeInfos[i]);
}
}
/*
* Specify the columns to deserialize into a range starting at a column number.
*/
public void init(int startColumn) throws HiveException {
final int count = sourceTypeInfos.length;
allocateArrays(count);
for (int i = 0; i < count; i++) {
int outputColumn = startColumn + i;
initTopLevelField(i, outputColumn, sourceTypeInfos[i]);
}
}
public void init(boolean[] columnsToIncludeTruncated) throws HiveException {
// When columnsToIncludeTruncated is supplied, its length must equal the number of source
// type infos.
Preconditions.checkState(
columnsToIncludeTruncated == null ||
columnsToIncludeTruncated.length == sourceTypeInfos.length);
final int columnCount = sourceTypeInfos.length;
allocateArrays(columnCount);
int includedCount = 0;
final int[] includedIndices = new int[columnCount];
for (int i = 0; i < columnCount; i++) {
if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) {
// Field not included in query.
} else {
initTopLevelField(i, i, sourceTypeInfos[i]);
includedIndices[includedCount++] = i;
}
}
// Optimizing for readField?
if (includedCount < columnCount && deserializeRead.isReadFieldSupported()) {
useReadField = true;
readFieldLogicalIndices = Arrays.copyOf(includedIndices, includedCount);
}
}
/**
* Initialize for converting the source data types that are going to be read with the
* DeserializeRead interface passed to the constructor into the target data types desired in
* the VectorizedRowBatch.
*
* No projection -- the column range 0 .. columnCount-1 is used.
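*
* A minimal sketch of the conversion scenario this supports (the column types here are
* assumptions for illustration): a partition file was written with an int column that the table
* schema now declares as bigint, so the deserialized int value must be converted as it is
* assigned into the batch.
* <pre>{@code
* // sourceTypeInfos (from the DeserializeRead) describe the file, e.g. {int, string};
* // targetTypeInfos describe the table / VectorizedRowBatch, e.g. {bigint, string}.
* rowDeserializer.initConversion(targetTypeInfos, null);   // null: include all columns
* }</pre>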
*
* @param targetTypeInfos
* @param columnsToIncludeTruncated
* @throws HiveException
*/
public void initConversion(TypeInfo[] targetTypeInfos,
boolean[] columnsToIncludeTruncated) throws HiveException {
// We assume the caller will default any extra target columns with nulls, etc.
Preconditions.checkState(targetTypeInfos.length >= sourceTypeInfos.length);
// When columnsToIncludeTruncated is supplied, its length must be at least the number of source
// type infos. When longer, we assume the caller will default the extra columns with nulls, etc.
Preconditions.checkState(
columnsToIncludeTruncated == null ||
columnsToIncludeTruncated.length >= sourceTypeInfos.length);
final int columnCount = sourceTypeInfos.length;
allocateArrays(columnCount);
int includedCount = 0;
int[] includedIndices = new int[columnCount];
boolean atLeastOneConvert = false;
for (int i = 0; i < columnCount; i++) {
if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) {
// Field not included in query.
} else {
TypeInfo sourceTypeInfo = sourceTypeInfos[i];
TypeInfo targetTypeInfo = targetTypeInfos[i];
if (!sourceTypeInfo.equals(targetTypeInfo)) {
if (VectorPartitionConversion.isImplicitVectorColumnConversion(sourceTypeInfo, targetTypeInfo)) {
// Do implicit conversion from source type to target type.
initTopLevelField(i, i, sourceTypeInfo);
} else {
// Do formal conversion...
initTopLevelField(i, i, sourceTypeInfo);
// UNDONE: No for List and Map; Yes for Struct and Union when field count different...
addTopLevelConversion(i);
atLeastOneConvert = true;
}
} else {
// No conversion.
initTopLevelField(i, i, sourceTypeInfo);
}
includedIndices[includedCount++] = i;
}
}
// Optimizing for readField?
if (includedCount < columnCount && deserializeRead.isReadFieldSupported()) {
useReadField = true;
readFieldLogicalIndices = Arrays.copyOf(includedIndices, includedCount);
}
if (atLeastOneConvert) {
// Let the VectorAssignRow class do the conversion.
convertVectorAssignRow = new VectorAssignRow();
convertVectorAssignRow.initConversion(sourceTypeInfos, targetTypeInfos,
columnsToIncludeTruncated);
}
}
public void init() throws HiveException {
init(0);
}
private void storePrimitiveRowColumn(ColumnVector colVector, Field field,
int batchIndex, boolean canRetainByteRef) throws IOException {
switch (field.getPrimitiveCategory()) {
case VOID:
VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex);
return;
case BOOLEAN:
((LongColumnVector) colVector).vector[batchIndex] = (deserializeRead.currentBoolean ? 1 : 0);
break;
case BYTE:
((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentByte;
break;
case SHORT:
((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentShort;
break;
case INT:
((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentInt;
break;
case LONG:
((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentLong;
break;
case TIMESTAMP:
((TimestampColumnVector) colVector).set(
batchIndex, deserializeRead.currentTimestampWritable.getTimestamp());
break;
case DATE:
((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentDateWritable.getDays();
break;
case FLOAT:
((DoubleColumnVector) colVector).vector[batchIndex] = deserializeRead.currentFloat;
break;
case DOUBLE:
((DoubleColumnVector) colVector).vector[batchIndex] = deserializeRead.currentDouble;
break;
case BINARY:
case STRING:
{
final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector);
if (deserializeRead.currentExternalBufferNeeded) {
bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen);
deserializeRead.copyToExternalBuffer(
bytesColVec.getValPreallocatedBytes(), bytesColVec.getValPreallocatedStart());
bytesColVec.setValPreallocated(
batchIndex,
deserializeRead.currentExternalBufferNeededLen);
} else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) {
bytesColVec.setRef(
batchIndex,
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength);
} else {
bytesColVec.setVal(
batchIndex,
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength);
}
}
break;
case VARCHAR:
{
// Use the basic STRING bytes read to get access, then use our optimal truncate/trim method
// that does not use Java String objects.
final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector);
if (deserializeRead.currentExternalBufferNeeded) {
// Write directly into our BytesColumnVector value buffer.
bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen);
final byte[] convertBuffer = bytesColVec.getValPreallocatedBytes();
final int convertBufferStart = bytesColVec.getValPreallocatedStart();
deserializeRead.copyToExternalBuffer(
convertBuffer,
convertBufferStart);
bytesColVec.setValPreallocated(
batchIndex,
StringExpr.truncate(
convertBuffer,
convertBufferStart,
deserializeRead.currentExternalBufferNeededLen,
field.getMaxLength()));
} else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) {
bytesColVec.setRef(
batchIndex,
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
StringExpr.truncate(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength,
field.getMaxLength()));
} else {
bytesColVec.setVal(
batchIndex,
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
StringExpr.truncate(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength,
field.getMaxLength()));
}
}
break;
case CHAR:
{
// Use the basic STRING bytes read to get access, then use our optimal truncate/trim method
// that does not use Java String objects.
final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector);
if (deserializeRead.currentExternalBufferNeeded) {
// Write directly into our BytesColumnVector value buffer.
bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen);
final byte[] convertBuffer = bytesColVec.getValPreallocatedBytes();
final int convertBufferStart = bytesColVec.getValPreallocatedStart();
deserializeRead.copyToExternalBuffer(
convertBuffer,
convertBufferStart);
bytesColVec.setValPreallocated(
batchIndex,
StringExpr.rightTrimAndTruncate(
convertBuffer,
convertBufferStart,
deserializeRead.currentExternalBufferNeededLen,
field.getMaxLength()));
} else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) {
bytesColVec.setRef(
batchIndex,
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
StringExpr.rightTrimAndTruncate(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength,
field.getMaxLength()));
} else {
bytesColVec.setVal(
batchIndex,
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
StringExpr.rightTrimAndTruncate(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength,
field.getMaxLength()));
}
}
break;
case DECIMAL:
// The DecimalColumnVector set method will quickly copy the deserialized decimal writable fields.
((DecimalColumnVector) colVector).set(
batchIndex, deserializeRead.currentHiveDecimalWritable);
break;
case INTERVAL_YEAR_MONTH:
((LongColumnVector) colVector).vector[batchIndex] =
deserializeRead.currentHiveIntervalYearMonthWritable.getHiveIntervalYearMonth().getTotalMonths();
break;
case INTERVAL_DAY_TIME:
((IntervalDayTimeColumnVector) colVector).set(
batchIndex, deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime());
break;
default:
throw new RuntimeException("Primitive category " + field.getPrimitiveCategory() +
" not supported");
}
}
private static class ComplexTypeHelper {
}
private static class ListComplexTypeHelper extends ComplexTypeHelper {
private Field elementField;
public ListComplexTypeHelper(Field elementField) {
this.elementField = elementField;
}
public Field getElementField() {
return elementField;
}
}
private static class MapComplexTypeHelper extends ComplexTypeHelper {
private Field keyField;
private Field valueField;
public MapComplexTypeHelper(Field keyField, Field valueField) {
this.keyField = keyField;
this.valueField = valueField;
}
public Field getKeyField() {
return keyField;
}
public Field getValueField() {
return valueField;
}
}
private static class FieldsComplexTypeHelper extends ComplexTypeHelper {
private Field[] fields;
public FieldsComplexTypeHelper(Field[] fields) {
this.fields = fields;
}
public Field[] getFields() {
return fields;
}
}
private static class StructComplexTypeHelper extends FieldsComplexTypeHelper {
public StructComplexTypeHelper(Field[] fields) {
super(fields);
}
}
private static class UnionComplexTypeHelper extends FieldsComplexTypeHelper {
public UnionComplexTypeHelper(Field[] fields) {
super(fields);
}
}
// UNDONE: Presumption of *append*
private void storeComplexFieldRowColumn(ColumnVector fieldColVector,
Field field, int batchIndex, boolean canRetainByteRef) throws IOException {
if (!deserializeRead.readComplexField()) {
fieldColVector.isNull[batchIndex] = true;
fieldColVector.noNulls = false;
return;
}
switch (field.getCategory()) {
case PRIMITIVE:
storePrimitiveRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
break;
case LIST:
storeListRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
break;
case MAP:
storeMapRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
break;
case STRUCT:
storeStructRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
break;
case UNION:
storeUnionRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
break;
default:
throw new RuntimeException("Category " + field.getCategory() + " not supported");
}
}
private void storeListRowColumn(ColumnVector colVector,
Field field, int batchIndex, boolean canRetainByteRef) throws IOException {
final ListColumnVector listColVector = (ListColumnVector) colVector;
final ColumnVector elementColVector = listColVector.child;
int offset = listColVector.childCount;
listColVector.isNull[batchIndex] = false;
listColVector.offsets[batchIndex] = offset;
final ListComplexTypeHelper listHelper = (ListComplexTypeHelper) field.getComplexHelper();
int listLength = 0;
while (deserializeRead.isNextComplexMultiValue()) {
// Ensure the child vector has room at the next write position, doubling it when full.
final int childCapacity = elementColVector.isNull.length;
if (offset >= childCapacity) {
elementColVector.ensureSize(childCapacity * 2, true);
}
storeComplexFieldRowColumn(
elementColVector, listHelper.getElementField(), offset, canRetainByteRef);
offset++;
listLength++;
}
listColVector.childCount += listLength;
listColVector.lengths[batchIndex] = listLength;
}
private void storeMapRowColumn(ColumnVector colVector,
Field field, int batchIndex, boolean canRetainByteRef) throws IOException {
final MapColumnVector mapColVector = (MapColumnVector) colVector;
final MapComplexTypeHelper mapHelper = (MapComplexTypeHelper) field.getComplexHelper();
final ColumnVector keysColVector = mapColVector.keys;
final ColumnVector valuesColVector = mapColVector.values;
int offset = mapColVector.childCount;
mapColVector.offsets[batchIndex] = offset;
mapColVector.isNull[batchIndex] = false;
int keyValueCount = 0;
while (deserializeRead.isNextComplexMultiValue()) {
// Ensure the key and value vectors have room at the next write position, doubling when full.
final int childCapacity = keysColVector.isNull.length;
if (offset >= childCapacity) {
keysColVector.ensureSize(childCapacity * 2, true);
valuesColVector.ensureSize(childCapacity * 2, true);
}
// Key.
storeComplexFieldRowColumn(
keysColVector, mapHelper.getKeyField(), offset, canRetainByteRef);
// Value.
storeComplexFieldRowColumn(
valuesColVector, mapHelper.getValueField(), offset, canRetainByteRef);
offset++;
keyValueCount++;
}
mapColVector.childCount += keyValueCount;
mapColVector.lengths[batchIndex] = keyValueCount;
}
private void storeStructRowColumn(ColumnVector colVector,
Field field, int batchIndex, boolean canRetainByteRef) throws IOException {
final StructColumnVector structColVector = (StructColumnVector) colVector;
final ColumnVector[] colVectorFields = structColVector.fields;
final StructComplexTypeHelper structHelper = (StructComplexTypeHelper) field.getComplexHelper();
final Field[] fields = structHelper.getFields();
structColVector.isNull[batchIndex] = false;
int i = 0;
for (ColumnVector colVectorField : colVectorFields) {
storeComplexFieldRowColumn(
colVectorField,
fields[i],
batchIndex,
canRetainByteRef);
i++;
}
deserializeRead.finishComplexVariableFieldsType();
}
private void storeUnionRowColumn(ColumnVector colVector,
Field field, int batchIndex, boolean canRetainByteRef) throws IOException {
deserializeRead.readComplexField();
// The read field of the Union gives us its tag.
final int tag = deserializeRead.currentInt;
final UnionColumnVector unionColVector = (UnionColumnVector) colVector;
final ColumnVector[] colVectorFields = unionColVector.fields;
final UnionComplexTypeHelper unionHelper = (UnionComplexTypeHelper) field.getComplexHelper();
unionColVector.isNull[batchIndex] = false;
unionColVector.tags[batchIndex] = tag;
storeComplexFieldRowColumn(
colVectorFields[tag],
unionHelper.getFields()[tag],
batchIndex,
canRetainByteRef);
deserializeRead.finishComplexVariableFieldsType();
}
/**
* Store one row column value that is the current value in deserializeRead.
*
* @param batch
* @param batchIndex
* @param field
* @param logicalColumnIndex
* @param canRetainByteRef Specify true when it is safe to retain references to the bytes
* source for DeserializeRead. I.e. the STRING, CHAR/VARCHAR data
* can be set in BytesColumnVector with setRef instead of with setVal
* which copies data. An example of a safe usage is referring to bytes
* in a hash table entry that is immutable.
* @throws IOException
*/
private void storeRowColumn(VectorizedRowBatch batch, int batchIndex,
Field field, int logicalColumnIndex, boolean canRetainByteRef) throws IOException {
final int projectionColumnNum = projectionColumnNums[logicalColumnIndex];
ColumnVector colVector = batch.cols[projectionColumnNum];
switch (field.getCategory()) {
case PRIMITIVE:
storePrimitiveRowColumn(colVector, field, batchIndex, canRetainByteRef);
break;
case LIST:
storeListRowColumn(colVector, field, batchIndex, canRetainByteRef);
break;
case MAP:
storeMapRowColumn(colVector, field, batchIndex, canRetainByteRef);
break;
case STRUCT:
storeStructRowColumn(colVector, field, batchIndex, canRetainByteRef);
break;
case UNION:
storeUnionRowColumn(colVector, field, batchIndex, canRetainByteRef);
break;
default:
throw new RuntimeException("Category " + field.getCategory() + " not supported");
}
// We always set the null flag to false when there is a value.
batch.cols[projectionColumnNum].isNull[batchIndex] = false;
}
/**
* Convert one row column value that is the current value in deserializeRead.
*
* We deserialize into a writable and then pass that writable to an instance of VectorAssignRow
* to convert the writable to the target data type and assign it into the VectorizedRowBatch.
*
* @param batch
* @param batchIndex
* @param field
* @param logicalColumnIndex
* @throws IOException
*/
private void convertRowColumn(VectorizedRowBatch batch, int batchIndex,
Field field, int logicalColumnIndex) throws IOException {
Writable convertSourceWritable = field.getConversionWritable();
switch (field.getCategory()) {
case PRIMITIVE:
{
switch (field.getPrimitiveCategory()) {
case VOID:
convertSourceWritable = null;
break;
case BOOLEAN:
((BooleanWritable) convertSourceWritable).set(deserializeRead.currentBoolean);
break;
case BYTE:
((ByteWritable) convertSourceWritable).set(deserializeRead.currentByte);
break;
case SHORT:
((ShortWritable) convertSourceWritable).set(deserializeRead.currentShort);
break;
case INT:
((IntWritable) convertSourceWritable).set(deserializeRead.currentInt);
break;
case LONG:
((LongWritable) convertSourceWritable).set(deserializeRead.currentLong);
break;
case TIMESTAMP:
((TimestampWritable) convertSourceWritable).set(deserializeRead.currentTimestampWritable);
break;
case DATE:
((DateWritable) convertSourceWritable).set(deserializeRead.currentDateWritable);
break;
case FLOAT:
((FloatWritable) convertSourceWritable).set(deserializeRead.currentFloat);
break;
case DOUBLE:
((DoubleWritable) convertSourceWritable).set(deserializeRead.currentDouble);
break;
case BINARY:
if (deserializeRead.currentBytes == null) {
LOG.info(
"null binary entry: batchIndex " + batchIndex + " projection column num " +
projectionColumnNums[logicalColumnIndex]);
}
((BytesWritable) convertSourceWritable).set(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength);
break;
case STRING:
if (deserializeRead.currentBytes == null) {
throw new RuntimeException(
"null string entry: batchIndex " + batchIndex + " projection column num " +
projectionColumnNums[logicalColumnIndex]);
}
// Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
((Text) convertSourceWritable).set(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength);
break;
case VARCHAR:
{
// Use the basic STRING bytes read to get access, then use our optimal truncate/trim method
// that does not use Java String objects.
if (deserializeRead.currentBytes == null) {
throw new RuntimeException(
"null varchar entry: batchIndex " + batchIndex + " projection column num " +
projectionColumnNums[logicalColumnIndex]);
}
int adjustedLength = StringExpr.truncate(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength,
field.getMaxLength());
((HiveVarcharWritable) convertSourceWritable).set(
new String(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
adjustedLength,
Charsets.UTF_8),
-1);
}
break;
case CHAR:
{
// Use the basic STRING bytes read to get access, then use our optimal truncate/trim method
// that does not use Java String objects.
if (deserializeRead.currentBytes == null) {
throw new RuntimeException(
"null char entry: batchIndex " + batchIndex + " projection column num " +
projectionColumnNums[logicalColumnIndex]);
}
int adjustedLength = StringExpr.rightTrimAndTruncate(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
deserializeRead.currentBytesLength,
field.getMaxLength());
((HiveCharWritable) convertSourceWritable).set(
new String(
deserializeRead.currentBytes,
deserializeRead.currentBytesStart,
adjustedLength, Charsets.UTF_8),
-1);
}
break;
case DECIMAL:
((HiveDecimalWritable) convertSourceWritable).set(
deserializeRead.currentHiveDecimalWritable);
break;
case INTERVAL_YEAR_MONTH:
((HiveIntervalYearMonthWritable) convertSourceWritable).set(
deserializeRead.currentHiveIntervalYearMonthWritable);
break;
case INTERVAL_DAY_TIME:
((HiveIntervalDayTimeWritable) convertSourceWritable).set(
deserializeRead.currentHiveIntervalDayTimeWritable);
break;
default:
throw new RuntimeException("Primitive category " + field.getPrimitiveCategory() +
" not supported");
}
}
break;
case STRUCT:
case UNION:
// The only aspect of conversion for Struct / Union values themselves is adding fields as NULL
// on the end (no removal from the end? which would mean skipping fields...)
// UNDONE
break;
case LIST:
case MAP:
// Conversion only happens below at the level of List elements or Map keys and/or values, and
// not to the List or Map itself.
default:
throw new RuntimeException("Category " + field.getCategory() + " not supported");
}
/*
* Convert our source object we just read into the target object and store that in the
* VectorizedRowBatch.
*/
convertVectorAssignRow.assignConvertRowColumn(batch, batchIndex, logicalColumnIndex,
convertSourceWritable);
}
/**
* Specify the range of bytes to deserialize in the next call to the deserialize method.
*
* @param bytes
* @param offset
* @param length
*/
public void setBytes(byte[] bytes, int offset, int length) {
inputBytes = bytes;
deserializeRead.set(bytes, offset, length);
}
/**
* Deserialize a row from the range of bytes specified by setBytes.
*
* Use getDetailedReadPositionString to get detailed read position information to help
* diagnose exceptions that are thrown...
*
* This version of deserialize does not keep byte references to string/char/varchar/binary data
* type fields. The bytes are copied into the BytesColumnVector buffer with setVal.
* (See deserializeByRef below if keeping references is safe.)
*
* @param batch
* @param batchIndex
* @throws IOException
*/
public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOException {
// Pass false for canRetainByteRef since we will NOT be keeping byte references to the input
// bytes with the BytesColumnVector.setRef method.
final int count = topLevelFields.length;
Field field;
if (!useReadField) {
for (int i = 0; i < count; i++) {
final int projectionColumnNum = projectionColumnNums[i];
if (projectionColumnNum == -1) {
// We must read through fields we do not want.
deserializeRead.skipNextField();
continue;
}
if (!deserializeRead.readNextField()) {
ColumnVector colVector = batch.cols[projectionColumnNum];
colVector.isNull[batchIndex] = true;
colVector.noNulls = false;
continue;
}
// The current* members of deserializeRead have the field value.
field = topLevelFields[i];
if (field.getIsConvert()) {
convertRowColumn(batch, batchIndex, field, i);
} else {
storeRowColumn(batch, batchIndex, field, i, /* canRetainByteRef */ false);
}
}
} else {
final int readFieldCount = readFieldLogicalIndices.length;
for (int i = 0; i < readFieldCount; i++) {
final int logicalIndex = readFieldLogicalIndices[i];
// Jump to the field we want and read it.
if (!deserializeRead.readField(logicalIndex)) {
ColumnVector colVector = batch.cols[projectionColumnNums[logicalIndex]];
colVector.isNull[batchIndex] = true;
colVector.noNulls = false;
continue;
}
// The current* members of deserializeRead have the field value.
field = topLevelFields[logicalIndex];
if (field.getIsConvert()) {
convertRowColumn(batch, batchIndex, field, logicalIndex);
} else {
storeRowColumn(batch, batchIndex, field, logicalIndex, /* canRetainByteRef */ false);
}
}
}
}
/**
* Deserialize a row from the range of bytes specified by setBytes.
*
* Use this method instead of deserialize when it is safe to retain references to the bytes source
* for DeserializeRead. I.e. the STRING, CHAR/VARCHAR data can be set in BytesColumnVector with
* setRef instead of with setVal which copies data.
*
* An example of a safe usage:
* Referring to bytes in a hash table entry that is immutable.
*
* An example of an unsafe usage:
* Referring to bytes in a reduce receive buffer that will be overwritten with new data.
*
* Use getDetailedReadPositionString to get detailed read position information to help
* diagnose exceptions that are thrown...
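*
* An illustrative sketch of a safe use (the entry byte variable names are hypothetical):
* <pre>{@code
* // entryBytes belongs to an immutable hash table entry, so references into it stay valid
* // for the lifetime of the batch.
* rowDeserializer.setBytes(entryBytes, entryOffset, entryLength);
* rowDeserializer.deserializeByRef(batch, batch.size++);
* }</pre>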
*
* @param batch
* @param batchIndex
* @throws IOException
*/
public void deserializeByRef(VectorizedRowBatch batch, int batchIndex) throws IOException {
final int count = topLevelFields.length;
Field field;
if (!useReadField) {
for (int i = 0; i < count; i++) {
final int projectionColumnNum = projectionColumnNums[i];
if (projectionColumnNum == -1) {
// We must read through fields we do not want.
deserializeRead.skipNextField();
continue;
}
if (!deserializeRead.readNextField()) {
ColumnVector colVector = batch.cols[projectionColumnNum];
colVector.isNull[batchIndex] = true;
colVector.noNulls = false;
continue;
}
// The current* members of deserializeRead have the field value.
field = topLevelFields[i];
if (field.getIsConvert()) {
convertRowColumn(batch, batchIndex, field, i);
} else {
storeRowColumn(batch, batchIndex, field, i, /* canRetainByteRef */ true);
}
}
} else {
final int readFieldCount = readFieldLogicalIndices.length;
for (int i = 0; i < readFieldCount; i++) {
final int logicalIndex = readFieldLogicalIndices[i];
// Jump to the field we want and read it.
if (!deserializeRead.readField(logicalIndex)) {
ColumnVector colVector = batch.cols[projectionColumnNums[logicalIndex]];
colVector.isNull[batchIndex] = true;
colVector.noNulls = false;
continue;
}
// The current* members of deserializeRead have the field value.
field = topLevelFields[logicalIndex];
if (field.getIsConvert()) {
convertRowColumn(batch, batchIndex, field, logicalIndex);
} else {
storeRowColumn(batch, batchIndex, field, logicalIndex, /* canRetainByteRef */ true);
}
}
}
}
public String getDetailedReadPositionString() {
return deserializeRead.getDetailedReadPositionString();
}
}