/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.VectorPartitionConversion;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;

/**
 * This class deserializes a serialization format into a row of a VectorizedRowBatch.
 *
 * The caller provides the Hive type names and output column numbers in the order desired to
 * deserialize.
 *
 * This class uses a provided DeserializeRead object to directly deserialize by reading
 * field-by-field from a serialization format into the primitive values of the VectorizedRowBatch.
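 *
 * A minimal usage sketch (LazyBinaryDeserializeRead is one DeserializeRead implementation;
 * the setup shown is illustrative, not the only way to drive this class):
 * <pre>
 *   LazyBinaryDeserializeRead deserializeRead =
 *       new LazyBinaryDeserializeRead(sourceTypeInfos, true);
 *   VectorDeserializeRow&lt;LazyBinaryDeserializeRead&gt; vectorDeserializeRow =
 *       new VectorDeserializeRow&lt;&gt;(deserializeRead);
 *   vectorDeserializeRow.init();
 *
 *   // For each serialized row:
 *   vectorDeserializeRow.setBytes(rowBytes, rowOffset, rowLength);
 *   vectorDeserializeRow.deserialize(batch, batch.size++);
 * </pre>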
 */
public final class VectorDeserializeRow<T extends DeserializeRead> {
  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LoggerFactory.getLogger(VectorDeserializeRow.class);

  private T deserializeRead;

  private TypeInfo[] sourceTypeInfos;

  private byte[] inputBytes;

  /**
   * @param deserializeRead the DeserializeRead object to deserialize with.  Construct it with
   *                        useExternalBuffer set to true to avoid buffer copying and to get
   *                        more efficient reading.
   */
  public VectorDeserializeRow(T deserializeRead) {
    this();
    this.deserializeRead = deserializeRead;
    sourceTypeInfos = deserializeRead.typeInfos();
  }

  // Not public since we must have the deserialize read object.
  private VectorDeserializeRow() {
  }

  private static class Field {

    private Category category;

    private PrimitiveCategory primitiveCategory;
                  // The data type primitive category of the column being deserialized.

    private int maxLength;
                  // For the CHAR and VARCHAR data types, the maximum character length of
                  // the column.  Otherwise, 0.

    private boolean isConvert;

    /*
     * This member has information for data type conversion.
     * Not defined if there is no conversion.
     */
    Writable conversionWritable;
                  // Conversion requires the source to be placed in a writable so we can call upon
                  // VectorAssignRow to convert and assign the row column.

    private ComplexTypeHelper complexTypeHelper;
                  // For a complex type, a helper object that describes elements, key/value pairs,
                  // or fields.

    public Field(PrimitiveCategory primitiveCategory, int maxLength) {
      this.category = Category.PRIMITIVE;
      this.primitiveCategory = primitiveCategory;
      this.maxLength = maxLength;
      this.isConvert = false;
      this.conversionWritable = null;
      this.complexTypeHelper = null;
    }

    public Field(Category category, ComplexTypeHelper complexTypeHelper) {
      this.category = category;
      this.primitiveCategory = null;
      this.maxLength = 0;
      this.isConvert = false;
      this.conversionWritable = null;
      this.complexTypeHelper = complexTypeHelper;
    }

    public Category getCategory() {
      return category;
    }

    public PrimitiveCategory getPrimitiveCategory() {
      return primitiveCategory;
    }

    public int getMaxLength() {
      return maxLength;
    }

    public void setIsConvert(boolean isConvert) {
      this.isConvert = isConvert;
    }

    public boolean getIsConvert() {
      return isConvert;
    }

    public void setConversionWritable(Writable conversionWritable) {
      this.conversionWritable = conversionWritable;
    }

    public Writable getConversionWritable() {
      return conversionWritable;
    }

    public ComplexTypeHelper getComplexHelper() {
      return complexTypeHelper;
    }
  }

  /*
   * These members have information for deserializing a row into the VectorizedRowBatch
   * columns.
   *
   * We say "source" because when there is conversion we are converting the deserialized source
   * into a target data type.
   */

  private boolean useReadField;
                // True when the (random access) readField method of DeserializeRead is being used.

  private int[] readFieldLogicalIndices;
                // The logical indices for reading with readField.

  private int[] projectionColumnNums;
                // Assigning can be a subset of columns, so this is the projection --
                // the batch column numbers.

  private Field[] topLevelFields;

  VectorAssignRow convertVectorAssignRow;
                // Use its conversion ability.
  /*
   * Allocate the source deserialization related arrays.
   */
  private void allocateArrays(int count) {
    projectionColumnNums = new int[count];
    Arrays.fill(projectionColumnNums, -1);
    topLevelFields = new Field[count];
  }

  private Field allocatePrimitiveField(TypeInfo sourceTypeInfo) {
    final PrimitiveTypeInfo sourcePrimitiveTypeInfo = (PrimitiveTypeInfo) sourceTypeInfo;
    final PrimitiveCategory sourcePrimitiveCategory =
        sourcePrimitiveTypeInfo.getPrimitiveCategory();
    final int maxLength;
    switch (sourcePrimitiveCategory) {
    case CHAR:
      maxLength = ((CharTypeInfo) sourcePrimitiveTypeInfo).getLength();
      break;
    case VARCHAR:
      maxLength = ((VarcharTypeInfo) sourcePrimitiveTypeInfo).getLength();
      break;
    default:
      // No additional data type specific setting.
      maxLength = 0;
      break;
    }
    return new Field(sourcePrimitiveCategory, maxLength);
  }

  private Field allocateComplexField(TypeInfo sourceTypeInfo) {
    final Category category = sourceTypeInfo.getCategory();
    switch (category) {
    case LIST:
      {
        final ListTypeInfo listTypeInfo = (ListTypeInfo) sourceTypeInfo;
        final ListComplexTypeHelper listHelper =
            new ListComplexTypeHelper(
                allocateField(listTypeInfo.getListElementTypeInfo()));
        return new Field(category, listHelper);
      }
    case MAP:
      {
        final MapTypeInfo mapTypeInfo = (MapTypeInfo) sourceTypeInfo;
        final MapComplexTypeHelper mapHelper =
            new MapComplexTypeHelper(
                allocateField(mapTypeInfo.getMapKeyTypeInfo()),
                allocateField(mapTypeInfo.getMapValueTypeInfo()));
        return new Field(category, mapHelper);
      }
    case STRUCT:
      {
        final StructTypeInfo structTypeInfo = (StructTypeInfo) sourceTypeInfo;
        final ArrayList<TypeInfo> fieldTypeInfoList =
            structTypeInfo.getAllStructFieldTypeInfos();
        final int count = fieldTypeInfoList.size();
        final Field[] fields = new Field[count];
        for (int i = 0; i < count; i++) {
          fields[i] = allocateField(fieldTypeInfoList.get(i));
        }
        final StructComplexTypeHelper structHelper =
            new StructComplexTypeHelper(fields);
        return new Field(category, structHelper);
      }
    case UNION:
      {
        final UnionTypeInfo unionTypeInfo = (UnionTypeInfo) sourceTypeInfo;
        final List<TypeInfo> fieldTypeInfoList = unionTypeInfo.getAllUnionObjectTypeInfos();
        final int count = fieldTypeInfoList.size();
        final Field[] fields = new Field[count];
        for (int i = 0; i < count; i++) {
          fields[i] = allocateField(fieldTypeInfoList.get(i));
        }
        final UnionComplexTypeHelper unionHelper =
            new UnionComplexTypeHelper(fields);
        return new Field(category, unionHelper);
      }
    default:
      throw new RuntimeException("Category " + category + " not supported");
    }
  }

  private Field allocateField(TypeInfo sourceTypeInfo) {
    switch (sourceTypeInfo.getCategory()) {
    case PRIMITIVE:
      return allocatePrimitiveField(sourceTypeInfo);
    case LIST:
    case MAP:
    case STRUCT:
    case UNION:
      return allocateComplexField(sourceTypeInfo);
    default:
      throw new RuntimeException(
          "Category " + sourceTypeInfo.getCategory() + " not supported");
    }
  }

  /*
   * Initialize one column's source deserialization information.
   */
  private void initTopLevelField(int logicalColumnIndex, int projectionColumnNum,
      TypeInfo sourceTypeInfo) {

    projectionColumnNums[logicalColumnIndex] = projectionColumnNum;
    topLevelFields[logicalColumnIndex] = allocateField(sourceTypeInfo);
  }
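  /*
   * For example (illustrative only), allocateField on the source type map<string,array<int>>
   * builds this Field tree:
   *
   *   Field(MAP, MapComplexTypeHelper(
   *       keyField   = Field(PRIMITIVE STRING, maxLength 0),
   *       valueField = Field(LIST, ListComplexTypeHelper(
   *           elementField = Field(PRIMITIVE INT, maxLength 0)))))
   */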
  /*
   * Initialize the conversion related arrays.  Assumes initTopLevelField has already been called.
   */
  private void addTopLevelConversion(int logicalColumnIndex) {

    final Field field = topLevelFields[logicalColumnIndex];
    field.setIsConvert(true);

    if (field.getCategory() == Category.PRIMITIVE) {
      field.setConversionWritable(
          VectorizedBatchUtil.getPrimitiveWritable(field.getPrimitiveCategory()));
    }
  }

  /*
   * Specify the columns to deserialize into as an array.
   */
  public void init(int[] outputColumns) throws HiveException {

    final int count = sourceTypeInfos.length;
    allocateArrays(count);

    for (int i = 0; i < count; i++) {
      int outputColumn = outputColumns[i];
      initTopLevelField(i, outputColumn, sourceTypeInfos[i]);
    }
  }

  /*
   * Specify the columns to deserialize into as a list.
   */
  public void init(List<Integer> outputColumns) throws HiveException {

    final int count = sourceTypeInfos.length;
    allocateArrays(count);

    for (int i = 0; i < count; i++) {
      int outputColumn = outputColumns.get(i);
      initTopLevelField(i, outputColumn, sourceTypeInfos[i]);
    }
  }

  /*
   * Specify the columns to deserialize into as a range starting at a column number.
   */
  public void init(int startColumn) throws HiveException {

    final int count = sourceTypeInfos.length;
    allocateArrays(count);

    for (int i = 0; i < count; i++) {
      int outputColumn = startColumn + i;
      initTopLevelField(i, outputColumn, sourceTypeInfos[i]);
    }
  }

  public void init(boolean[] columnsToIncludeTruncated) throws HiveException {

    // When columnsToIncludeTruncated is supplied, its length must equal the number of source
    // type infos.
    Preconditions.checkState(
        columnsToIncludeTruncated == null ||
        columnsToIncludeTruncated.length == sourceTypeInfos.length);

    final int columnCount = sourceTypeInfos.length;
    allocateArrays(columnCount);

    int includedCount = 0;
    final int[] includedIndices = new int[columnCount];

    for (int i = 0; i < columnCount; i++) {

      if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) {

        // Field not included in query.

      } else {

        initTopLevelField(i, i, sourceTypeInfos[i]);
        includedIndices[includedCount++] = i;
      }
    }

    // Optimizing for readField?
    if (includedCount < columnCount && deserializeRead.isReadFieldSupported()) {
      useReadField = true;
      readFieldLogicalIndices = Arrays.copyOf(includedIndices, includedCount);
    }
  }
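  /*
   * For example (illustrative only): with four source columns and
   * columnsToIncludeTruncated = { true, false, true, true }, top level fields 0, 2, and 3 are
   * set up to deserialize into batch columns 0, 2, and 3; field 1 is skipped during
   * deserialization.  And since only a subset of the fields is read, the readField
   * optimization is used when the DeserializeRead implementation supports it.
   */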
  /**
   * Initialize for converting the source data types that are going to be read with the
   * DeserializeRead interface passed to the constructor to the target data types desired in
   * the VectorizedRowBatch.
   *
   * No projection -- using the column range 0 .. columnCount-1
   *
   * @param targetTypeInfos
   * @param columnsToIncludeTruncated
   * @throws HiveException
   */
  public void initConversion(TypeInfo[] targetTypeInfos,
      boolean[] columnsToIncludeTruncated) throws HiveException {

    // We assume the caller will handle extra columns default with nulls, etc.
    Preconditions.checkState(targetTypeInfos.length >= sourceTypeInfos.length);

    // When columnsToIncludeTruncated is supplied, its length must be at least the number of
    // source type infos.  When longer, we assume the caller will default with nulls, etc.
    Preconditions.checkState(
        columnsToIncludeTruncated == null ||
        columnsToIncludeTruncated.length >= sourceTypeInfos.length);

    final int columnCount = sourceTypeInfos.length;
    allocateArrays(columnCount);

    int includedCount = 0;
    int[] includedIndices = new int[columnCount];

    boolean atLeastOneConvert = false;
    for (int i = 0; i < columnCount; i++) {

      if (columnsToIncludeTruncated != null && !columnsToIncludeTruncated[i]) {

        // Field not included in query.

      } else {

        TypeInfo sourceTypeInfo = sourceTypeInfos[i];
        TypeInfo targetTypeInfo = targetTypeInfos[i];

        if (!sourceTypeInfo.equals(targetTypeInfo)) {

          if (VectorPartitionConversion.isImplicitVectorColumnConversion(
              sourceTypeInfo, targetTypeInfo)) {

            // Do implicit conversion from source type to target type.
            initTopLevelField(i, i, sourceTypeInfo);

          } else {

            // Do formal conversion...
            initTopLevelField(i, i, sourceTypeInfo);

            // UNDONE: No for List and Map; Yes for Struct and Union when field count different...
            addTopLevelConversion(i);
            atLeastOneConvert = true;
          }
        } else {

          // No conversion.
          initTopLevelField(i, i, sourceTypeInfo);
        }

        includedIndices[includedCount++] = i;
      }
    }

    // Optimizing for readField?
    if (includedCount < columnCount && deserializeRead.isReadFieldSupported()) {
      useReadField = true;
      readFieldLogicalIndices = Arrays.copyOf(includedIndices, includedCount);
    }

    if (atLeastOneConvert) {

      // Let the VectorAssignRow class do the conversion.
      convertVectorAssignRow = new VectorAssignRow();
      convertVectorAssignRow.initConversion(
          sourceTypeInfos, targetTypeInfos, columnsToIncludeTruncated);
    }
  }

  public void init() throws HiveException {
    init(0);
  }
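  /*
   * For example (illustrative only): if a source column is INT and the target column is BIGINT,
   * and VectorPartitionConversion reports that pairing as an implicit conversion, the value is
   * deserialized directly into the target LongColumnVector.  If instead the target were, say,
   * DECIMAL, a formal conversion is set up: the value is read into a writable and
   * VectorAssignRow converts and assigns it.
   */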
  private void storePrimitiveRowColumn(ColumnVector colVector, Field field,
      int batchIndex, boolean canRetainByteRef) throws IOException {

    switch (field.getPrimitiveCategory()) {
    case VOID:
      VectorizedBatchUtil.setNullColIsNullValue(colVector, batchIndex);
      return;
    case BOOLEAN:
      ((LongColumnVector) colVector).vector[batchIndex] =
          (deserializeRead.currentBoolean ? 1 : 0);
      break;
    case BYTE:
      ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentByte;
      break;
    case SHORT:
      ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentShort;
      break;
    case INT:
      ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentInt;
      break;
    case LONG:
      ((LongColumnVector) colVector).vector[batchIndex] = deserializeRead.currentLong;
      break;
    case TIMESTAMP:
      ((TimestampColumnVector) colVector).set(
          batchIndex, deserializeRead.currentTimestampWritable.getTimestamp());
      break;
    case DATE:
      ((LongColumnVector) colVector).vector[batchIndex] =
          deserializeRead.currentDateWritable.getDays();
      break;
    case FLOAT:
      ((DoubleColumnVector) colVector).vector[batchIndex] = deserializeRead.currentFloat;
      break;
    case DOUBLE:
      ((DoubleColumnVector) colVector).vector[batchIndex] = deserializeRead.currentDouble;
      break;
    case BINARY:
    case STRING:
      {
        final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector);
        if (deserializeRead.currentExternalBufferNeeded) {
          bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen);
          deserializeRead.copyToExternalBuffer(
              bytesColVec.getValPreallocatedBytes(), bytesColVec.getValPreallocatedStart());
          bytesColVec.setValPreallocated(
              batchIndex, deserializeRead.currentExternalBufferNeededLen);
        } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) {
          bytesColVec.setRef(
              batchIndex,
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              deserializeRead.currentBytesLength);
        } else {
          bytesColVec.setVal(
              batchIndex,
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              deserializeRead.currentBytesLength);
        }
      }
      break;
    case VARCHAR:
      {
        // Use the basic STRING bytes read to get access, then use our optimal truncate/trim
        // method that does not use Java String objects.
        final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector);
        if (deserializeRead.currentExternalBufferNeeded) {

          // Write directly into our BytesColumnVector value buffer.
          bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen);
          final byte[] convertBuffer = bytesColVec.getValPreallocatedBytes();
          final int convertBufferStart = bytesColVec.getValPreallocatedStart();
          deserializeRead.copyToExternalBuffer(
              convertBuffer, convertBufferStart);
          bytesColVec.setValPreallocated(
              batchIndex,
              StringExpr.truncate(
                  convertBuffer,
                  convertBufferStart,
                  deserializeRead.currentExternalBufferNeededLen,
                  field.getMaxLength()));
        } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) {
          bytesColVec.setRef(
              batchIndex,
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              StringExpr.truncate(
                  deserializeRead.currentBytes,
                  deserializeRead.currentBytesStart,
                  deserializeRead.currentBytesLength,
                  field.getMaxLength()));
        } else {
          bytesColVec.setVal(
              batchIndex,
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              StringExpr.truncate(
                  deserializeRead.currentBytes,
                  deserializeRead.currentBytesStart,
                  deserializeRead.currentBytesLength,
                  field.getMaxLength()));
        }
      }
      break;
    case CHAR:
      {
        // Use the basic STRING bytes read to get access, then use our optimal truncate/trim
        // method that does not use Java String objects.
        final BytesColumnVector bytesColVec = ((BytesColumnVector) colVector);
        if (deserializeRead.currentExternalBufferNeeded) {

          // Write directly into our BytesColumnVector value buffer.
          bytesColVec.ensureValPreallocated(deserializeRead.currentExternalBufferNeededLen);
          final byte[] convertBuffer = bytesColVec.getValPreallocatedBytes();
          final int convertBufferStart = bytesColVec.getValPreallocatedStart();
          deserializeRead.copyToExternalBuffer(
              convertBuffer, convertBufferStart);
          bytesColVec.setValPreallocated(
              batchIndex,
              StringExpr.rightTrimAndTruncate(
                  convertBuffer,
                  convertBufferStart,
                  deserializeRead.currentExternalBufferNeededLen,
                  field.getMaxLength()));
        } else if (canRetainByteRef && inputBytes == deserializeRead.currentBytes) {
          bytesColVec.setRef(
              batchIndex,
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              StringExpr.rightTrimAndTruncate(
                  deserializeRead.currentBytes,
                  deserializeRead.currentBytesStart,
                  deserializeRead.currentBytesLength,
                  field.getMaxLength()));
        } else {
          bytesColVec.setVal(
              batchIndex,
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              StringExpr.rightTrimAndTruncate(
                  deserializeRead.currentBytes,
                  deserializeRead.currentBytesStart,
                  deserializeRead.currentBytesLength,
                  field.getMaxLength()));
        }
      }
      break;
    case DECIMAL:
      // The DecimalColumnVector set method will quickly copy the deserialized decimal writable
      // fields.
      ((DecimalColumnVector) colVector).set(
          batchIndex, deserializeRead.currentHiveDecimalWritable);
      break;
    case INTERVAL_YEAR_MONTH:
      ((LongColumnVector) colVector).vector[batchIndex] =
          deserializeRead.currentHiveIntervalYearMonthWritable
              .getHiveIntervalYearMonth().getTotalMonths();
      break;
    case INTERVAL_DAY_TIME:
      ((IntervalDayTimeColumnVector) colVector).set(
          batchIndex,
          deserializeRead.currentHiveIntervalDayTimeWritable.getHiveIntervalDayTime());
      break;
    default:
      throw new RuntimeException(
          "Primitive category " + field.getPrimitiveCategory() + " not supported");
    }
  }
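  /*
   * For example (illustrative only): storing the five bytes "ab   " gives different results
   * by type.  A VARCHAR(4) column uses StringExpr.truncate and stores "ab  " (limited to 4
   * characters), while a CHAR(4) column uses StringExpr.rightTrimAndTruncate and stores "ab"
   * (trailing blanks removed).
   */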
  private static class ComplexTypeHelper {
  }

  private static class ListComplexTypeHelper extends ComplexTypeHelper {

    private Field elementField;

    public ListComplexTypeHelper(Field elementField) {
      this.elementField = elementField;
    }

    public Field getElementField() {
      return elementField;
    }
  }

  private static class MapComplexTypeHelper extends ComplexTypeHelper {

    private Field keyField;
    private Field valueField;

    public MapComplexTypeHelper(Field keyField, Field valueField) {
      this.keyField = keyField;
      this.valueField = valueField;
    }

    public Field getKeyField() {
      return keyField;
    }

    public Field getValueField() {
      return valueField;
    }
  }

  private static class FieldsComplexTypeHelper extends ComplexTypeHelper {

    private Field[] fields;

    public FieldsComplexTypeHelper(Field[] fields) {
      this.fields = fields;
    }

    public Field[] getFields() {
      return fields;
    }
  }

  private static class StructComplexTypeHelper extends FieldsComplexTypeHelper {
    public StructComplexTypeHelper(Field[] fields) {
      super(fields);
    }
  }

  private static class UnionComplexTypeHelper extends FieldsComplexTypeHelper {
    public UnionComplexTypeHelper(Field[] fields) {
      super(fields);
    }
  }

  // UNDONE: Presumption of *append*

  private void storeComplexFieldRowColumn(ColumnVector fieldColVector,
      Field field, int batchIndex, boolean canRetainByteRef) throws IOException {

    if (!deserializeRead.readComplexField()) {
      fieldColVector.isNull[batchIndex] = true;
      fieldColVector.noNulls = false;
      return;
    }

    switch (field.getCategory()) {
    case PRIMITIVE:
      storePrimitiveRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
      break;
    case LIST:
      storeListRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
      break;
    case MAP:
      storeMapRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
      break;
    case STRUCT:
      storeStructRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
      break;
    case UNION:
      storeUnionRowColumn(fieldColVector, field, batchIndex, canRetainByteRef);
      break;
    default:
      throw new RuntimeException("Category " + field.getCategory() + " not supported");
    }
  }
  private void storeListRowColumn(ColumnVector colVector,
      Field field, int batchIndex, boolean canRetainByteRef) throws IOException {

    final ListColumnVector listColVector = (ListColumnVector) colVector;
    final ColumnVector elementColVector = listColVector.child;
    int offset = listColVector.childCount;
    listColVector.isNull[batchIndex] = false;
    listColVector.offsets[batchIndex] = offset;

    final ListComplexTypeHelper listHelper = (ListComplexTypeHelper) field.getComplexHelper();
    int listLength = 0;
    while (deserializeRead.isNextComplexMultiValue()) {

      // Ensure the child vector has room at the next write position.  We check against offset,
      // the position about to be written, since listColVector.childCount is not updated until
      // after this loop.
      final int childCapacity = listColVector.child.isNull.length;
      if (childCapacity < offset + 1) {
        listColVector.child.ensureSize(childCapacity * 2, true);
      }

      storeComplexFieldRowColumn(
          elementColVector, listHelper.getElementField(), offset, canRetainByteRef);
      offset++;
      listLength++;
    }

    listColVector.childCount += listLength;
    listColVector.lengths[batchIndex] = listLength;
  }

  private void storeMapRowColumn(ColumnVector colVector,
      Field field, int batchIndex, boolean canRetainByteRef) throws IOException {

    final MapColumnVector mapColVector = (MapColumnVector) colVector;
    final MapComplexTypeHelper mapHelper = (MapComplexTypeHelper) field.getComplexHelper();
    final ColumnVector keysColVector = mapColVector.keys;
    final ColumnVector valuesColVector = mapColVector.values;
    int offset = mapColVector.childCount;
    mapColVector.offsets[batchIndex] = offset;
    mapColVector.isNull[batchIndex] = false;

    int keyValueCount = 0;
    while (deserializeRead.isNextComplexMultiValue()) {

      // Ensure the key and value vectors have room at the next write position.
      final int childCapacity = mapColVector.keys.isNull.length;
      if (childCapacity < offset + 1) {
        mapColVector.keys.ensureSize(childCapacity * 2, true);
        mapColVector.values.ensureSize(childCapacity * 2, true);
      }

      // Key.
      storeComplexFieldRowColumn(
          keysColVector, mapHelper.getKeyField(), offset, canRetainByteRef);

      // Value.
      storeComplexFieldRowColumn(
          valuesColVector, mapHelper.getValueField(), offset, canRetainByteRef);

      offset++;
      keyValueCount++;
    }

    mapColVector.childCount += keyValueCount;
    mapColVector.lengths[batchIndex] = keyValueCount;
  }

  private void storeStructRowColumn(ColumnVector colVector,
      Field field, int batchIndex, boolean canRetainByteRef) throws IOException {

    final StructColumnVector structColVector = (StructColumnVector) colVector;
    final ColumnVector[] colVectorFields = structColVector.fields;
    final StructComplexTypeHelper structHelper =
        (StructComplexTypeHelper) field.getComplexHelper();
    final Field[] fields = structHelper.getFields();
    structColVector.isNull[batchIndex] = false;

    int i = 0;
    for (ColumnVector colVectorField : colVectorFields) {
      storeComplexFieldRowColumn(
          colVectorField, fields[i], batchIndex, canRetainByteRef);
      i++;
    }

    deserializeRead.finishComplexVariableFieldsType();
  }

  private void storeUnionRowColumn(ColumnVector colVector,
      Field field, int batchIndex, boolean canRetainByteRef) throws IOException {

    deserializeRead.readComplexField();

    // The read field of the Union gives us its tag.
    final int tag = deserializeRead.currentInt;

    final UnionColumnVector unionColVector = (UnionColumnVector) colVector;
    final ColumnVector[] colVectorFields = unionColVector.fields;
    final UnionComplexTypeHelper unionHelper = (UnionComplexTypeHelper) field.getComplexHelper();

    unionColVector.isNull[batchIndex] = false;
    unionColVector.tags[batchIndex] = tag;

    storeComplexFieldRowColumn(
        colVectorFields[tag], unionHelper.getFields()[tag], batchIndex, canRetainByteRef);

    deserializeRead.finishComplexVariableFieldsType();
  }
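  /*
   * For example (illustrative only): deserializing the list (10, 20, 30) into batch row 5 when
   * listColVector.childCount is 17 appends the three elements at child positions 17, 18, and 19
   * and sets offsets[5] = 17, lengths[5] = 3, and childCount = 20.  Map rows are stored the
   * same way, with one key and one value per child position.
   */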
  /**
   * Store one row column value that is the current value in deserializeRead.
   *
   * @param batch
   * @param batchIndex
   * @param logicalColumnIndex
   * @param canRetainByteRef Specify true when it is safe to retain references to the bytes
   *                         source for DeserializeRead.  I.e. the STRING, CHAR/VARCHAR data
   *                         can be set in BytesColumnVector with setRef instead of with setVal
   *                         which copies data.  An example of a safe usage is referring to bytes
   *                         in a hash table entry that is immutable.
   * @throws IOException
   */
  private void storeRowColumn(VectorizedRowBatch batch, int batchIndex,
      Field field, int logicalColumnIndex, boolean canRetainByteRef) throws IOException {

    final int projectionColumnNum = projectionColumnNums[logicalColumnIndex];
    ColumnVector colVector = batch.cols[projectionColumnNum];

    switch (field.getCategory()) {
    case PRIMITIVE:
      storePrimitiveRowColumn(colVector, field, batchIndex, canRetainByteRef);
      break;
    case LIST:
      storeListRowColumn(colVector, field, batchIndex, canRetainByteRef);
      break;
    case MAP:
      storeMapRowColumn(colVector, field, batchIndex, canRetainByteRef);
      break;
    case STRUCT:
      storeStructRowColumn(colVector, field, batchIndex, canRetainByteRef);
      break;
    case UNION:
      storeUnionRowColumn(colVector, field, batchIndex, canRetainByteRef);
      break;
    default:
      throw new RuntimeException("Category " + field.getCategory() + " not supported");
    }

    // We always set the null flag to false when there is a value.
    batch.cols[projectionColumnNum].isNull[batchIndex] = false;
  }

  /**
   * Convert one row column value that is the current value in deserializeRead.
   *
   * We deserialize into a writable and then pass that writable to an instance of VectorAssignRow
   * to convert the writable to the target data type and assign it into the VectorizedRowBatch.
   *
   * @param batch
   * @param batchIndex
   * @param logicalColumnIndex
   * @throws IOException
   */
  private void convertRowColumn(VectorizedRowBatch batch, int batchIndex,
      Field field, int logicalColumnIndex) throws IOException {

    Writable convertSourceWritable = field.getConversionWritable();
    switch (field.getCategory()) {
    case PRIMITIVE:
      {
        switch (field.getPrimitiveCategory()) {
        case VOID:
          convertSourceWritable = null;
          break;
        case BOOLEAN:
          ((BooleanWritable) convertSourceWritable).set(deserializeRead.currentBoolean);
          break;
        case BYTE:
          ((ByteWritable) convertSourceWritable).set(deserializeRead.currentByte);
          break;
        case SHORT:
          ((ShortWritable) convertSourceWritable).set(deserializeRead.currentShort);
          break;
        case INT:
          ((IntWritable) convertSourceWritable).set(deserializeRead.currentInt);
          break;
        case LONG:
          ((LongWritable) convertSourceWritable).set(deserializeRead.currentLong);
          break;
        case TIMESTAMP:
          ((TimestampWritable) convertSourceWritable).set(
              deserializeRead.currentTimestampWritable);
          break;
        case DATE:
          ((DateWritable) convertSourceWritable).set(deserializeRead.currentDateWritable);
          break;
        case FLOAT:
          ((FloatWritable) convertSourceWritable).set(deserializeRead.currentFloat);
          break;
        case DOUBLE:
          ((DoubleWritable) convertSourceWritable).set(deserializeRead.currentDouble);
          break;
        case BINARY:
          if (deserializeRead.currentBytes == null) {
            LOG.info(
                "null binary entry: batchIndex " + batchIndex + " projection column num " +
                projectionColumnNums[logicalColumnIndex]);
          }
          ((BytesWritable) convertSourceWritable).set(
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              deserializeRead.currentBytesLength);
          break;
        case STRING:
          if (deserializeRead.currentBytes == null) {
            throw new RuntimeException(
                "null string entry: batchIndex " + batchIndex + " projection column num " +
                projectionColumnNums[logicalColumnIndex]);
          }
          // Use org.apache.hadoop.io.Text as our helper to go from byte[] to String.
          ((Text) convertSourceWritable).set(
              deserializeRead.currentBytes,
              deserializeRead.currentBytesStart,
              deserializeRead.currentBytesLength);
          break;
        case VARCHAR:
          {
            // Use the basic STRING bytes read to get access, then use our optimal truncate/trim
            // method that does not use Java String objects.
            if (deserializeRead.currentBytes == null) {
              throw new RuntimeException(
                  "null varchar entry: batchIndex " + batchIndex + " projection column num " +
                  projectionColumnNums[logicalColumnIndex]);
            }
            int adjustedLength = StringExpr.truncate(
                deserializeRead.currentBytes,
                deserializeRead.currentBytesStart,
                deserializeRead.currentBytesLength,
                field.getMaxLength());
            ((HiveVarcharWritable) convertSourceWritable).set(
                new String(
                    deserializeRead.currentBytes,
                    deserializeRead.currentBytesStart,
                    adjustedLength,
                    Charsets.UTF_8),
                -1);
          }
          break;
        case CHAR:
          {
            // Use the basic STRING bytes read to get access, then use our optimal truncate/trim
            // method that does not use Java String objects.
            if (deserializeRead.currentBytes == null) {
              throw new RuntimeException(
                  "null char entry: batchIndex " + batchIndex + " projection column num " +
                  projectionColumnNums[logicalColumnIndex]);
            }
            int adjustedLength = StringExpr.rightTrimAndTruncate(
                deserializeRead.currentBytes,
                deserializeRead.currentBytesStart,
                deserializeRead.currentBytesLength,
                field.getMaxLength());
            ((HiveCharWritable) convertSourceWritable).set(
                new String(
                    deserializeRead.currentBytes,
                    deserializeRead.currentBytesStart,
                    adjustedLength,
                    Charsets.UTF_8),
                -1);
          }
          break;
        case DECIMAL:
          ((HiveDecimalWritable) convertSourceWritable).set(
              deserializeRead.currentHiveDecimalWritable);
          break;
        case INTERVAL_YEAR_MONTH:
          ((HiveIntervalYearMonthWritable) convertSourceWritable).set(
              deserializeRead.currentHiveIntervalYearMonthWritable);
          break;
        case INTERVAL_DAY_TIME:
          ((HiveIntervalDayTimeWritable) convertSourceWritable).set(
              deserializeRead.currentHiveIntervalDayTimeWritable);
          break;
        default:
          throw new RuntimeException(
              "Primitive category " + field.getPrimitiveCategory() + " not supported");
        }
      }
      break;
    case STRUCT:
    case UNION:
      // The only aspect of conversion to Struct / Union themselves is adding fields as NULL
      // at the end (no removal from the end? which would mean skipping fields...)
      // UNDONE
      break;
    case LIST:
    case MAP:
      // Conversion only happens below to List elements or Map keys and/or values and not to the
      // List or Map itself.
    default:
      throw new RuntimeException("Category " + field.getCategory() + " not supported");
    }

    /*
     * Convert our source object we just read into the target object and store that in the
     * VectorizedRowBatch.
     */
    convertVectorAssignRow.assignConvertRowColumn(
        batch, batchIndex, logicalColumnIndex, convertSourceWritable);
  }
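  /*
   * For example (illustrative only): converting a source DATE column to a target STRING column
   * reads the value into the field's DateWritable above, and then
   * convertVectorAssignRow.assignConvertRowColumn converts that writable and assigns the result
   * into the target column vector.
   */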
  /**
   * Specify the range of bytes to deserialize in the next call to the deserialize method.
   *
   * @param bytes
   * @param offset
   * @param length
   */
  public void setBytes(byte[] bytes, int offset, int length) {
    inputBytes = bytes;
    deserializeRead.set(bytes, offset, length);
  }
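  /*
   * A per-batch driving loop, as a sketch only (the "reader" supplying serialized rows is
   * hypothetical):
   *
   *   while (reader.next()) {
   *     vectorDeserializeRow.setBytes(reader.bytes(), reader.offset(), reader.length());
   *     vectorDeserializeRow.deserialize(batch, batch.size++);
   *     if (batch.size == batch.getMaxSize()) {
   *       process(batch);
   *       batch.reset();
   *     }
   *   }
   */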
  /**
   * Deserialize a row from the range of bytes specified by setBytes.
   *
   * Use getDetailedReadPositionString to get detailed read position information to help
   * diagnose exceptions that are thrown...
   *
   * This version of deserialize does not keep byte references to string/char/varchar/binary
   * data type fields.  The bytes are copied into the BytesColumnVector buffer with setVal.
   * (See deserializeByRef below for when keeping references is safe.)
   *
   * @param batch
   * @param batchIndex
   * @throws IOException
   */
  public void deserialize(VectorizedRowBatch batch, int batchIndex) throws IOException {

    // Pass false for canRetainByteRef since we will NOT be keeping byte references to the input
    // bytes with the BytesColumnVector.setRef method.

    final int count = topLevelFields.length;
    Field field;
    if (!useReadField) {
      for (int i = 0; i < count; i++) {
        final int projectionColumnNum = projectionColumnNums[i];
        if (projectionColumnNum == -1) {
          // We must read through fields we do not want.
          deserializeRead.skipNextField();
          continue;
        }
        if (!deserializeRead.readNextField()) {
          ColumnVector colVector = batch.cols[projectionColumnNum];
          colVector.isNull[batchIndex] = true;
          colVector.noNulls = false;
          continue;
        }

        // The current* members of deserializeRead have the field value.
        field = topLevelFields[i];
        if (field.getIsConvert()) {
          convertRowColumn(batch, batchIndex, field, i);
        } else {
          storeRowColumn(batch, batchIndex, field, i, /* canRetainByteRef */ false);
        }
      }
    } else {
      final int readFieldCount = readFieldLogicalIndices.length;
      for (int i = 0; i < readFieldCount; i++) {
        final int logicalIndex = readFieldLogicalIndices[i];

        // Jump to the field we want and read it.
        if (!deserializeRead.readField(logicalIndex)) {
          ColumnVector colVector = batch.cols[projectionColumnNums[logicalIndex]];
          colVector.isNull[batchIndex] = true;
          colVector.noNulls = false;
          continue;
        }

        // The current* members of deserializeRead have the field value.
        field = topLevelFields[logicalIndex];
        if (field.getIsConvert()) {
          convertRowColumn(batch, batchIndex, field, logicalIndex);
        } else {
          storeRowColumn(batch, batchIndex, field, logicalIndex, /* canRetainByteRef */ false);
        }
      }
    }
  }

  /**
   * Deserialize a row from the range of bytes specified by setBytes.
   *
   * Use this method instead of deserialize when it is safe to retain references to the bytes
   * source for DeserializeRead.  I.e. the STRING, CHAR/VARCHAR data can be set in
   * BytesColumnVector with setRef instead of with setVal which copies data.
   *
   * An example of a safe usage:
   *   Referring to bytes in a hash table entry that is immutable.
   *
   * An example of an unsafe usage:
   *   Referring to bytes in a reduce receive buffer that will be overwritten with new data.
   *
   * Use getDetailedReadPositionString to get detailed read position information to help
   * diagnose exceptions that are thrown...
   *
   * @param batch
   * @param batchIndex
   * @throws IOException
   */
  public void deserializeByRef(VectorizedRowBatch batch, int batchIndex) throws IOException {

    final int count = topLevelFields.length;
    Field field;
    if (!useReadField) {
      for (int i = 0; i < count; i++) {
        final int projectionColumnNum = projectionColumnNums[i];
        if (projectionColumnNum == -1) {
          // We must read through fields we do not want.
          deserializeRead.skipNextField();
          continue;
        }
        if (!deserializeRead.readNextField()) {
          ColumnVector colVector = batch.cols[projectionColumnNum];
          colVector.isNull[batchIndex] = true;
          colVector.noNulls = false;
          continue;
        }

        // The current* members of deserializeRead have the field value.
        field = topLevelFields[i];
        if (field.getIsConvert()) {
          convertRowColumn(batch, batchIndex, field, i);
        } else {
          storeRowColumn(batch, batchIndex, field, i, /* canRetainByteRef */ true);
        }
      }
    } else {
      final int readFieldCount = readFieldLogicalIndices.length;
      for (int i = 0; i < readFieldCount; i++) {
        final int logicalIndex = readFieldLogicalIndices[i];

        // Jump to the field we want and read it.
        if (!deserializeRead.readField(logicalIndex)) {
          ColumnVector colVector = batch.cols[projectionColumnNums[logicalIndex]];
          colVector.isNull[batchIndex] = true;
          colVector.noNulls = false;
          continue;
        }

        // The current* members of deserializeRead have the field value.
        field = topLevelFields[logicalIndex];
        if (field.getIsConvert()) {
          convertRowColumn(batch, batchIndex, field, logicalIndex);
        } else {
          storeRowColumn(batch, batchIndex, field, logicalIndex, /* canRetainByteRef */ true);
        }
      }
    }
  }
  public String getDetailedReadPositionString() {
    return deserializeRead.getDetailedReadPositionString();
  }
}