/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.AbstractMapOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorMapOperatorReadType;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Writable;

import com.google.common.base.Preconditions;

/*
 *
 * The vectorized MapOperator.
 *
 * There are 3 modes of reading for vectorization:
 *
 *   1) One for the Vectorized Input File Format which returns VectorizedRowBatch as the row.
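 *      (For example, the vectorized ORC reader hands back whole VectorizedRowBatch objects.)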
 *
 *   2) One for using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
 *      Currently, these Input File Formats are supported:
 *        TEXTFILE
 *        SEQUENCEFILE
 *
 *   3) And one using the regular partition deserializer to get the row object and assigning
 *      the row object into the VectorizedRowBatch with VectorAssignRow.
 *      This picks up Input File Formats not supported by the other two.
 */
public class VectorMapOperator extends AbstractMapOperator {

  private static final long serialVersionUID = 1L;

  /*
   * Overall information on this vectorized Map operation.
   */
  private transient HashMap<String, VectorPartitionContext> fileToPartitionContextMap;

  private transient Operator<? extends OperatorDesc> oneRootOperator;

  private transient TypeInfo tableStructTypeInfo;
  private transient StandardStructObjectInspector tableStandardStructObjectInspector;

  private transient TypeInfo[] tableRowTypeInfos;

  private transient int[] dataColumnNums;

  private transient StandardStructObjectInspector neededStandardStructObjectInspector;

  private transient VectorizedRowBatchCtx batchContext;
              // The context for creating the VectorizedRowBatch for this Map node that
              // the Vectorizer class determined.

  /*
   * A different batch for vectorized Input File Format readers so they can do their work
   * overlapped with work of the row collection that vector/row deserialization does. This allows
   * the partitions to mix modes (e.g. for us to flush the previously batched rows on file change).
   */
  private transient VectorizedRowBatch vectorizedInputFileFormatBatch;

  /*
   * This batch is only used by vector/row deserializer readers.
   */
  private transient VectorizedRowBatch deserializerBatch;

  private transient long batchCounter;

  private transient int dataColumnCount;
  private transient int partitionColumnCount;
  private transient Object[] partitionValues;

  private transient boolean[] dataColumnsToIncludeTruncated;

  /*
   * The following members have context information for the current partition file being read.
   */
  private transient VectorMapOperatorReadType currentReadType;
  private transient VectorPartitionContext currentVectorPartContext;
              // Current vector map operator read type and context.

  private transient int currentDataColumnCount;
              // The number of data columns that the current reader will return.
              // Only applicable for vector/row deserialization.

  private transient DeserializeRead currentDeserializeRead;
  private transient VectorDeserializeRow currentVectorDeserializeRow;
              // When we are doing vector deserialization, these are the fast deserializer and
              // the vector row deserializer.

  private Deserializer currentPartDeserializer;
  private StructObjectInspector currentPartRawRowObjectInspector;
  private VectorAssignRow currentVectorAssign;
              // When we are doing row deserialization, these are the regular deserializer,
              // partition object inspector, and vector row assigner.

  /*
   * The abstract context for the 3 kinds of vectorized reading.
   */
  protected abstract class VectorPartitionContext {

    protected final PartitionDesc partDesc;

    String tableName;
    String partName;

    /*
     * Initialization here is adapted from MapOperator.MapOpCtx.initObjectInspector method.
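     * It captures the table name and partition specification that are later pushed to the
     * operator tree via setInputContext when the input file changes.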
     */
    private VectorPartitionContext(PartitionDesc partDesc) {
      this.partDesc = partDesc;

      TableDesc td = partDesc.getTableDesc();

      // Use table properties in case of unpartitioned tables,
      // and the union of table properties and partition properties, with partition
      // taking precedence, in the case of partitioned tables
      Properties overlayedProps =
          SerDeUtils.createOverlayedProperties(td.getProperties(), partDesc.getProperties());

      Map<String, String> partSpec = partDesc.getPartSpec();

      tableName = String.valueOf(overlayedProps.getProperty("name"));
      partName = String.valueOf(partSpec);
    }

    public PartitionDesc getPartDesc() {
      return partDesc;
    }

    /*
     * Override this for concrete initialization.
     */
    public abstract void init(Configuration hconf)
        throws SerDeException, Exception;

    /*
     * How many data columns is the partition reader actually supplying?
     */
    public abstract int getReaderDataColumnCount();
  }

  /*
   * Context for reading a Vectorized Input File Format.
   */
  protected class VectorizedInputFileFormatPartitionContext extends VectorPartitionContext {

    private VectorizedInputFileFormatPartitionContext(PartitionDesc partDesc) {
      super(partDesc);
    }

    public void init(Configuration hconf) {
    }

    @Override
    public int getReaderDataColumnCount() {
      throw new RuntimeException("Not applicable");
    }
  }

  /*
   * Context for using VectorDeserializeRow to deserialize each row from the Input File Format
   * into the VectorizedRowBatch.
   */
  protected class VectorDeserializePartitionContext extends VectorPartitionContext {

    // This helper object deserializes known deserialization / input file format combination into
    // columns of a row in a vectorized row batch.
    private VectorDeserializeRow vectorDeserializeRow;

    private DeserializeRead deserializeRead;

    private int readerColumnCount;

    private VectorDeserializePartitionContext(PartitionDesc partDesc) {
      super(partDesc);
    }

    public VectorDeserializeRow getVectorDeserializeRow() {
      return vectorDeserializeRow;
    }

    DeserializeRead getDeserializeRead() {
      return deserializeRead;
    }

    @Override
    public int getReaderDataColumnCount() {
      return readerColumnCount;
    }

    public void init(Configuration hconf)
        throws SerDeException, HiveException {
      VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();

      // This type information specifies the data types the partition needs to read.
      TypeInfo[] dataTypeInfos = vectorPartDesc.getDataTypeInfos();

      // We need to provide the minimum number of columns to be read so
      // LazySimpleDeserializeRead's separator parser does not waste time.
      //
      Preconditions.checkState(dataColumnsToIncludeTruncated != null);
      TypeInfo[] minimalDataTypeInfos;
      if (dataColumnsToIncludeTruncated.length < dataTypeInfos.length) {
        minimalDataTypeInfos =
            Arrays.copyOf(dataTypeInfos, dataColumnsToIncludeTruncated.length);
      } else {
        minimalDataTypeInfos = dataTypeInfos;
      }

      readerColumnCount = minimalDataTypeInfos.length;

      switch (vectorPartDesc.getVectorDeserializeType()) {
      case LAZY_SIMPLE:
        {
          LazySerDeParameters simpleSerdeParams =
              new LazySerDeParameters(hconf, partDesc.getTableDesc().getProperties(),
                  LazySimpleSerDe.class.getName());

          LazySimpleDeserializeRead lazySimpleDeserializeRead =
              new LazySimpleDeserializeRead(
                  minimalDataTypeInfos,
                  /* useExternalBuffer */ true,
                  simpleSerdeParams);

          vectorDeserializeRow =
              new VectorDeserializeRow<LazySimpleDeserializeRead>(lazySimpleDeserializeRead);

          // Initialize with data row type conversion parameters.
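          // (tableRowTypeInfos holds the table schema types; dataColumnsToIncludeTruncated
          // limits the conversion to just the columns the query actually reads.)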
          vectorDeserializeRow.initConversion(tableRowTypeInfos, dataColumnsToIncludeTruncated);

          deserializeRead = lazySimpleDeserializeRead;
        }
        break;

      case LAZY_BINARY:
        {
          LazyBinaryDeserializeRead lazyBinaryDeserializeRead =
              new LazyBinaryDeserializeRead(
                  dataTypeInfos,
                  /* useExternalBuffer */ true);

          vectorDeserializeRow =
              new VectorDeserializeRow<LazyBinaryDeserializeRead>(lazyBinaryDeserializeRead);

          // Initialize with data row type conversion parameters.
          vectorDeserializeRow.initConversion(tableRowTypeInfos, dataColumnsToIncludeTruncated);

          deserializeRead = lazyBinaryDeserializeRead;
        }
        break;

      default:
        throw new RuntimeException(
            "Unexpected vector deserialize row type " +
                vectorPartDesc.getVectorDeserializeType().name());
      }
    }
  }

  /*
   * Context for reading using the regular partition deserializer to get the row object and
   * assigning the row object into the VectorizedRowBatch with VectorAssignRow
   */
  protected class RowDeserializePartitionContext extends VectorPartitionContext {

    private Deserializer partDeserializer;

    private StructObjectInspector partRawRowObjectInspector;

    private VectorAssignRow vectorAssign;

    private int readerColumnCount;

    private RowDeserializePartitionContext(PartitionDesc partDesc) {
      super(partDesc);
    }

    public Deserializer getPartDeserializer() {
      return partDeserializer;
    }

    public StructObjectInspector getPartRawRowObjectInspector() {
      return partRawRowObjectInspector;
    }

    public VectorAssignRow getVectorAssign() {
      return vectorAssign;
    }

    @Override
    public int getReaderDataColumnCount() {
      return readerColumnCount;
    }

    public void init(Configuration hconf) throws Exception {
      VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();

      partDeserializer = partDesc.getDeserializer(hconf);

      if (partDeserializer instanceof OrcSerde) {

        // UNDONE: We need to get the table schema inspector from self-describing Input File
        //         Formats like ORC. Modify the ORC serde instead? For now, this works.

        partRawRowObjectInspector =
            (StructObjectInspector) OrcStruct.createObjectInspector(tableStructTypeInfo);

      } else {
        partRawRowObjectInspector =
            (StructObjectInspector) partDeserializer.getObjectInspector();
      }

      TypeInfo[] dataTypeInfos = vectorPartDesc.getDataTypeInfos();

      vectorAssign = new VectorAssignRow();

      // Initialize with data type conversion parameters.
      readerColumnCount =
          vectorAssign.initConversion(dataTypeInfos, tableRowTypeInfos,
              dataColumnsToIncludeTruncated);
    }
  }

  public VectorPartitionContext createAndInitPartitionContext(PartitionDesc partDesc,
      Configuration hconf)
          throws SerDeException, Exception {

    VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
    VectorPartitionContext vectorPartitionContext;
    VectorMapOperatorReadType vectorMapOperatorReadType =
        vectorPartDesc.getVectorMapOperatorReadType();

    if (vectorMapOperatorReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE ||
        vectorMapOperatorReadType == VectorMapOperatorReadType.ROW_DESERIALIZE) {

      // Verify hive.exec.schema.evolution is true or we have an ACID table so we are producing
      // the table schema from ORC. The Vectorizer class assures this.
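      // (Presumably required because both deserialize modes convert from the partition's data
      // types to the table schema types.)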
      boolean isAcid =
          AcidUtils.isTablePropertyTransactional(partDesc.getTableDesc().getProperties());
      Preconditions.checkState(Utilities.isSchemaEvolutionEnabled(hconf, isAcid));
    }

    switch (vectorMapOperatorReadType) {
    case VECTORIZED_INPUT_FILE_FORMAT:
      vectorPartitionContext = new VectorizedInputFileFormatPartitionContext(partDesc);
      break;

    case VECTOR_DESERIALIZE:
      vectorPartitionContext = new VectorDeserializePartitionContext(partDesc);
      break;

    case ROW_DESERIALIZE:
      vectorPartitionContext = new RowDeserializePartitionContext(partDesc);
      break;

    default:
      throw new RuntimeException("Unexpected vector MapOperator read type " +
          vectorMapOperatorReadType.name());
    }

    vectorPartitionContext.init(hconf);

    return vectorPartitionContext;
  }

  private void determineDataColumnsToIncludeTruncated() {

    Preconditions.checkState(batchContext != null);
    Preconditions.checkState(dataColumnNums != null);

    boolean[] columnsToInclude = new boolean[dataColumnCount];
    final int count = dataColumnNums.length;
    int columnNum = -1;
    for (int i = 0; i < count; i++) {
      columnNum = dataColumnNums[i];
      Preconditions.checkState(columnNum < dataColumnCount);
      columnsToInclude[columnNum] = true;
    }

    if (columnNum == -1) {
      dataColumnsToIncludeTruncated = new boolean[0];
    } else {
      dataColumnsToIncludeTruncated = Arrays.copyOf(columnsToInclude, columnNum + 1);
    }
  }

  /** Kryo ctor. */
  public VectorMapOperator() {
    super();
  }

  public VectorMapOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  /*
   * This is the same as the setChildren method below but for empty tables.
   */
  @Override
  public void initEmptyInputChildren(List<Operator<?>> children, Configuration hconf)
      throws SerDeException, Exception {

    // Get the single TableScanOperator. Vectorization only supports one input tree.
    Preconditions.checkState(children.size() == 1);
    oneRootOperator = children.get(0);

    internalSetChildren(hconf);
  }

  @Override
  public void setChildren(Configuration hconf) throws Exception {

    // Get the single TableScanOperator. Vectorization only supports one input tree.
    Iterator<Operator<? extends OperatorDesc>> aliasToWorkIterator =
        conf.getAliasToWork().values().iterator();
    oneRootOperator = aliasToWorkIterator.next();
    Preconditions.checkState(!aliasToWorkIterator.hasNext());

    internalSetChildren(hconf);
  }

  /*
   * Create information for vector map operator.
   * The member oneRootOperator has been set.
   */
  private void internalSetChildren(Configuration hconf) throws Exception {

    // The setupPartitionContextVars uses the prior read type to flush the prior
    // deserializerBatch, so set it here to none.
    currentReadType = VectorMapOperatorReadType.NONE;

    batchContext = conf.getVectorizedRowBatchCtx();

    /*
     * Use a different batch for vectorized Input File Format readers so they can do their work
     * overlapped with work of the row collection that vector/row deserialization does. This
     * allows the partitions to mix modes (e.g. for us to flush the previously batched rows on
     * file change).
     */
    vectorizedInputFileFormatBatch = batchContext.createVectorizedRowBatch();
    conf.setVectorizedRowBatch(vectorizedInputFileFormatBatch);

    /*
     * This batch is used by vector/row deserializer readers.
     */
    deserializerBatch = batchContext.createVectorizedRowBatch();

    batchCounter = 0;

    dataColumnCount = batchContext.getDataColumnCount();
    partitionColumnCount = batchContext.getPartitionColumnCount();
    partitionValues = new Object[partitionColumnCount];

    dataColumnNums = batchContext.getDataColumnNums();
    Preconditions.checkState(dataColumnNums != null);

    // Form a truncated boolean include array for our vector/row deserializers.
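    // For example, when only data column 2 of 5 is read, dataColumnNums is {2} and the
    // truncated include array becomes {false, false, true}.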
    determineDataColumnsToIncludeTruncated();

    /*
     * Create table related objects
     */
    final String[] rowColumnNames = batchContext.getRowColumnNames();
    final TypeInfo[] rowColumnTypeInfos = batchContext.getRowColumnTypeInfos();
    tableStructTypeInfo =
        TypeInfoFactory.getStructTypeInfo(
            Arrays.asList(rowColumnNames),
            Arrays.asList(rowColumnTypeInfos));
    tableStandardStructObjectInspector =
        (StandardStructObjectInspector)
            TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(tableStructTypeInfo);

    tableRowTypeInfos = batchContext.getRowColumnTypeInfos();

    /*
     * NOTE: We do not alter the projectedColumns / projectionSize of the batches to just be
     * the included columns (+ partition columns).
     *
     * For now, we need to model the object inspector rows because there are still several
     * vectorized operators that use them.
     *
     * We need to continue to model the Object[] as having null objects for not included columns
     * until the following has been fixed:
     *   o When we have to output a STRUCT for AVG we switch to row GroupBy operators.
     *   o Some variations of VectorMapOperator, VectorReduceSinkOperator, VectorFileSinkOperator
     *     use the row super class to process rows.
     */

    /*
     * The Vectorizer class enforces that there is only one TableScanOperator, so
     * we don't need the more complicated multiple root operator mapping that MapOperator has.
     */
    fileToPartitionContextMap = new HashMap<String, VectorPartitionContext>();

    // Temporary map so we only create one partition context entry.
    HashMap<PartitionDesc, VectorPartitionContext> partitionContextMap =
        new HashMap<PartitionDesc, VectorPartitionContext>();

    for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
      Path path = entry.getKey();
      PartitionDesc partDesc = conf.getPathToPartitionInfo().get(path);
      VectorPartitionContext vectorPartitionContext;

      if (!partitionContextMap.containsKey(partDesc)) {
        vectorPartitionContext = createAndInitPartitionContext(partDesc, hconf);
        partitionContextMap.put(partDesc, vectorPartitionContext);
      } else {
        vectorPartitionContext = partitionContextMap.get(partDesc);
      }

      fileToPartitionContextMap.put(path.toString(), vectorPartitionContext);
    }

    // Create list of one.
    List<Operator<? extends OperatorDesc>> children =
        new ArrayList<Operator<? extends OperatorDesc>>();
    children.add(oneRootOperator);

    setChildOperators(children);
  }

  @Override
  public void initializeMapOperator(Configuration hconf) throws HiveException {
    super.initializeMapOperator(hconf);

    oneRootOperator.initialize(hconf, new ObjectInspector[] {tableStandardStructObjectInspector});
  }

  public void initializeContexts() throws HiveException {
    Path fpath = getExecContext().getCurrentInputPath();
    String nominalPath = getNominalPath(fpath);
    setupPartitionContextVars(nominalPath);
  }

  // Find context for current input file
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    super.cleanUpInputFileChangedOp();

    Path fpath = getExecContext().getCurrentInputPath();
    String nominalPath = getNominalPath(fpath);

    setupPartitionContextVars(nominalPath);

    // Add alias, table name, and partitions to hadoop conf so that their
    // children will inherit these
    oneRootOperator.setInputContext(currentVectorPartContext.tableName,
        currentVectorPartContext.partName);
  }

  /*
   * Setup the context for reading from the next partition file.
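   * Called from initializeContexts and from cleanUpInputFileChangedOp when the input file
   * changes.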
   */
  private void setupPartitionContextVars(String nominalPath) throws HiveException {

    currentVectorPartContext = fileToPartitionContextMap.get(nominalPath);
    PartitionDesc partDesc = currentVectorPartContext.getPartDesc();
    VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
    currentReadType = vectorPartDesc.getVectorMapOperatorReadType();

    /*
     * Setup for 3 different kinds of vectorized reading supported:
     *
     *   1) Read the Vectorized Input File Format which returns VectorizedRowBatch as the row.
     *
     *   2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
     *
     *   3) And read using the regular partition deserializer to get the row object and assigning
     *      the row object into the VectorizedRowBatch with VectorAssignRow.
     */
    if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {

      /*
       * The Vectorized Input File Format reader is responsible for setting the partition column
       * values, resetting and filling in the batch, etc.
       */

      /*
       * Clear all the reading variables.
       */
      currentDataColumnCount = 0;

      currentDeserializeRead = null;
      currentVectorDeserializeRow = null;

      currentPartDeserializer = null;
      currentPartRawRowObjectInspector = null;
      currentVectorAssign = null;

    } else {

      /*
       * We will get "regular" single rows from the Input File Format reader that we will need
       * to {vector|row} deserialize.
       */
      Preconditions.checkState(
          currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE ||
          currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);

      if (deserializerBatch.size > 0) {

        /*
         * Flush and clear out any rows in the batch from the previous partition since we are
         * going to change the repeating partition column values.
         */
        batchCounter++;
        oneRootOperator.process(deserializerBatch, 0);
        deserializerBatch.reset();
        if (oneRootOperator.getDone()) {
          setDone(true);
          return;
        }
      }

      /*
       * For this particular file, how many columns will we actually read?
       */
      currentDataColumnCount = currentVectorPartContext.getReaderDataColumnCount();

      if (currentDataColumnCount < dataColumnCount) {

        /*
         * Default any additional data columns to NULL once for the file (if they are present).
         */
        for (int i = currentDataColumnCount; i < dataColumnCount; i++) {
          ColumnVector colVector = deserializerBatch.cols[i];
          if (colVector != null) {
            colVector.isNull[0] = true;
            colVector.noNulls = false;
            colVector.isRepeating = true;
          }
        }
      }

      if (batchContext.getPartitionColumnCount() > 0) {

        /*
         * The partition columns are set once for the partition and are marked repeating.
         */
        VectorizedRowBatchCtx.getPartitionValues(batchContext, partDesc, partitionValues);
        batchContext.addPartitionColsToBatch(deserializerBatch, partitionValues);
      }

      /*
       * Set or clear the rest of the reading variables based on {vector|row} deserialization.
       */
      switch (currentReadType) {
      case VECTOR_DESERIALIZE:
        {
          VectorDeserializePartitionContext vectorDeserPartContext =
              (VectorDeserializePartitionContext) currentVectorPartContext;

          // Set ours.
          currentDeserializeRead = vectorDeserPartContext.getDeserializeRead();
          currentVectorDeserializeRow = vectorDeserPartContext.getVectorDeserializeRow();

          // Clear the other ones.
          currentPartDeserializer = null;
          currentPartRawRowObjectInspector = null;
          currentVectorAssign = null;
        }
        break;

      case ROW_DESERIALIZE:
        {
          RowDeserializePartitionContext rowDeserPartContext =
              (RowDeserializePartitionContext) currentVectorPartContext;

          // Clear the other ones.
          currentDeserializeRead = null;
          currentVectorDeserializeRow = null;

          // Set ours.
          currentPartDeserializer = rowDeserPartContext.getPartDeserializer();
          currentPartRawRowObjectInspector = rowDeserPartContext.getPartRawRowObjectInspector();
          currentVectorAssign = rowDeserPartContext.getVectorAssign();
        }
        break;

      default:
        throw new RuntimeException("Unexpected VectorMapOperator read type " +
            currentReadType.name());
      }
    }
  }

  @Override
  public Deserializer getCurrentDeserializer() {
    // Not applicable.
    return null;
  }

  @Override
  public void process(Writable value) throws HiveException {

    // A mapper can span multiple files/partitions.
    // The VectorPartitionContext needs to be changed if the input file has changed.
    ExecMapperContext context = getExecContext();
    if (context != null && context.inputFileChanged()) {
      // The child operators clean up if the input file has changed.
      cleanUpInputFileChanged();
    }

    if (!oneRootOperator.getDone()) {

      /*
       * 3 different kinds of vectorized reading supported:
       *
       *   1) Read the Vectorized Input File Format which returns VectorizedRowBatch as the row.
       *
       *   2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
       *
       *   3) And read using the regular partition deserializer to get the row object and
       *      assigning the row object into the VectorizedRowBatch with VectorAssignRow.
       */
      try {
        if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {

          /*
           * The Vectorized Input File Format reader has already set the partition column
           * values, reset and filled in the batch, etc.
           *
           * We pass the VectorizedRowBatch through here.
           */
          batchCounter++;
          if (value != null) {
            numRows += ((VectorizedRowBatch) value).size;
          }
          oneRootOperator.process(value, 0);
          if (oneRootOperator.getDone()) {
            setDone(true);
            return;
          }
        } else {

          /*
           * We have "regular" single rows from the Input File Format reader that we will need
           * to deserialize.
           */
          Preconditions.checkState(
              currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE ||
              currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);

          if (deserializerBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
            numRows += deserializerBatch.size;

            /*
             * Feed current full batch to operator tree.
             */
            batchCounter++;
            oneRootOperator.process(deserializerBatch, 0);

            /**
             * Only reset the current data columns. Not any data columns defaulted to NULL
             * because they are not present in the partition, and not partition columns.
             */
            for (int c = 0; c < currentDataColumnCount; c++) {
              ColumnVector colVector = deserializerBatch.cols[c];
              if (colVector != null) {
                colVector.reset();
                colVector.init();
              }
            }
            deserializerBatch.selectedInUse = false;
            deserializerBatch.size = 0;
            deserializerBatch.endOfFile = false;

            if (oneRootOperator.getDone()) {
              setDone(true);
              return;
            }
          }

          /*
           * Do the {vector|row} deserialization of the one row into the VectorizedRowBatch.
           */
          switch (currentReadType) {
          case VECTOR_DESERIALIZE:
            {
              BinaryComparable binComp = (BinaryComparable) value;
              currentDeserializeRead.set(binComp.getBytes(), 0, binComp.getLength());

              // Deserialize and append new row using the current batch size as the index.
              try {
                currentVectorDeserializeRow.deserialize(
                    deserializerBatch, deserializerBatch.size++);
              } catch (Exception e) {
                throw new HiveException(
                    "\nDeserializeRead detail: " +
                        currentVectorDeserializeRow.getDetailedReadPositionString(), e);
              }
            }
            break;

          case ROW_DESERIALIZE:
            {
              Object deserialized = currentPartDeserializer.deserialize(value);

              // Note: Regardless of what the Input File Format returns, we have determined
              // with VectorAssignRow.initConversion that only currentDataColumnCount columns
              // have values we want.
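              // (For example, a partition file written before columns were added to the table
              // supplies fewer columns than the table schema.)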
              //
              // Any extra columns needed by the table schema were set to repeating null
              // in the batch by setupPartitionContextVars.

              // Convert input row to standard objects.
              List<Object> standardObjects = new ArrayList<Object>();
              ObjectInspectorUtils.copyToStandardObject(standardObjects, deserialized,
                  currentPartRawRowObjectInspector,
                  ObjectInspectorCopyOption.WRITABLE);
              if (standardObjects.size() < currentDataColumnCount) {
                throw new HiveException("Input File Format returned row with too few columns");
              }

              // Append the deserialized standard object row using the current batch size
              // as the index.
              currentVectorAssign.assignRow(deserializerBatch, deserializerBatch.size++,
                  standardObjects, currentDataColumnCount);
            }
            break;

          default:
            throw new RuntimeException("Unexpected vector MapOperator read type " +
                currentReadType.name());
          }
        }
      } catch (Exception e) {
        throw new HiveException("Hive Runtime Error while processing row ", e);
      }
    }
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    throw new HiveException("Hive 2 Internal error: should not be called!");
  }

  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (!abort && oneRootOperator != null && !oneRootOperator.getDone() &&
        currentReadType != VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {
      if (deserializerBatch.size > 0) {
        numRows += deserializerBatch.size;
        batchCounter++;
        oneRootOperator.process(deserializerBatch, 0);
        deserializerBatch.size = 0;
      }
    }
    super.closeOp(abort);
  }

  @Override
  public String getName() {
    return getOperatorName();
  }

  static public String getOperatorName() {
    return "MAP";
  }

  @Override
  public OperatorType getType() {
    return null;
  }
}