/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer.physical; import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.UNIFORM; import java.io.Serializable; import java.lang.annotation.Annotation; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.Stack; import java.util.regex.Pattern; import org.apache.commons.lang.ArrayUtils; import org.apache.calcite.util.Pair; import org.apache.commons.lang3.tuple.ImmutablePair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.*; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey; import org.apache.hadoop.hive.ql.exec.spark.SparkTask; import org.apache.hadoop.hive.ql.exec.tez.TezTask; import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyStringOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerStringOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiStringOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterLongOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkEmptyKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkLongOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkMultiKeyOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkObjectHashOperator; import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkStringOperator; import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor; import 
org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.HiveVectorAdaptorUsageMode; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.InConstantType; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion; import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.GraphWalker; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker; import org.apache.hadoop.hive.ql.lib.PreOrderWalker; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc; import org.apache.hadoop.hive.ql.plan.AggregationDesc; import org.apache.hadoop.hive.ql.plan.AppMasterEventDesc; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.Explain; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; import org.apache.hadoop.hive.ql.plan.LimitDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.VectorAppMasterEventDesc; import org.apache.hadoop.hive.ql.plan.VectorFileSinkDesc; import org.apache.hadoop.hive.ql.plan.VectorFilterDesc; import org.apache.hadoop.hive.ql.plan.VectorTableScanDesc; import org.apache.hadoop.hive.ql.plan.VectorizationCondition; import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc.ProcessingMode; import org.apache.hadoop.hive.ql.plan.VectorSparkHashTableSinkDesc; import org.apache.hadoop.hive.ql.plan.VectorSparkPartitionPruningSinkDesc; import org.apache.hadoop.hive.ql.plan.VectorLimitDesc; import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo; import org.apache.hadoop.hive.ql.plan.VectorSMBJoinDesc; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import 
org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.ql.plan.SparkHashTableSinkDesc;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.OperatorVariation;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.ql.plan.VectorSelectDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.udf.UDFAcos;
import org.apache.hadoop.hive.ql.udf.UDFAsin;
import org.apache.hadoop.hive.ql.udf.UDFAtan;
import org.apache.hadoop.hive.ql.udf.UDFBin;
import org.apache.hadoop.hive.ql.udf.UDFConv;
import org.apache.hadoop.hive.ql.udf.UDFCos;
import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth;
import org.apache.hadoop.hive.ql.udf.UDFDayOfWeek;
import org.apache.hadoop.hive.ql.udf.UDFDegrees;
import org.apache.hadoop.hive.ql.udf.UDFExp;
import org.apache.hadoop.hive.ql.udf.UDFFromUnixTime;
import org.apache.hadoop.hive.ql.udf.UDFHex;
import org.apache.hadoop.hive.ql.udf.UDFHour;
import org.apache.hadoop.hive.ql.udf.UDFLike;
import org.apache.hadoop.hive.ql.udf.UDFLn;
import org.apache.hadoop.hive.ql.udf.UDFLog;
import org.apache.hadoop.hive.ql.udf.UDFLog10;
import org.apache.hadoop.hive.ql.udf.UDFLog2;
import org.apache.hadoop.hive.ql.udf.UDFMinute;
import org.apache.hadoop.hive.ql.udf.UDFMonth;
import org.apache.hadoop.hive.ql.udf.UDFRadians;
import org.apache.hadoop.hive.ql.udf.UDFRand;
import org.apache.hadoop.hive.ql.udf.UDFRegExpExtract;
import org.apache.hadoop.hive.ql.udf.UDFRegExpReplace;
import org.apache.hadoop.hive.ql.udf.UDFSecond;
import org.apache.hadoop.hive.ql.udf.UDFSign;
import org.apache.hadoop.hive.ql.udf.UDFSin;
import org.apache.hadoop.hive.ql.udf.UDFSqrt;
import org.apache.hadoop.hive.ql.udf.UDFSubstr;
import org.apache.hadoop.hive.ql.udf.UDFTan;
import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
import org.apache.hadoop.hive.ql.udf.UDFToByte;
import org.apache.hadoop.hive.ql.udf.UDFToDouble;
import org.apache.hadoop.hive.ql.udf.UDFToFloat;
import org.apache.hadoop.hive.ql.udf.UDFToInteger;
import org.apache.hadoop.hive.ql.udf.UDFToLong;
import org.apache.hadoop.hive.ql.udf.UDFToShort;
import org.apache.hadoop.hive.ql.udf.UDFToString;
import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
import org.apache.hadoop.hive.ql.udf.UDFYear;
import org.apache.hadoop.hive.ql.udf.generic.*;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.Deserializer;
import
org.apache.hadoop.hive.serde2.NullStructSerDe; import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hive.common.util.AnnotationUtils; import org.apache.hadoop.util.ReflectionUtils; import com.google.common.base.Preconditions; public class Vectorizer implements PhysicalPlanResolver { protected static transient final Logger LOG = LoggerFactory.getLogger(Vectorizer.class); static Pattern supportedDataTypesPattern; static { StringBuilder patternBuilder = new StringBuilder(); patternBuilder.append("int"); patternBuilder.append("|smallint"); patternBuilder.append("|tinyint"); patternBuilder.append("|bigint"); patternBuilder.append("|integer"); patternBuilder.append("|long"); patternBuilder.append("|short"); patternBuilder.append("|timestamp"); patternBuilder.append("|" + serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME); patternBuilder.append("|" + serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME); patternBuilder.append("|boolean"); patternBuilder.append("|binary"); patternBuilder.append("|string"); patternBuilder.append("|byte"); patternBuilder.append("|float"); patternBuilder.append("|double"); patternBuilder.append("|date"); patternBuilder.append("|void"); // Decimal types can be specified with different precision and scales e.g. decimal(10,5), // as opposed to other data types which can be represented by constant strings. // The regex matches only the "decimal" prefix of the type. patternBuilder.append("|decimal.*"); // CHAR and VARCHAR types can be specified with maximum length. patternBuilder.append("|char.*"); patternBuilder.append("|varchar.*"); supportedDataTypesPattern = Pattern.compile(patternBuilder.toString()); } private Set<Class<?>> supportedGenericUDFs = new HashSet<Class<?>>(); private Set<String> supportedAggregationUdfs = new HashSet<String>(); private HiveConf hiveConf; private boolean useVectorizedInputFileFormat; private boolean useVectorDeserialize; private boolean useRowDeserialize; private boolean isReduceVectorizationEnabled; private boolean isSchemaEvolution; private HiveVectorAdaptorUsageMode hiveVectorAdaptorUsageMode; private BaseWork currentBaseWork; private Operator<? 
extends OperatorDesc> currentOperator; public void testSetCurrentBaseWork(BaseWork testBaseWork) { currentBaseWork = testBaseWork; } private void setNodeIssue(String issue) { currentBaseWork.setNotVectorizedReason( VectorizerReason.createNodeIssue(issue)); } private void setOperatorIssue(String issue) { currentBaseWork.setNotVectorizedReason( VectorizerReason.createOperatorIssue(currentOperator, issue)); } private void setExpressionIssue(String expressionTitle, String issue) { currentBaseWork.setNotVectorizedReason( VectorizerReason.createExpressionIssue(currentOperator, expressionTitle, issue)); } private void clearNotVectorizedReason() { currentBaseWork.setNotVectorizedReason(null); } private long vectorizedVertexNum = -1; public Vectorizer() { /* * We check UDFs against the supportedGenericUDFs when * hive.vectorized.adaptor.usage.mode=chosen or none. * * We allow all UDFs for hive.vectorized.adaptor.usage.mode=all. */ supportedGenericUDFs.add(GenericUDFOPPlus.class); supportedGenericUDFs.add(GenericUDFOPMinus.class); supportedGenericUDFs.add(GenericUDFOPMultiply.class); supportedGenericUDFs.add(GenericUDFOPDivide.class); supportedGenericUDFs.add(GenericUDFOPMod.class); supportedGenericUDFs.add(GenericUDFOPNegative.class); supportedGenericUDFs.add(GenericUDFOPPositive.class); supportedGenericUDFs.add(GenericUDFOPEqualOrLessThan.class); supportedGenericUDFs.add(GenericUDFOPEqualOrGreaterThan.class); supportedGenericUDFs.add(GenericUDFOPGreaterThan.class); supportedGenericUDFs.add(GenericUDFOPLessThan.class); supportedGenericUDFs.add(GenericUDFOPNot.class); supportedGenericUDFs.add(GenericUDFOPNotEqual.class); supportedGenericUDFs.add(GenericUDFOPNotNull.class); supportedGenericUDFs.add(GenericUDFOPNull.class); supportedGenericUDFs.add(GenericUDFOPOr.class); supportedGenericUDFs.add(GenericUDFOPAnd.class); supportedGenericUDFs.add(GenericUDFOPEqual.class); supportedGenericUDFs.add(GenericUDFLength.class); supportedGenericUDFs.add(GenericUDFCharacterLength.class); supportedGenericUDFs.add(GenericUDFOctetLength.class); supportedGenericUDFs.add(UDFYear.class); supportedGenericUDFs.add(UDFMonth.class); supportedGenericUDFs.add(UDFDayOfMonth.class); supportedGenericUDFs.add(UDFDayOfWeek.class); supportedGenericUDFs.add(UDFHour.class); supportedGenericUDFs.add(UDFMinute.class); supportedGenericUDFs.add(UDFSecond.class); supportedGenericUDFs.add(UDFWeekOfYear.class); supportedGenericUDFs.add(GenericUDFToUnixTimeStamp.class); supportedGenericUDFs.add(UDFFromUnixTime.class); supportedGenericUDFs.add(GenericUDFDateAdd.class); supportedGenericUDFs.add(GenericUDFDateSub.class); supportedGenericUDFs.add(GenericUDFDate.class); supportedGenericUDFs.add(GenericUDFDateDiff.class); supportedGenericUDFs.add(UDFLike.class); supportedGenericUDFs.add(GenericUDFRegExp.class); supportedGenericUDFs.add(UDFRegExpExtract.class); supportedGenericUDFs.add(UDFRegExpReplace.class); supportedGenericUDFs.add(UDFSubstr.class); supportedGenericUDFs.add(GenericUDFLTrim.class); supportedGenericUDFs.add(GenericUDFRTrim.class); supportedGenericUDFs.add(GenericUDFTrim.class); supportedGenericUDFs.add(UDFSin.class); supportedGenericUDFs.add(UDFCos.class); supportedGenericUDFs.add(UDFTan.class); supportedGenericUDFs.add(UDFAsin.class); supportedGenericUDFs.add(UDFAcos.class); supportedGenericUDFs.add(UDFAtan.class); supportedGenericUDFs.add(UDFDegrees.class); supportedGenericUDFs.add(UDFRadians.class); supportedGenericUDFs.add(GenericUDFFloor.class); supportedGenericUDFs.add(GenericUDFCeil.class); 
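    // The registrations continue below with the remaining math functions (exp/log/power/rounding),
    // string case and concatenation, the BETWEEN/IN/CASE/WHEN family, and then the explicit
    // type-cast and conditional (IF) UDFs.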
supportedGenericUDFs.add(UDFExp.class); supportedGenericUDFs.add(UDFLn.class); supportedGenericUDFs.add(UDFLog2.class); supportedGenericUDFs.add(UDFLog10.class); supportedGenericUDFs.add(UDFLog.class); supportedGenericUDFs.add(GenericUDFPower.class); supportedGenericUDFs.add(GenericUDFRound.class); supportedGenericUDFs.add(GenericUDFBRound.class); supportedGenericUDFs.add(GenericUDFPosMod.class); supportedGenericUDFs.add(UDFSqrt.class); supportedGenericUDFs.add(UDFSign.class); supportedGenericUDFs.add(UDFRand.class); supportedGenericUDFs.add(UDFBin.class); supportedGenericUDFs.add(UDFHex.class); supportedGenericUDFs.add(UDFConv.class); supportedGenericUDFs.add(GenericUDFLower.class); supportedGenericUDFs.add(GenericUDFUpper.class); supportedGenericUDFs.add(GenericUDFConcat.class); supportedGenericUDFs.add(GenericUDFAbs.class); supportedGenericUDFs.add(GenericUDFBetween.class); supportedGenericUDFs.add(GenericUDFIn.class); supportedGenericUDFs.add(GenericUDFCase.class); supportedGenericUDFs.add(GenericUDFWhen.class); supportedGenericUDFs.add(GenericUDFCoalesce.class); supportedGenericUDFs.add(GenericUDFNvl.class); supportedGenericUDFs.add(GenericUDFElt.class); supportedGenericUDFs.add(GenericUDFInitCap.class); supportedGenericUDFs.add(GenericUDFInBloomFilter.class); // For type casts supportedGenericUDFs.add(UDFToLong.class); supportedGenericUDFs.add(UDFToInteger.class); supportedGenericUDFs.add(UDFToShort.class); supportedGenericUDFs.add(UDFToByte.class); supportedGenericUDFs.add(UDFToBoolean.class); supportedGenericUDFs.add(UDFToFloat.class); supportedGenericUDFs.add(UDFToDouble.class); supportedGenericUDFs.add(UDFToString.class); supportedGenericUDFs.add(GenericUDFTimestamp.class); supportedGenericUDFs.add(GenericUDFToDecimal.class); supportedGenericUDFs.add(GenericUDFToDate.class); supportedGenericUDFs.add(GenericUDFToChar.class); supportedGenericUDFs.add(GenericUDFToVarchar.class); supportedGenericUDFs.add(GenericUDFToIntervalYearMonth.class); supportedGenericUDFs.add(GenericUDFToIntervalDayTime.class); // For conditional expressions supportedGenericUDFs.add(GenericUDFIf.class); supportedAggregationUdfs.add("min"); supportedAggregationUdfs.add("max"); supportedAggregationUdfs.add("count"); supportedAggregationUdfs.add("sum"); supportedAggregationUdfs.add("avg"); supportedAggregationUdfs.add("variance"); supportedAggregationUdfs.add("var_pop"); supportedAggregationUdfs.add("var_samp"); supportedAggregationUdfs.add("std"); supportedAggregationUdfs.add("stddev"); supportedAggregationUdfs.add("stddev_pop"); supportedAggregationUdfs.add("stddev_samp"); supportedAggregationUdfs.add("bloom_filter"); } private class VectorTaskColumnInfo { List<String> allColumnNames; List<TypeInfo> allTypeInfos; List<Integer> dataColumnNums; int partitionColumnCount; boolean useVectorizedInputFileFormat; boolean groupByVectorOutput; boolean allNative; boolean usesVectorUDFAdaptor; String[] scratchTypeNameArray; Set<Operator<? 
extends OperatorDesc>> nonVectorizedOps; String reduceColumnSortOrder; String reduceColumnNullOrder; VectorTaskColumnInfo() { partitionColumnCount = 0; } public void assume() { groupByVectorOutput = true; allNative = true; usesVectorUDFAdaptor = false; } public void setAllColumnNames(List<String> allColumnNames) { this.allColumnNames = allColumnNames; } public void setAllTypeInfos(List<TypeInfo> allTypeInfos) { this.allTypeInfos = allTypeInfos; } public void setDataColumnNums(List<Integer> dataColumnNums) { this.dataColumnNums = dataColumnNums; } public void setPartitionColumnCount(int partitionColumnCount) { this.partitionColumnCount = partitionColumnCount; } public void setScratchTypeNameArray(String[] scratchTypeNameArray) { this.scratchTypeNameArray = scratchTypeNameArray; } public void setGroupByVectorOutput(boolean groupByVectorOutput) { this.groupByVectorOutput = groupByVectorOutput; } public void setAllNative(boolean allNative) { this.allNative = allNative; } public void setUsesVectorUDFAdaptor(boolean usesVectorUDFAdaptor) { this.usesVectorUDFAdaptor = usesVectorUDFAdaptor; } public void setUseVectorizedInputFileFormat(boolean useVectorizedInputFileFormat) { this.useVectorizedInputFileFormat = useVectorizedInputFileFormat; } public void setNonVectorizedOps(Set<Operator<? extends OperatorDesc>> nonVectorizedOps) { this.nonVectorizedOps = nonVectorizedOps; } public Set<Operator<? extends OperatorDesc>> getNonVectorizedOps() { return nonVectorizedOps; } public void setReduceColumnSortOrder(String reduceColumnSortOrder) { this.reduceColumnSortOrder = reduceColumnSortOrder; } public void setReduceColumnNullOrder(String reduceColumnNullOrder) { this.reduceColumnNullOrder = reduceColumnNullOrder; } public void transferToBaseWork(BaseWork baseWork) { String[] allColumnNameArray = allColumnNames.toArray(new String[0]); TypeInfo[] allTypeInfoArray = allTypeInfos.toArray(new TypeInfo[0]); int[] dataColumnNumsArray; if (dataColumnNums != null) { dataColumnNumsArray = ArrayUtils.toPrimitive(dataColumnNums.toArray(new Integer[0])); } else { dataColumnNumsArray = null; } VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx( allColumnNameArray, allTypeInfoArray, dataColumnNumsArray, partitionColumnCount, scratchTypeNameArray); baseWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx); if (baseWork instanceof MapWork) { MapWork mapWork = (MapWork) baseWork; mapWork.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); } if (baseWork instanceof ReduceWork) { ReduceWork reduceWork = (ReduceWork) baseWork; reduceWork.setVectorReduceColumnSortOrder(reduceColumnSortOrder); reduceWork.setVectorReduceColumnNullOrder(reduceColumnNullOrder); } baseWork.setAllNative(allNative); baseWork.setGroupByVectorOutput(groupByVectorOutput); baseWork.setUsesVectorUDFAdaptor(usesVectorUDFAdaptor); } } class VectorizationDispatcher implements Dispatcher { @Override public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException { Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd; if (currTask instanceof MapRedTask) { MapredWork mapredWork = ((MapRedTask) currTask).getWork(); convertMapWork(mapredWork.getMapWork(), false); ReduceWork reduceWork = mapredWork.getReduceWork(); if (reduceWork != null) { // Always set the EXPLAIN conditions. setReduceWorkExplainConditions(reduceWork); // We do not vectorize MR Reduce. 
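          // Only the EXPLAIN conditions are recorded here; convertReduceWork() is invoked
          // solely from the Tez and Spark branches below.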
} } else if (currTask instanceof TezTask) { TezWork work = ((TezTask) currTask).getWork(); for (BaseWork baseWork: work.getAllWork()) { if (baseWork instanceof MapWork) { convertMapWork((MapWork) baseWork, true); } else if (baseWork instanceof ReduceWork) { ReduceWork reduceWork = (ReduceWork) baseWork; // Always set the EXPLAIN conditions. setReduceWorkExplainConditions(reduceWork); // We are only vectorizing Reduce under Tez/Spark. if (isReduceVectorizationEnabled) { convertReduceWork(reduceWork); } } } } else if (currTask instanceof SparkTask) { SparkWork sparkWork = (SparkWork) currTask.getWork(); for (BaseWork baseWork : sparkWork.getAllWork()) { if (baseWork instanceof MapWork) { convertMapWork((MapWork) baseWork, true); } else if (baseWork instanceof ReduceWork) { ReduceWork reduceWork = (ReduceWork) baseWork; // Always set the EXPLAIN conditions. setReduceWorkExplainConditions(reduceWork); if (isReduceVectorizationEnabled) { convertReduceWork(reduceWork); } } } } return null; } private void convertMapWork(MapWork mapWork, boolean isTezOrSpark) throws SemanticException { mapWork.setVectorizationExamined(true); // Global used when setting errors, etc. currentBaseWork = mapWork; VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo(); vectorTaskColumnInfo.assume(); mapWork.setVectorizedVertexNum(++vectorizedVertexNum); boolean ret; try { ret = validateMapWork(mapWork, vectorTaskColumnInfo, isTezOrSpark); } catch (Exception e) { String issue = "exception: " + VectorizationContext.getStackTraceAsSingleLine(e); setNodeIssue(issue); ret = false; } if (ret) { vectorizeMapWork(mapWork, vectorTaskColumnInfo, isTezOrSpark); } else if (currentBaseWork.getVectorizationEnabled()) { VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason(); if (notVectorizedReason == null) { LOG.info("Cannot vectorize: unknown"); } else { LOG.info("Cannot vectorize: " + notVectorizedReason.toString()); } clearMapWorkVectorDescs(mapWork); } } private void addMapWorkRules(Map<Rule, NodeProcessor> opRules, NodeProcessor np) { opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*" + FileSinkOperator.getOperatorName()), np); opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*" + ReduceSinkOperator.getOperatorName()), np); } /* * Determine if there is only one TableScanOperator. Currently in Map vectorization, we do not * try to vectorize multiple input trees. */ private ImmutablePair<String, TableScanOperator> verifyOnlyOneTableScanOperator(MapWork mapWork) { // Eliminate MR plans with more than one TableScanOperator. LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = mapWork.getAliasToWork(); if ((aliasToWork == null) || (aliasToWork.size() == 0)) { setNodeIssue("Vectorized map work requires work"); return null; } int tableScanCount = 0; String alias = ""; TableScanOperator tableScanOperator = null; for (Entry<String, Operator<? 
extends OperatorDesc>> entry : aliasToWork.entrySet()) { Operator<?> op = entry.getValue(); if (op == null) { setNodeIssue("Vectorized map work requires a valid alias"); return null; } if (op instanceof TableScanOperator) { tableScanCount++; alias = entry.getKey(); tableScanOperator = (TableScanOperator) op; } } if (tableScanCount > 1) { setNodeIssue("Vectorized map work only works with 1 TableScanOperator"); return null; } return new ImmutablePair(alias, tableScanOperator); } private void getTableScanOperatorSchemaInfo(TableScanOperator tableScanOperator, List<String> logicalColumnNameList, List<TypeInfo> logicalTypeInfoList) { // Add all non-virtual columns to make a vectorization context for // the TableScan operator. RowSchema rowSchema = tableScanOperator.getSchema(); for (ColumnInfo c : rowSchema.getSignature()) { // Validation will later exclude vectorization of virtual columns usage (HIVE-5560). if (!isVirtualColumn(c)) { String columnName = c.getInternalName(); String typeName = c.getTypeName(); TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName); logicalColumnNameList.add(columnName); logicalTypeInfoList.add(typeInfo); } } } private void determineDataColumnNums(TableScanOperator tableScanOperator, List<String> allColumnNameList, int dataColumnCount, List<Integer> dataColumnNums) { /* * The TableScanOperator's needed columns are just the data columns. */ Set<String> neededColumns = new HashSet<String>(tableScanOperator.getNeededColumns()); for (int dataColumnNum = 0; dataColumnNum < dataColumnCount; dataColumnNum++) { String columnName = allColumnNameList.get(dataColumnNum); if (neededColumns.contains(columnName)) { dataColumnNums.add(dataColumnNum); } } } /* * There are 3 modes of reading for vectorization: * * 1) One for the Vectorized Input File Format which returns VectorizedRowBatch as the row. * * 2) One for using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch. * Currently, these Input File Formats: * TEXTFILE * SEQUENCEFILE * * 3) And one using the regular partition deserializer to get the row object and assigning * the row object into the VectorizedRowBatch with VectorAssignRow. * This picks up Input File Format not supported by the other two. */ private boolean verifyAndSetVectorPartDesc(PartitionDesc pd, boolean isAcidTable, HashSet<String> inputFileFormatClassNameSet, HashSet<String> enabledConditionsMetSet, ArrayList<String> enabledConditionsNotMetList) { String inputFileFormatClassName = pd.getInputFileFormatClassName(); // Always collect input file formats. inputFileFormatClassNameSet.add(inputFileFormatClassName); boolean isInputFileFormatVectorized = Utilities.isInputFileFormatVectorized(pd); if (isAcidTable) { // Today, ACID tables are only ORC and that format is vectorizable. Verify these // assumptions. 
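        // For ACID tables the only supported path is the pass-through VectorizedInputFileFormat
        // mode; the vector-deserialize and row-deserialize fall-backs below are never offered.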
Preconditions.checkState(isInputFileFormatVectorized); Preconditions.checkState(inputFileFormatClassName.equals(OrcInputFormat.class.getName())); if (!useVectorizedInputFileFormat) { enabledConditionsNotMetList.add( "Vectorizing ACID tables requires " + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); return false; } pd.setVectorPartitionDesc( VectorPartitionDesc.createVectorizedInputFileFormat( inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd))); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); return true; } // Look for Pass-Thru case where InputFileFormat has VectorizedInputFormatInterface // and reads VectorizedRowBatch as a "row". if (useVectorizedInputFileFormat) { if (isInputFileFormatVectorized) { pd.setVectorPartitionDesc( VectorPartitionDesc.createVectorizedInputFileFormat( inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd))); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); return true; } // Fall through and look for other options... } if (!isSchemaEvolution) { enabledConditionsNotMetList.add( "Vectorizing tables without Schema Evolution requires " + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); return false; } String deserializerClassName = pd.getDeserializerClassName(); // Look for InputFileFormat / Serde combinations we can deserialize more efficiently // using VectorDeserializeRow and a deserialize class with the DeserializeRead interface. // // Do the "vectorized" row-by-row deserialization into a VectorizedRowBatch in the // VectorMapOperator. boolean isTextFormat = inputFileFormatClassName.equals(TextInputFormat.class.getName()) && deserializerClassName.equals(LazySimpleSerDe.class.getName()); boolean isSequenceFormat = inputFileFormatClassName.equals(SequenceFileInputFormat.class.getName()) && deserializerClassName.equals(LazyBinarySerDe.class.getName()); boolean isVectorDeserializeEligable = isTextFormat || isSequenceFormat; if (useVectorDeserialize) { // Currently, we support LazySimple deserialization: // // org.apache.hadoop.mapred.TextInputFormat // org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe // // AND // // org.apache.hadoop.mapred.SequenceFileInputFormat // org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe if (isTextFormat) { Properties properties = pd.getTableDesc().getProperties(); String lastColumnTakesRestString = properties.getProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST); boolean lastColumnTakesRest = (lastColumnTakesRestString != null && lastColumnTakesRestString.equalsIgnoreCase("true")); if (lastColumnTakesRest) { // If row mode will not catch this input file format, then not enabled. 
if (useRowDeserialize) { enabledConditionsNotMetList.add( inputFileFormatClassName + " " + serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST + " must be disabled "); return false; } } else { pd.setVectorPartitionDesc( VectorPartitionDesc.createVectorDeserialize( inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE)); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname); return true; } } else if (isSequenceFormat) { pd.setVectorPartitionDesc( VectorPartitionDesc.createVectorDeserialize( inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY)); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname); return true; } // Fall through and look for other options... } // Otherwise, if enabled, deserialize rows using regular Serde and add the object // inspect-able Object[] row to a VectorizedRowBatch in the VectorMapOperator. if (useRowDeserialize) { pd.setVectorPartitionDesc( VectorPartitionDesc.createRowDeserialize( inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd), deserializerClassName)); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname); return true; } if (isInputFileFormatVectorized) { Preconditions.checkState(!useVectorizedInputFileFormat); enabledConditionsNotMetList.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); } else { // Only offer these when the input file format is not the fast vectorized formats. if (isVectorDeserializeEligable) { Preconditions.checkState(!useVectorDeserialize); enabledConditionsNotMetList.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname); } else { // Since row mode takes everyone. enabledConditionsNotMetList.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname); } } return false; } private ImmutablePair<Boolean, Boolean> validateInputFormatAndSchemaEvolution(MapWork mapWork, String alias, TableScanOperator tableScanOperator, VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException { boolean isAcidTable = tableScanOperator.getConf().isAcidTable(); // These names/types are the data columns plus partition columns. final List<String> allColumnNameList = new ArrayList<String>(); final List<TypeInfo> allTypeInfoList = new ArrayList<TypeInfo>(); getTableScanOperatorSchemaInfo(tableScanOperator, allColumnNameList, allTypeInfoList); final List<Integer> dataColumnNums = new ArrayList<Integer>(); final int allColumnCount = allColumnNameList.size(); /* * Validate input formats of all the partitions can be vectorized. */ boolean isFirst = true; int dataColumnCount = 0; int partitionColumnCount = 0; List<String> tableDataColumnList = null; List<TypeInfo> tableDataTypeInfoList = null; LinkedHashMap<Path, ArrayList<String>> pathToAliases = mapWork.getPathToAliases(); LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = mapWork.getPathToPartitionInfo(); // Remember the input file formats we validated and why. 
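      // These sets are surfaced in EXPLAIN output via mapWork.setVectorizationInputFileFormatClassNameSet()
      // and the enabled-conditions met/not-met lists. The ImmutablePair returned by this method is
      // (vectorizable, enabledConditionsAlreadyRecorded): when the right element is true the
      // "not met" conditions have already been written to the MapWork, so validateMapWork()
      // does not overwrite them with a generic notVectorizedReason.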
      HashSet<String> inputFileFormatClassNameSet = new HashSet<String>();
      HashSet<String> enabledConditionsMetSet = new HashSet<String>();
      ArrayList<String> enabledConditionsNotMetList = new ArrayList<String>();

      for (Entry<Path, ArrayList<String>> entry: pathToAliases.entrySet()) {
        Path path = entry.getKey();
        List<String> aliases = entry.getValue();
        boolean isPresent = (aliases != null && aliases.indexOf(alias) != -1);
        if (!isPresent) {
          setOperatorIssue("Alias " + alias + " not present in aliases " + aliases);
          return new ImmutablePair<Boolean,Boolean>(false, false);
        }
        PartitionDesc partDesc = pathToPartitionInfo.get(path);
        if (partDesc.getVectorPartitionDesc() != null) {
          // We've seen this already.
          continue;
        }
        if (!verifyAndSetVectorPartDesc(partDesc, isAcidTable, inputFileFormatClassNameSet,
            enabledConditionsMetSet, enabledConditionsNotMetList)) {

          // Always set these so EXPLAIN can see.
          mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet);
          mapWork.setVectorizationEnabledConditionsMet(new ArrayList<String>(enabledConditionsMetSet));
          mapWork.setVectorizationEnabledConditionsNotMet(enabledConditionsNotMetList);

          // We consider this an enable issue, not a not vectorized issue.
          LOG.info("Cannot enable vectorization because input file format(s) " +
              inputFileFormatClassNameSet +
              " do not meet conditions " +
              VectorizationCondition.addBooleans(enabledConditionsNotMetList, false));
          return new ImmutablePair<Boolean,Boolean>(false, true);
        }
        VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();

        if (isFirst) {

          // Determine the data and partition columns using the first partition descriptor.

          LinkedHashMap<String, String> partSpec = partDesc.getPartSpec();
          if (partSpec != null && partSpec.size() > 0) {
            partitionColumnCount = partSpec.size();
            dataColumnCount = allColumnCount - partitionColumnCount;
          } else {
            partitionColumnCount = 0;
            dataColumnCount = allColumnCount;
          }

          determineDataColumnNums(tableScanOperator, allColumnNameList, dataColumnCount,
              dataColumnNums);

          tableDataColumnList = allColumnNameList.subList(0, dataColumnCount);
          tableDataTypeInfoList = allTypeInfoList.subList(0, dataColumnCount);

          isFirst = false;
        }

        // We need to get the partition's column names from the partition serde.
        // (e.g. Avro provides the table schema and ignores the partition schema..).
        //
        Deserializer deserializer;
        StructObjectInspector partObjectInspector;
        try {
          deserializer = partDesc.getDeserializer(hiveConf);
          partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
        } catch (Exception e) {
          throw new SemanticException(e);
        }
        String nextDataColumnsString = ObjectInspectorUtils.getFieldNames(partObjectInspector);
        String[] nextDataColumns = nextDataColumnsString.split(",");
        List<String> nextDataColumnList = Arrays.asList(nextDataColumns);

        /*
         * Validate the column names that are present are the same. Missing columns will be
         * implicitly defaulted to null.
         */
        if (nextDataColumnList.size() > tableDataColumnList.size()) {
          setOperatorIssue(
              String.format(
                  "Could not vectorize partition %s " +
                  "(deserializer " + deserializer.getClass().getName() + "). " +
                  "The number of partition columns %d is greater than the number of table columns %d",
                  path, nextDataColumnList.size(), tableDataColumnList.size()));
          return new ImmutablePair<Boolean,Boolean>(false, false);
        }
        if (!(deserializer instanceof NullStructSerDe)) {

          // (Don't insist NullStructSerDe produce correct column names).
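          // Positional check: the i-th partition column name must match the i-th table data
          // column name; extra table columns missing from the partition are allowed and will
          // be read as nulls, per the comment above.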
          for (int i = 0; i < nextDataColumnList.size(); i++) {
            String nextColumnName = nextDataColumnList.get(i);
            String tableColumnName = tableDataColumnList.get(i);
            if (!nextColumnName.equals(tableColumnName)) {
              setOperatorIssue(
                  String.format(
                      "Could not vectorize partition %s " +
                      "(deserializer " + deserializer.getClass().getName() + "). " +
                      "The partition column name %s does not match the table column name %s",
                      path, nextColumnName, tableColumnName));
              return new ImmutablePair<Boolean,Boolean>(false, false);
            }
          }
        }

        List<TypeInfo> nextDataTypeInfoList;
        if (vectorPartDesc.getIsInputFileFormatSelfDescribing()) {

          /*
           * Self-Describing Input Format will convert its data to the table schema.
           */
          nextDataTypeInfoList = tableDataTypeInfoList;

        } else {
          String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);

          // We convert to an array of TypeInfo using a library routine since it parses the
          // information and can handle use of different separators, etc. We cannot use the
          // raw type string for comparison in the map because of the different separators used.
          nextDataTypeInfoList =
              TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
        }

        vectorPartDesc.setDataTypeInfos(nextDataTypeInfoList);
      }

      vectorTaskColumnInfo.setAllColumnNames(allColumnNameList);
      vectorTaskColumnInfo.setAllTypeInfos(allTypeInfoList);
      vectorTaskColumnInfo.setDataColumnNums(dataColumnNums);
      vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount);
      vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat);

      // Always set these so EXPLAIN can see.
      mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet);
      mapWork.setVectorizationEnabledConditionsMet(new ArrayList<String>(enabledConditionsMetSet));
      mapWork.setVectorizationEnabledConditionsNotMet(enabledConditionsNotMetList);

      return new ImmutablePair<Boolean,Boolean>(true, false);
    }

    private boolean validateMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo,
        boolean isTezOrSpark) throws SemanticException {

      LOG.info("Validating MapWork...");

      ImmutablePair<String,TableScanOperator> onlyOneTableScanPair =
          verifyOnlyOneTableScanOperator(mapWork);
      if (onlyOneTableScanPair == null) {
        VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason();
        Preconditions.checkState(notVectorizedReason != null);
        mapWork.setVectorizationEnabledConditionsNotMet(
            Arrays.asList(new String[] {notVectorizedReason.toString()}));
        return false;
      }
      String alias = onlyOneTableScanPair.left;
      TableScanOperator tableScanOperator = onlyOneTableScanPair.right;

      // This call fills in the column names, types, and partition column count in
      // vectorTaskColumnInfo.
      currentOperator = tableScanOperator;
      ImmutablePair<Boolean, Boolean> validateInputFormatAndSchemaEvolutionPair =
          validateInputFormatAndSchemaEvolution(mapWork, alias, tableScanOperator,
              vectorTaskColumnInfo);
      if (!validateInputFormatAndSchemaEvolutionPair.left) {
        // Have we already set the enabled conditions not met?
        if (!validateInputFormatAndSchemaEvolutionPair.right) {
          VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason();
          Preconditions.checkState(notVectorizedReason != null);
          mapWork.setVectorizationEnabledConditionsNotMet(
              Arrays.asList(new String[] {notVectorizedReason.toString()}));
        }
        return false;
      }

      // Now we are enabled and any issues found from here on out are considered
      // not vectorized issues.
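      // From this point failures are reported through setNotVectorizedReason() on the vertex
      // (via setNodeIssue/setOperatorIssue/setExpressionIssue) rather than through the
      // "enabled conditions not met" list.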
mapWork.setVectorizationEnabled(true); Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(mapWork, isTezOrSpark); addMapWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new DefaultGraphWalker(disp); // iterator the mapper operator tree ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(mapWork.getAliasToWork().values()); HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>(); ogw.startWalking(topNodes, nodeOutput); for (Node n : nodeOutput.keySet()) { if (nodeOutput.get(n) != null) { if (!((Boolean)nodeOutput.get(n)).booleanValue()) { return false; } } } vectorTaskColumnInfo.setNonVectorizedOps(vnp.getNonVectorizedOps()); return true; } private void vectorizeMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTezOrSpark) throws SemanticException { LOG.info("Vectorizing MapWork..."); mapWork.setVectorMode(true); Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); MapWorkVectorizationNodeProcessor vnp = new MapWorkVectorizationNodeProcessor(mapWork, isTezOrSpark, vectorTaskColumnInfo); addMapWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderOnceWalker(disp); // iterator the mapper operator tree ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(mapWork.getAliasToWork().values()); HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>(); ogw.startWalking(topNodes, nodeOutput); vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames()); vectorTaskColumnInfo.transferToBaseWork(mapWork); if (LOG.isDebugEnabled()) { debugDisplayAllMaps(mapWork); } return; } private void setReduceWorkExplainConditions(ReduceWork reduceWork) { reduceWork.setVectorizationExamined(true); reduceWork.setReduceVectorizationEnabled(isReduceVectorizationEnabled); reduceWork.setVectorReduceEngine( HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE)); } private void convertReduceWork(ReduceWork reduceWork) throws SemanticException { // Global used when setting errors, etc. 
currentBaseWork = reduceWork; currentBaseWork.setVectorizationEnabled(true); VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo(); vectorTaskColumnInfo.assume(); reduceWork.setVectorizedVertexNum(++vectorizedVertexNum); boolean ret; try { ret = validateReduceWork(reduceWork, vectorTaskColumnInfo); } catch (Exception e) { String issue = "exception: " + VectorizationContext.getStackTraceAsSingleLine(e); setNodeIssue(issue); ret = false; } if (ret) { vectorizeReduceWork(reduceWork, vectorTaskColumnInfo); } else if (currentBaseWork.getVectorizationEnabled()) { VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason(); if (notVectorizedReason == null) { LOG.info("Cannot vectorize: unknown"); } else { LOG.info("Cannot vectorize: " + notVectorizedReason.toString()); } clearReduceWorkVectorDescs(reduceWork); } } private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork, VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException { ArrayList<String> reduceColumnNames = new ArrayList<String>(); ArrayList<TypeInfo> reduceTypeInfos = new ArrayList<TypeInfo>(); if (reduceWork.getNeedsTagging()) { setNodeIssue("Tagging not supported"); return false; } String columnSortOrder; String columnNullOrder; try { TableDesc keyTableDesc = reduceWork.getKeyDesc(); if (LOG.isDebugEnabled()) { LOG.debug("Using reduce tag " + reduceWork.getTag()); } TableDesc valueTableDesc = reduceWork.getTagToValueDesc().get(reduceWork.getTag()); Properties keyTableProperties = keyTableDesc.getProperties(); Deserializer keyDeserializer = ReflectionUtils.newInstance( keyTableDesc.getDeserializerClass(), null); SerDeUtils.initializeSerDe(keyDeserializer, null, keyTableProperties, null); ObjectInspector keyObjectInspector = keyDeserializer.getObjectInspector(); if (keyObjectInspector == null) { setNodeIssue("Key object inspector null"); return false; } if (!(keyObjectInspector instanceof StructObjectInspector)) { setNodeIssue("Key object inspector not StructObjectInspector"); return false; } StructObjectInspector keyStructObjectInspector = (StructObjectInspector) keyObjectInspector; List<? extends StructField> keyFields = keyStructObjectInspector.getAllStructFieldRefs(); for (StructField field: keyFields) { reduceColumnNames.add(Utilities.ReduceField.KEY.toString() + "." + field.getFieldName()); reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName())); } columnSortOrder = keyTableProperties.getProperty(serdeConstants.SERIALIZATION_SORT_ORDER); columnNullOrder = keyTableProperties.getProperty(serdeConstants.SERIALIZATION_NULL_SORT_ORDER); Deserializer valueDeserializer = ReflectionUtils.newInstance( valueTableDesc.getDeserializerClass(), null); SerDeUtils.initializeSerDe(valueDeserializer, null, valueTableDesc.getProperties(), null); ObjectInspector valueObjectInspector = valueDeserializer.getObjectInspector(); if (valueObjectInspector != null) { if (!(valueObjectInspector instanceof StructObjectInspector)) { setNodeIssue("Value object inspector not StructObjectInspector"); return false; } StructObjectInspector valueStructObjectInspector = (StructObjectInspector) valueObjectInspector; List<? extends StructField> valueFields = valueStructObjectInspector.getAllStructFieldRefs(); for (StructField field: valueFields) { reduceColumnNames.add(Utilities.ReduceField.VALUE.toString() + "." 
+ field.getFieldName()); reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName())); } } } catch (Exception e) { throw new SemanticException(e); } vectorTaskColumnInfo.setAllColumnNames(reduceColumnNames); vectorTaskColumnInfo.setAllTypeInfos(reduceTypeInfos); vectorTaskColumnInfo.setReduceColumnSortOrder(columnSortOrder); vectorTaskColumnInfo.setReduceColumnNullOrder(columnNullOrder); return true; } private void addReduceWorkRules(Map<Rule, NodeProcessor> opRules, NodeProcessor np) { opRules.put(new RuleRegExp("R1", GroupByOperator.getOperatorName() + ".*"), np); opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + ".*"), np); } private boolean validateReduceWork(ReduceWork reduceWork, VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException { LOG.info("Validating ReduceWork..."); // Validate input to ReduceWork. if (!getOnlyStructObjectInspectors(reduceWork, vectorTaskColumnInfo)) { return false; } // Now check the reduce operator tree. Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); ReduceWorkValidationNodeProcessor vnp = new ReduceWorkValidationNodeProcessor(); addReduceWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new DefaultGraphWalker(disp); // iterator the reduce operator tree ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.add(reduceWork.getReducer()); HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>(); ogw.startWalking(topNodes, nodeOutput); for (Node n : nodeOutput.keySet()) { if (nodeOutput.get(n) != null) { if (!((Boolean)nodeOutput.get(n)).booleanValue()) { return false; } } } vectorTaskColumnInfo.setNonVectorizedOps(vnp.getNonVectorizedOps()); return true; } private void vectorizeReduceWork(ReduceWork reduceWork, VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException { LOG.info("Vectorizing ReduceWork..."); reduceWork.setVectorMode(true); // For some reason, the DefaultGraphWalker does not descend down from the reducer Operator as // expected. We need to descend down, otherwise it breaks our algorithm that determines // VectorizationContext... Do we use PreOrderWalker instead of DefaultGraphWalker. Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); ReduceWorkVectorizationNodeProcessor vnp = new ReduceWorkVectorizationNodeProcessor(vectorTaskColumnInfo); addReduceWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new PreOrderWalker(disp); // iterator the reduce operator tree ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.add(reduceWork.getReducer()); LOG.info("vectorizeReduceWork reducer Operator: " + reduceWork.getReducer().getName() + "..."); HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>(); ogw.startWalking(topNodes, nodeOutput); // Necessary since we are vectorizing the root operator in reduce. reduceWork.setReducer(vnp.getRootVectorOp()); vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames()); vectorTaskColumnInfo.transferToBaseWork(reduceWork); if (LOG.isDebugEnabled()) { debugDisplayAllMaps(reduceWork); } } class ClearVectorDescsNodeProcessor implements NodeProcessor { public ClearVectorDescsNodeProcessor() { } @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { for (Node n : stack) { Operator<? extends OperatorDesc> op = (Operator<? 
extends OperatorDesc>) n; OperatorDesc desc = op.getConf(); if (desc instanceof AbstractOperatorDesc) { AbstractOperatorDesc abstractDesc = (AbstractOperatorDesc) desc; abstractDesc.setVectorDesc(null); } } return null; } } private void clearMapWorkVectorDescs(MapWork mapWork) throws SemanticException { Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); ClearVectorDescsNodeProcessor vnp = new ClearVectorDescsNodeProcessor(); addMapWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new DefaultGraphWalker(disp); ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(mapWork.getAliasToWork().values()); ogw.startWalking(topNodes, null); } private void clearReduceWorkVectorDescs(ReduceWork reduceWork) throws SemanticException { Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); ClearVectorDescsNodeProcessor vnp = new ClearVectorDescsNodeProcessor(); addReduceWorkRules(opRules, vnp); Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null); GraphWalker ogw = new DefaultGraphWalker(disp); ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.add(reduceWork.getReducer()); ogw.startWalking(topNodes, null); } } class MapWorkValidationNodeProcessor implements NodeProcessor { private final MapWork mapWork; private final boolean isTezOrSpark; // Children of Vectorized GROUPBY that outputs rows instead of vectorized row batchs. protected final Set<Operator<? extends OperatorDesc>> nonVectorizedOps = new HashSet<Operator<? extends OperatorDesc>>(); public Set<Operator<? extends OperatorDesc>> getNonVectorizedOps() { return nonVectorizedOps; } public MapWorkValidationNodeProcessor(MapWork mapWork, boolean isTezOrSpark) { this.mapWork = mapWork; this.isTezOrSpark = isTezOrSpark; } @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { for (Node n : stack) { Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) n; if (nonVectorizedOps.contains(op)) { return new Boolean(true); } boolean ret; currentOperator = op; try { ret = validateMapWorkOperator(op, mapWork, isTezOrSpark); } catch (Exception e) { throw new SemanticException(e); } if (!ret) { return new Boolean(false); } // When Vectorized GROUPBY outputs rows instead of vectorized row batches, we don't // vectorize the operators below it. if (isVectorizedGroupByThatOutputsRows(op)) { addOperatorChildrenToSet(op, nonVectorizedOps); return new Boolean(true); } } return new Boolean(true); } } class ReduceWorkValidationNodeProcessor implements NodeProcessor { // Children of Vectorized GROUPBY that outputs rows instead of vectorized row batchs. protected final Set<Operator<? extends OperatorDesc>> nonVectorizedOps = new HashSet<Operator<? extends OperatorDesc>>(); public Set<Operator<? extends OperatorDesc>> getNonVectorizeOps() { return nonVectorizedOps; } public Set<Operator<? extends OperatorDesc>> getNonVectorizedOps() { return nonVectorizedOps; } @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { for (Node n : stack) { Operator<? extends OperatorDesc> op = (Operator<? 
extends OperatorDesc>) n; if (nonVectorizedOps.contains(op)) { return new Boolean(true); } currentOperator = op; boolean ret = validateReduceWorkOperator(op); if (!ret) { return new Boolean(false); } // When Vectorized GROUPBY outputs rows instead of vectorized row batches, we don't // vectorize the operators below it. if (isVectorizedGroupByThatOutputsRows(op)) { addOperatorChildrenToSet(op, nonVectorizedOps); return new Boolean(true); } } return new Boolean(true); } } // This class has common code used by both MapWorkVectorizationNodeProcessor and // ReduceWorkVectorizationNodeProcessor. class VectorizationNodeProcessor implements NodeProcessor { // The vectorization context for the Map or Reduce task. protected VectorizationContext taskVectorizationContext; protected final VectorTaskColumnInfo vectorTaskColumnInfo; protected final Set<Operator<? extends OperatorDesc>> nonVectorizedOps; VectorizationNodeProcessor(VectorTaskColumnInfo vectorTaskColumnInfo, Set<Operator<? extends OperatorDesc>> nonVectorizedOps) { this.vectorTaskColumnInfo = vectorTaskColumnInfo; this.nonVectorizedOps = nonVectorizedOps; } public String[] getVectorScratchColumnTypeNames() { return taskVectorizationContext.getScratchColumnTypeNames(); } protected final Set<Operator<? extends OperatorDesc>> opsDone = new HashSet<Operator<? extends OperatorDesc>>(); protected final Map<Operator<? extends OperatorDesc>, Operator<? extends OperatorDesc>> opToVectorOpMap = new HashMap<Operator<? extends OperatorDesc>, Operator<? extends OperatorDesc>>(); public VectorizationContext walkStackToFindVectorizationContext(Stack<Node> stack, Operator<? extends OperatorDesc> op) throws SemanticException { VectorizationContext vContext = null; if (stack.size() <= 1) { throw new SemanticException( String.format("Expected operator stack for operator %s to have at least 2 operators", op.getName())); } // Walk down the stack of operators until we found one willing to give us a context. // At the bottom will be the root operator, guaranteed to have a context int i= stack.size()-2; while (vContext == null) { if (i < 0) { return null; } Operator<? extends OperatorDesc> opParent = (Operator<? extends OperatorDesc>) stack.get(i); Operator<? extends OperatorDesc> vectorOpParent = opToVectorOpMap.get(opParent); if (vectorOpParent != null) { if (vectorOpParent instanceof VectorizationContextRegion) { VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOpParent; vContext = vcRegion.getOuputVectorizationContext(); LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " has new vectorization context " + vContext.toString()); } else { LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " does not have new vectorization context"); } } else { LOG.info("walkStackToFindVectorizationContext " + opParent.getName() + " is not vectorized"); } --i; } return vContext; } public Operator<? extends OperatorDesc> doVectorize(Operator<? extends OperatorDesc> op, VectorizationContext vContext, boolean isTezOrSpark) throws SemanticException { Operator<? extends OperatorDesc> vectorOp = op; try { if (!opsDone.contains(op)) { vectorOp = vectorizeOperator(op, vContext, isTezOrSpark, vectorTaskColumnInfo); opsDone.add(op); if (vectorOp != op) { opToVectorOpMap.put(op, vectorOp); opsDone.add(vectorOp); } } } catch (HiveException e) { throw new SemanticException(e); } return vectorOp; } @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { throw new SemanticException("Must be overridden"); } } class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { private final VectorTaskColumnInfo vectorTaskColumnInfo; private final boolean isTezOrSpark; public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTezOrSpark, VectorTaskColumnInfo vectorTaskColumnInfo) { super(vectorTaskColumnInfo, vectorTaskColumnInfo.getNonVectorizedOps()); this.vectorTaskColumnInfo = vectorTaskColumnInfo; this.isTezOrSpark = isTezOrSpark; } @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd; if (nonVectorizedOps.contains(op)) { return null; } VectorizationContext vContext = null; currentOperator = op; if (op instanceof TableScanOperator) { if (taskVectorizationContext == null) { taskVectorizationContext = getVectorizationContext(op.getName(), vectorTaskColumnInfo); if (LOG.isInfoEnabled()) { LOG.info("MapWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " mapColumnNames " + vectorTaskColumnInfo.allColumnNames.toString()); LOG.info("MapWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " mapTypeInfos " + vectorTaskColumnInfo.allTypeInfos.toString()); } } vContext = taskVectorizationContext; } else { LOG.debug("MapWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName()); vContext = walkStackToFindVectorizationContext(stack, op); if (vContext == null) { // No operator has "pushed" a new context -- so use the task vectorization context. vContext = taskVectorizationContext; } } assert vContext != null; if (LOG.isDebugEnabled()) { LOG.debug("MapWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString()); } Operator<? extends OperatorDesc> vectorOp = doVectorize(op, vContext, isTezOrSpark); if (LOG.isDebugEnabled()) { if (vectorOp instanceof VectorizationContextRegion) { VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext(); LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString()); } } return null; } } class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor { private final VectorTaskColumnInfo vectorTaskColumnInfo; private Operator<? extends OperatorDesc> rootVectorOp; public Operator<? extends OperatorDesc> getRootVectorOp() { return rootVectorOp; } public ReduceWorkVectorizationNodeProcessor(VectorTaskColumnInfo vectorTaskColumnInfo) { super(vectorTaskColumnInfo, vectorTaskColumnInfo.getNonVectorizedOps()); this.vectorTaskColumnInfo = vectorTaskColumnInfo; rootVectorOp = null; } @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { Operator<? extends OperatorDesc> op = (Operator<? 
extends OperatorDesc>) nd; if (nonVectorizedOps.contains(op)) { return null; } VectorizationContext vContext = null; boolean saveRootVectorOp = false; currentOperator = op; if (op.getParentOperators().size() == 0) { if (LOG.isInfoEnabled()) { LOG.info("ReduceWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " reduceColumnNames " + vectorTaskColumnInfo.allColumnNames.toString()); LOG.info("ReduceWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " reduceTypeInfos " + vectorTaskColumnInfo.allTypeInfos.toString()); } vContext = new VectorizationContext("__Reduce_Shuffle__", vectorTaskColumnInfo.allColumnNames, hiveConf); taskVectorizationContext = vContext; saveRootVectorOp = true; if (LOG.isDebugEnabled()) { LOG.debug("Vectorized ReduceWork reduce shuffle vectorization context " + vContext.toString()); } } else { LOG.info("ReduceWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName()); vContext = walkStackToFindVectorizationContext(stack, op); if (vContext == null) { // If we didn't find a context among the operators, assume the top -- reduce shuffle's // vectorization context. vContext = taskVectorizationContext; } } assert vContext != null; LOG.info("ReduceWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString()); Operator<? extends OperatorDesc> vectorOp = doVectorize(op, vContext, true); if (LOG.isDebugEnabled()) { if (vectorOp instanceof VectorizationContextRegion) { VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp; VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext(); LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString()); } } if (saveRootVectorOp && op != vectorOp) { rootVectorOp = vectorOp; } return null; } } private static class ValidatorVectorizationContext extends VectorizationContext { private ValidatorVectorizationContext(HiveConf hiveConf) { super("No Name", hiveConf); } @Override protected int getInputColumnIndex(String name) { return 0; } @Override protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) { return 0; } } @Override public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException { hiveConf = physicalContext.getConf(); boolean vectorPath = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED); if (!vectorPath) { LOG.info("Vectorization is disabled"); return physicalContext; } useVectorizedInputFileFormat = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT); useVectorDeserialize = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE); useRowDeserialize = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE); // TODO: we could also vectorize some formats based on hive.llap.io.encode.formats if LLAP IO // is enabled and we are going to run in LLAP. However, we don't know if we end up in // LLAP or not at this stage, so don't do this now. We may need to add a 'force' option. 
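// Illustrative sketch (not part of this resolver): the vectorization switches read in this
// method can be toggled programmatically with the same ConfVars, e.g. in a test:
//   HiveConf conf = new HiveConf();
//   conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
//   conf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE, false);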
isReduceVectorizationEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED); isSchemaEvolution = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_SCHEMA_EVOLUTION); hiveVectorAdaptorUsageMode = HiveVectorAdaptorUsageMode.getHiveConfValue(hiveConf); // create dispatcher and graph walker Dispatcher disp = new VectorizationDispatcher(); TaskGraphWalker ogw = new TaskGraphWalker(disp); // get all the tasks nodes from root task ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(physicalContext.getRootTasks()); // begin to walk through the task tree. ogw.startWalking(topNodes, null); return physicalContext; } private void setOperatorNotSupported(Operator<? extends OperatorDesc> op) { OperatorDesc desc = op.getConf(); Annotation note = AnnotationUtils.getAnnotation(desc.getClass(), Explain.class); if (note != null) { Explain explainNote = (Explain) note; setNodeIssue(explainNote.displayName() + " (" + op.getType() + ") not supported"); } else { setNodeIssue("Operator " + op.getType() + " not supported"); } } boolean validateMapWorkOperator(Operator<? extends OperatorDesc> op, MapWork mWork, boolean isTezOrSpark) { boolean ret; switch (op.getType()) { case MAPJOIN: if (op instanceof MapJoinOperator) { ret = validateMapJoinOperator((MapJoinOperator) op); } else if (op instanceof SMBMapJoinOperator) { ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op); } else { setOperatorNotSupported(op); ret = false; } break; case GROUPBY: ret = validateGroupByOperator((GroupByOperator) op, false, isTezOrSpark); break; case FILTER: ret = validateFilterOperator((FilterOperator) op); break; case SELECT: ret = validateSelectOperator((SelectOperator) op); break; case REDUCESINK: ret = validateReduceSinkOperator((ReduceSinkOperator) op); break; case TABLESCAN: ret = validateTableScanOperator((TableScanOperator) op, mWork); break; case FILESINK: case LIMIT: case EVENT: case SPARKPRUNINGSINK: ret = true; break; case HASHTABLESINK: ret = op instanceof SparkHashTableSinkOperator && validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op); break; default: setOperatorNotSupported(op); ret = false; break; } return ret; } boolean validateReduceWorkOperator(Operator<? extends OperatorDesc> op) { boolean ret; switch (op.getType()) { case MAPJOIN: // Does MAPJOIN actually get planned in Reduce? 
if (op instanceof MapJoinOperator) { ret = validateMapJoinOperator((MapJoinOperator) op); } else if (op instanceof SMBMapJoinOperator) { ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op); } else { setOperatorNotSupported(op); ret = false; } break; case GROUPBY: if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED)) { ret = validateGroupByOperator((GroupByOperator) op, true, true); } else { setNodeIssue("Operator " + op.getType() + " not enabled (" + HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED.name() + "=true IS false)"); ret = false; } break; case FILTER: ret = validateFilterOperator((FilterOperator) op); break; case SELECT: ret = validateSelectOperator((SelectOperator) op); break; case REDUCESINK: ret = validateReduceSinkOperator((ReduceSinkOperator) op); break; case FILESINK: ret = validateFileSinkOperator((FileSinkOperator) op); break; case LIMIT: case EVENT: case SPARKPRUNINGSINK: ret = true; break; case HASHTABLESINK: ret = op instanceof SparkHashTableSinkOperator && validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op); break; default: setOperatorNotSupported(op); ret = false; break; } return ret; } private void addOperatorChildrenToSet(Operator<? extends OperatorDesc> op, Set<Operator<? extends OperatorDesc>> nonVectorizedOps) { for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) { if (!nonVectorizedOps.contains(childOp)) { nonVectorizedOps.add(childOp); addOperatorChildrenToSet(childOp, nonVectorizedOps); } } } // When Vectorized GROUPBY outputs rows instead of vectorized row batchs, we don't // vectorize the operators below it. private Boolean isVectorizedGroupByThatOutputsRows(Operator<? extends OperatorDesc> op) throws SemanticException { if (op.getType().equals(OperatorType.GROUPBY)) { GroupByDesc desc = (GroupByDesc) op.getConf(); return !((VectorGroupByDesc) desc.getVectorDesc()).isVectorOutput(); } return false; } private boolean validateSMBMapJoinOperator(SMBMapJoinOperator op) { SMBJoinDesc desc = op.getConf(); // Validation is the same as for map join, since the 'small' tables are not vectorized return validateMapJoinDesc(desc); } private boolean validateTableScanOperator(TableScanOperator op, MapWork mWork) { TableScanDesc desc = op.getConf(); if (desc.isGatherStats()) { setOperatorIssue("gather stats not supported"); return false; } return true; } private boolean validateMapJoinOperator(MapJoinOperator op) { MapJoinDesc desc = op.getConf(); return validateMapJoinDesc(desc); } private boolean validateMapJoinDesc(MapJoinDesc desc) { byte posBigTable = (byte) desc.getPosBigTable(); List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable); if (!validateExprNodeDesc(filterExprs, "Filter", VectorExpressionDescriptor.Mode.FILTER)) { return false; } List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable); if (!validateExprNodeDesc(keyExprs, "Key")) { return false; } List<ExprNodeDesc> valueExprs = desc.getExprs().get(posBigTable); if (!validateExprNodeDesc(valueExprs, "Value")) { return false; } Byte[] order = desc.getTagOrder(); Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? 
order[1] : order[0]); List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable); if (!validateExprNodeDesc(smallTableExprs, "Small Table")) { return false; } if (desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) { LOG.info("Cannot vectorize outer join with complex ON clause"); return false; } return true; } private boolean validateSparkHashTableSinkOperator(SparkHashTableSinkOperator op) { SparkHashTableSinkDesc desc = op.getConf(); byte tag = desc.getTag(); // it's essentially a MapJoinDesc List<ExprNodeDesc> filterExprs = desc.getFilters().get(tag); List<ExprNodeDesc> keyExprs = desc.getKeys().get(tag); List<ExprNodeDesc> valueExprs = desc.getExprs().get(tag); return validateExprNodeDesc(filterExprs, "Filter", VectorExpressionDescriptor.Mode.FILTER) && validateExprNodeDesc(keyExprs, "Key") && validateExprNodeDesc(valueExprs, "Value"); } private boolean validateReduceSinkOperator(ReduceSinkOperator op) { List<ExprNodeDesc> keyDescs = op.getConf().getKeyCols(); List<ExprNodeDesc> partitionDescs = op.getConf().getPartitionCols(); List<ExprNodeDesc> valueDesc = op.getConf().getValueCols(); return validateExprNodeDesc(keyDescs, "Key") && validateExprNodeDesc(partitionDescs, "Partition") && validateExprNodeDesc(valueDesc, "Value"); } private boolean validateSelectOperator(SelectOperator op) { List<ExprNodeDesc> descList = op.getConf().getColList(); for (ExprNodeDesc desc : descList) { boolean ret = validateExprNodeDesc(desc, "Select"); if (!ret) { return false; } } return true; } private boolean validateFilterOperator(FilterOperator op) { ExprNodeDesc desc = op.getConf().getPredicate(); return validateExprNodeDesc(desc, "Predicate", VectorExpressionDescriptor.Mode.FILTER); } private boolean validateGroupByOperator(GroupByOperator op, boolean isReduce, boolean isTezOrSpark) { GroupByDesc desc = op.getConf(); if (desc.getMode() != GroupByDesc.Mode.HASH && desc.isDistinct()) { setOperatorIssue("DISTINCT not supported"); return false; } boolean ret = validateExprNodeDesc(desc.getKeys(), "Key"); if (!ret) { return false; } /** * * GROUP BY DEFINITIONS: * * GroupByDesc.Mode enumeration: * * The different modes of a GROUP BY operator. * * These descriptions are hopefully less cryptic than the comments for GroupByDesc.Mode. * * COMPLETE Aggregates original rows into full aggregation row(s). * * If the key length is 0, this is also called Global aggregation and * 1 output row is produced. * * When the key length is > 0, the original rows come in ALREADY GROUPED. * * An example for key length > 0 is a GROUP BY being applied to the * ALREADY GROUPED rows coming from an upstream JOIN operator. Or, * ALREADY GROUPED rows coming from upstream MERGEPARTIAL GROUP BY * operator. * * PARTIAL1 The first of 2 (or more) phases that aggregates ALREADY GROUPED * original rows into partial aggregations. * * Subsequent phases PARTIAL2 (optional) and MERGEPARTIAL will merge * the partial aggregations and output full aggregations. * * PARTIAL2 Accept ALREADY GROUPED partial aggregations and merge them into another * partial aggregation. Output the merged partial aggregations. * * (Haven't seen this one used) * * PARTIALS (Behaves for non-distinct the same as PARTIAL2; and behaves for * distinct the same as PARTIAL1.) * * FINAL Accept ALREADY GROUPED original rows and aggregate them into * full aggregations. * * Example is a GROUP BY being applied to rows from a sorted table, where * the group key is the table sort key (or a prefix). 
 *
 * HASH           Accept UNORDERED original rows and aggregate them into a memory table.
 *                Output the partial aggregations on closeOp (or low memory).
 *
 *                Similar to PARTIAL1 except original rows are UNORDERED.
 *
 *                Commonly used in both Mapper and Reducer nodes.  Always followed by
 *                a Reducer with MERGEPARTIAL GROUP BY.
 *
 * MERGEPARTIAL   Always first operator of a Reducer.  Data is grouped by reduce-shuffle.
 *
 *                (Behaves for non-distinct aggregations the same as FINAL; and behaves
 *                for distinct aggregations the same as COMPLETE.)
 *
 *                The output is full aggregation(s).
 *
 *                Used in Reducers after a stage with a HASH GROUP BY operator.
 *
 *
 * VectorGroupByDesc.ProcessingMode for VectorGroupByOperator:
 *
 * GLOBAL         No key.  All rows --> 1 full aggregation on end of input.
 *
 * HASH           Rows aggregated into a hash table on group key -->
 *                1 partial aggregation per key (normally, unless there is spilling).
 *
 * MERGE_PARTIAL  As first operator in a REDUCER, partial aggregations come grouped from
 *                reduce-shuffle -->
 *                aggregate the partial aggregations and emit full aggregation on
 *                endGroup / closeOp.
 *
 * STREAMING      Rows come from PARENT operator ALREADY GROUPED -->
 *                aggregate the rows and emit full aggregation on key change / closeOp.
 *
 * NOTE: Hash can spill partial result rows prematurely if it runs low on memory.
 * NOTE: Streaming has to compare keys where MergePartial gets an endGroup call.
 *
 *
 * DECIDER: Which VectorGroupByDesc.ProcessingMode for VectorGroupByOperator?
 *
 * Decides using GroupByDesc.Mode and whether there are keys with the
 * VectorGroupByDesc.groupByDescModeToVectorProcessingMode method.
 *
 * Mode.COMPLETE      --> (numKeys == 0 ? ProcessingMode.GLOBAL : ProcessingMode.STREAMING)
 *
 * Mode.HASH          --> ProcessingMode.HASH
 *
 * Mode.MERGEPARTIAL  --> (numKeys == 0 ? ProcessingMode.GLOBAL : ProcessingMode.MERGE_PARTIAL)
 *
 * Mode.PARTIAL1,
 * Mode.PARTIAL2,
 * Mode.PARTIALS,
 * Mode.FINAL         --> ProcessingMode.STREAMING
 *
 */
boolean hasKeys = (desc.getKeys().size() > 0);
ProcessingMode processingMode =
    VectorGroupByDesc.groupByDescModeToVectorProcessingMode(desc.getMode(), hasKeys);
if (desc.isGroupingSetsPresent() &&
    (processingMode != ProcessingMode.HASH && processingMode != ProcessingMode.STREAMING)) {
  LOG.info("Vectorized GROUPING SETS only expected for HASH and STREAMING processing modes");
  return false;
}
Pair<Boolean,Boolean> retPair =
    validateAggregationDescs(desc.getAggregators(), processingMode, hasKeys);
if (!retPair.left) {
  return false;
}
// If all the aggregation outputs are primitive, we can output VectorizedRowBatch.
// Otherwise, the rest of the operator tree will run in row mode.
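// Illustrative walk-through of the DECIDER mapping above (assumes a simple
// "SELECT key, COUNT(*) FROM t GROUP BY key" plan; the plan shape is an assumption, not
// taken from this file):
//   map side:    GroupByDesc.Mode.HASH,         hasKeys == true  --> ProcessingMode.HASH
//   reduce side: GroupByDesc.Mode.MERGEPARTIAL, hasKeys == true  --> ProcessingMode.MERGE_PARTIAL
//   global aggregation (no GROUP BY key): MERGEPARTIAL, hasKeys == false --> ProcessingMode.GLOBAL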
VectorGroupByDesc vectorDesc = new VectorGroupByDesc(); desc.setVectorDesc(vectorDesc); vectorDesc.setVectorOutput(retPair.right); vectorDesc.setProcessingMode(processingMode); LOG.info("Vector GROUP BY operator will use processing mode " + processingMode.name() + ", isVectorOutput " + vectorDesc.isVectorOutput()); return true; } private boolean validateFileSinkOperator(FileSinkOperator op) { return true; } private boolean validateExprNodeDesc(List<ExprNodeDesc> descs, String expressionTitle) { return validateExprNodeDesc(descs, expressionTitle, VectorExpressionDescriptor.Mode.PROJECTION); } private boolean validateExprNodeDesc(List<ExprNodeDesc> descs, String expressionTitle, VectorExpressionDescriptor.Mode mode) { for (ExprNodeDesc d : descs) { boolean ret = validateExprNodeDesc(d, expressionTitle, mode); if (!ret) { return false; } } return true; } private Pair<Boolean,Boolean> validateAggregationDescs(List<AggregationDesc> descs, ProcessingMode processingMode, boolean hasKeys) { boolean outputIsPrimitive = true; for (AggregationDesc d : descs) { Pair<Boolean,Boolean> retPair = validateAggregationDesc(d, processingMode, hasKeys); if (!retPair.left) { return retPair; } if (!retPair.right) { outputIsPrimitive = false; } } return new Pair<Boolean, Boolean>(true, outputIsPrimitive); } private boolean validateExprNodeDescRecursive(ExprNodeDesc desc, String expressionTitle, VectorExpressionDescriptor.Mode mode) { if (desc instanceof ExprNodeColumnDesc) { ExprNodeColumnDesc c = (ExprNodeColumnDesc) desc; // Currently, we do not support vectorized virtual columns (see HIVE-5570). if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(c.getColumn())) { setExpressionIssue(expressionTitle, "Virtual columns not supported (" + c.getColumn() + ")"); return false; } } String typeName = desc.getTypeInfo().getTypeName(); boolean ret = validateDataType(typeName, mode); if (!ret) { setExpressionIssue(expressionTitle, "Data type " + typeName + " of " + desc.toString() + " not supported"); return false; } boolean isInExpression = false; if (desc instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc d = (ExprNodeGenericFuncDesc) desc; boolean r = validateGenericUdf(d); if (!r) { setExpressionIssue(expressionTitle, "UDF " + d + " not supported"); return false; } GenericUDF genericUDF = d.getGenericUDF(); isInExpression = (genericUDF instanceof GenericUDFIn); } if (desc.getChildren() != null) { if (isInExpression && desc.getChildren().get(0).getTypeInfo().getCategory() == Category.STRUCT) { // Don't restrict child expressions for projection. // Always use loose FILTER mode. if (!validateStructInExpression(desc, expressionTitle, VectorExpressionDescriptor.Mode.FILTER)) { return false; } } else { for (ExprNodeDesc d : desc.getChildren()) { // Don't restrict child expressions for projection. // Always use loose FILTER mode. 
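// Illustrative example (hypothetical expression, not from this file): for a projected
// expression such as NOT (a > 10), the child (a > 10) is re-validated here with the looser
// FILTER mode rather than PROJECTION mode, per the comment above, so a valid child does not
// fail the stricter projection-mode checks.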
if (!validateExprNodeDescRecursive(d, expressionTitle, VectorExpressionDescriptor.Mode.FILTER)) { return false; } } } } return true; } private boolean validateStructInExpression(ExprNodeDesc desc, String expressionTitle, VectorExpressionDescriptor.Mode mode) { for (ExprNodeDesc d : desc.getChildren()) { TypeInfo typeInfo = d.getTypeInfo(); if (typeInfo.getCategory() != Category.STRUCT) { return false; } StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; ArrayList<TypeInfo> fieldTypeInfos = structTypeInfo .getAllStructFieldTypeInfos(); ArrayList<String> fieldNames = structTypeInfo.getAllStructFieldNames(); final int fieldCount = fieldTypeInfos.size(); for (int f = 0; f < fieldCount; f++) { TypeInfo fieldTypeInfo = fieldTypeInfos.get(f); Category category = fieldTypeInfo.getCategory(); if (category != Category.PRIMITIVE) { setExpressionIssue(expressionTitle, "Cannot vectorize struct field " + fieldNames.get(f) + " of type " + fieldTypeInfo.getTypeName()); return false; } PrimitiveTypeInfo fieldPrimitiveTypeInfo = (PrimitiveTypeInfo) fieldTypeInfo; InConstantType inConstantType = VectorizationContext .getInConstantTypeFromPrimitiveCategory(fieldPrimitiveTypeInfo .getPrimitiveCategory()); // For now, limit the data types we support for Vectorized Struct IN(). if (inConstantType != InConstantType.INT_FAMILY && inConstantType != InConstantType.FLOAT_FAMILY && inConstantType != InConstantType.STRING_FAMILY) { setExpressionIssue(expressionTitle, "Cannot vectorize struct field " + fieldNames.get(f) + " of type " + fieldTypeInfo.getTypeName()); return false; } } } return true; } private boolean validateExprNodeDesc(ExprNodeDesc desc, String expressionTitle) { return validateExprNodeDesc(desc, expressionTitle, VectorExpressionDescriptor.Mode.PROJECTION); } boolean validateExprNodeDesc(ExprNodeDesc desc, String expressionTitle, VectorExpressionDescriptor.Mode mode) { if (!validateExprNodeDescRecursive(desc, expressionTitle, mode)) { return false; } try { VectorizationContext vc = new ValidatorVectorizationContext(hiveConf); if (vc.getVectorExpression(desc, mode) == null) { // TODO: this cannot happen - VectorizationContext throws in such cases. setExpressionIssue(expressionTitle, "getVectorExpression returned null"); return false; } } catch (Exception e) { if (e instanceof HiveException) { setExpressionIssue(expressionTitle, e.getMessage()); } else { String issue = "exception: " + VectorizationContext.getStackTraceAsSingleLine(e); setExpressionIssue(expressionTitle, issue); } return false; } return true; } private boolean validateGenericUdf(ExprNodeGenericFuncDesc genericUDFExpr) { if (VectorizationContext.isCustomUDF(genericUDFExpr)) { return true; } if (hiveVectorAdaptorUsageMode == HiveVectorAdaptorUsageMode.NONE || hiveVectorAdaptorUsageMode == HiveVectorAdaptorUsageMode.CHOSEN) { GenericUDF genericUDF = genericUDFExpr.getGenericUDF(); if (genericUDF instanceof GenericUDFBridge) { Class<? 
extends UDF> udf = ((GenericUDFBridge) genericUDF).getUdfClass(); return supportedGenericUDFs.contains(udf); } else { return supportedGenericUDFs.contains(genericUDF.getClass()); } } return true; } public static ObjectInspector.Category aggregationOutputCategory(VectorAggregateExpression vectorAggrExpr) { ObjectInspector outputObjInspector = vectorAggrExpr.getOutputObjectInspector(); return outputObjInspector.getCategory(); } private Pair<Boolean,Boolean> validateAggregationDesc(AggregationDesc aggDesc, ProcessingMode processingMode, boolean hasKeys) { String udfName = aggDesc.getGenericUDAFName().toLowerCase(); if (!supportedAggregationUdfs.contains(udfName)) { setExpressionIssue("Aggregation Function", "UDF " + udfName + " not supported"); return new Pair<Boolean,Boolean>(false, false); } /* if (aggDesc.getDistinct()) { setExpressionIssue("Aggregation Function", "DISTINCT not supported"); return new Pair<Boolean,Boolean>(false, false); } */ if (aggDesc.getParameters() != null && !validateExprNodeDesc(aggDesc.getParameters(), "Aggregation Function UDF " + udfName + " parameter")) { return new Pair<Boolean,Boolean>(false, false); } // See if we can vectorize the aggregation. VectorizationContext vc = new ValidatorVectorizationContext(hiveConf); VectorAggregateExpression vectorAggrExpr; try { vectorAggrExpr = vc.getAggregatorExpression(aggDesc); } catch (Exception e) { // We should have already attempted to vectorize in validateAggregationDesc. if (LOG.isDebugEnabled()) { LOG.debug("Vectorization of aggregation should have succeeded ", e); } setExpressionIssue("Aggregation Function", "Vectorization of aggreation should have succeeded " + e); return new Pair<Boolean,Boolean>(false, false); } if (LOG.isDebugEnabled()) { LOG.debug("Aggregation " + aggDesc.getExprString() + " --> " + " vector expression " + vectorAggrExpr.toString()); } ObjectInspector.Category outputCategory = aggregationOutputCategory(vectorAggrExpr); boolean outputIsPrimitive = (outputCategory == ObjectInspector.Category.PRIMITIVE); if (processingMode == ProcessingMode.MERGE_PARTIAL && hasKeys && !outputIsPrimitive) { setOperatorIssue("Vectorized Reduce MergePartial GROUP BY keys can only handle aggregate outputs that are primitive types"); return new Pair<Boolean,Boolean>(false, false); } return new Pair<Boolean,Boolean>(true, outputIsPrimitive); } public static boolean validateDataType(String type, VectorExpressionDescriptor.Mode mode) { type = type.toLowerCase(); boolean result = supportedDataTypesPattern.matcher(type).matches(); if (result && mode == VectorExpressionDescriptor.Mode.PROJECTION && type.equals("void")) { return false; } return result; } private VectorizationContext getVectorizationContext(String contextName, VectorTaskColumnInfo vectorTaskColumnInfo) { VectorizationContext vContext = new VectorizationContext(contextName, vectorTaskColumnInfo.allColumnNames, hiveConf); return vContext; } private void fixupParentChildOperators(Operator<? extends OperatorDesc> op, Operator<? extends OperatorDesc> vectorOp) { if (op.getParentOperators() != null) { vectorOp.setParentOperators(op.getParentOperators()); for (Operator<? extends OperatorDesc> p : op.getParentOperators()) { p.replaceChild(op, vectorOp); } } if (op.getChildOperators() != null) { vectorOp.setChildOperators(op.getChildOperators()); for (Operator<? 
extends OperatorDesc> c : op.getChildOperators()) { c.replaceParent(op, vectorOp); } } } private boolean isBigTableOnlyResults(MapJoinDesc desc) { Byte[] order = desc.getTagOrder(); byte posBigTable = (byte) desc.getPosBigTable(); Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]); int[] smallTableIndices; int smallTableIndicesSize; if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable); LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices " + Arrays.toString(smallTableIndices)); smallTableIndicesSize = smallTableIndices.length; } else { smallTableIndices = null; LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices EMPTY"); smallTableIndicesSize = 0; } List<Integer> smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable); LOG.info("Vectorizer isBigTableOnlyResults smallTableRetainList " + smallTableRetainList); int smallTableRetainSize = smallTableRetainList.size(); if (smallTableIndicesSize > 0) { // Small table indices has priority over retain. for (int i = 0; i < smallTableIndicesSize; i++) { if (smallTableIndices[i] < 0) { // Negative numbers indicate a column to be (deserialize) read from the small table's // LazyBinary value row. setOperatorIssue("Vectorizer isBigTableOnlyResults smallTableIndices[i] < 0 returning false"); return false; } } } else if (smallTableRetainSize > 0) { setOperatorIssue("Vectorizer isBigTableOnlyResults smallTableRetainSize > 0 returning false"); return false; } LOG.info("Vectorizer isBigTableOnlyResults returning true"); return true; } Operator<? extends OperatorDesc> specializeMapJoinOperator(Operator<? extends OperatorDesc> op, VectorizationContext vContext, MapJoinDesc desc, VectorMapJoinInfo vectorMapJoinInfo) throws HiveException { Operator<? extends OperatorDesc> vectorOp = null; Class<? extends Operator<?>> opClass = null; VectorMapJoinDesc vectorDesc = (VectorMapJoinDesc) desc.getVectorDesc(); HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE; HashTableKind hashTableKind = HashTableKind.NONE; HashTableKeyType hashTableKeyType = HashTableKeyType.NONE; OperatorVariation operatorVariation = OperatorVariation.NONE; if (vectorDesc.getIsFastHashTableEnabled()) { hashTableImplementationType = HashTableImplementationType.FAST; } else { hashTableImplementationType = HashTableImplementationType.OPTIMIZED; } int joinType = desc.getConds()[0].getType(); boolean isInnerBigOnly = false; if (joinType == JoinDesc.INNER_JOIN && isBigTableOnlyResults(desc)) { isInnerBigOnly = true; } // By default, we can always use the multi-key class. hashTableKeyType = HashTableKeyType.MULTI_KEY; if (!HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MULTIKEY_ONLY_ENABLED)) { // Look for single column optimization. 
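// Illustrative example (hypothetical join, not from this file): a single BIGINT equi-join
// key such as "a.id = b.id" resolves below to HashTableKeyType.LONG, so a Long-specialized
// operator (e.g. VectorMapJoinInnerLongOperator for an inner join) is chosen; a composite
// key such as (id, name) stays with the MULTI_KEY default.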
byte posBigTable = (byte) desc.getPosBigTable(); Map<Byte, List<ExprNodeDesc>> keyExprs = desc.getKeys(); List<ExprNodeDesc> bigTableKeyExprs = keyExprs.get(posBigTable); if (bigTableKeyExprs.size() == 1) { TypeInfo typeInfo = bigTableKeyExprs.get(0).getTypeInfo(); LOG.info("Vectorizer vectorizeOperator map join typeName " + typeInfo.getTypeName()); switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) { case BOOLEAN: hashTableKeyType = HashTableKeyType.BOOLEAN; break; case BYTE: hashTableKeyType = HashTableKeyType.BYTE; break; case SHORT: hashTableKeyType = HashTableKeyType.SHORT; break; case INT: hashTableKeyType = HashTableKeyType.INT; break; case LONG: hashTableKeyType = HashTableKeyType.LONG; break; case STRING: case CHAR: case VARCHAR: case BINARY: hashTableKeyType = HashTableKeyType.STRING; default: // Stay with multi-key. } } } switch (joinType) { case JoinDesc.INNER_JOIN: if (!isInnerBigOnly) { operatorVariation = OperatorVariation.INNER; hashTableKind = HashTableKind.HASH_MAP; } else { operatorVariation = OperatorVariation.INNER_BIG_ONLY; hashTableKind = HashTableKind.HASH_MULTISET; } break; case JoinDesc.LEFT_OUTER_JOIN: case JoinDesc.RIGHT_OUTER_JOIN: operatorVariation = OperatorVariation.OUTER; hashTableKind = HashTableKind.HASH_MAP; break; case JoinDesc.LEFT_SEMI_JOIN: operatorVariation = OperatorVariation.LEFT_SEMI; hashTableKind = HashTableKind.HASH_SET; break; default: throw new HiveException("Unknown join type " + joinType); } LOG.info("Vectorizer vectorizeOperator map join hashTableKind " + hashTableKind.name() + " hashTableKeyType " + hashTableKeyType.name()); switch (hashTableKeyType) { case BOOLEAN: case BYTE: case SHORT: case INT: case LONG: switch (operatorVariation) { case INNER: opClass = VectorMapJoinInnerLongOperator.class; break; case INNER_BIG_ONLY: opClass = VectorMapJoinInnerBigOnlyLongOperator.class; break; case LEFT_SEMI: opClass = VectorMapJoinLeftSemiLongOperator.class; break; case OUTER: opClass = VectorMapJoinOuterLongOperator.class; break; default: throw new HiveException("Unknown operator variation " + operatorVariation); } break; case STRING: switch (operatorVariation) { case INNER: opClass = VectorMapJoinInnerStringOperator.class; break; case INNER_BIG_ONLY: opClass = VectorMapJoinInnerBigOnlyStringOperator.class; break; case LEFT_SEMI: opClass = VectorMapJoinLeftSemiStringOperator.class; break; case OUTER: opClass = VectorMapJoinOuterStringOperator.class; break; default: throw new HiveException("Unknown operator variation " + operatorVariation); } break; case MULTI_KEY: switch (operatorVariation) { case INNER: opClass = VectorMapJoinInnerMultiKeyOperator.class; break; case INNER_BIG_ONLY: opClass = VectorMapJoinInnerBigOnlyMultiKeyOperator.class; break; case LEFT_SEMI: opClass = VectorMapJoinLeftSemiMultiKeyOperator.class; break; case OUTER: opClass = VectorMapJoinOuterMultiKeyOperator.class; break; default: throw new HiveException("Unknown operator variation " + operatorVariation); } break; default: throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name()); } boolean minMaxEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MINMAX_ENABLED); vectorDesc.setHashTableImplementationType(hashTableImplementationType); vectorDesc.setHashTableKind(hashTableKind); vectorDesc.setHashTableKeyType(hashTableKeyType); vectorDesc.setOperatorVariation(operatorVariation); vectorDesc.setMinMaxEnabled(minMaxEnabled); vectorDesc.setVectorMapJoinInfo(vectorMapJoinInfo); vectorOp = 
OperatorFactory.getVectorOperator( opClass, op.getCompilationOpContext(), op.getConf(), vContext); LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName()); return vectorOp; } public static boolean onExpressionHasNullSafes(MapJoinDesc desc) { boolean[] nullSafes = desc.getNullSafes(); if (nullSafes == null) { return false; } for (boolean nullSafe : nullSafes) { if (nullSafe) { return true; } } return false; } private boolean canSpecializeMapJoin(Operator<? extends OperatorDesc> op, MapJoinDesc desc, boolean isTezOrSpark, VectorizationContext vContext, VectorMapJoinInfo vectorMapJoinInfo) throws HiveException { Preconditions.checkState(op instanceof MapJoinOperator); // Allocate a VectorReduceSinkDesc initially with implementation type NONE so EXPLAIN // can report this operator was vectorized, but not native. And, the conditions. VectorMapJoinDesc vectorDesc = new VectorMapJoinDesc(); desc.setVectorDesc(vectorDesc); boolean isVectorizationMapJoinNativeEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED); String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE); boolean oneMapJoinCondition = (desc.getConds().length == 1); boolean hasNullSafes = onExpressionHasNullSafes(desc); byte posBigTable = (byte) desc.getPosBigTable(); // Since we want to display all the met and not met conditions in EXPLAIN, we determine all // information first.... List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable); VectorExpression[] allBigTableKeyExpressions = vContext.getVectorExpressions(keyDesc); final int allBigTableKeyExpressionsLength = allBigTableKeyExpressions.length; boolean supportsKeyTypes = true; // Assume. HashSet<String> notSupportedKeyTypes = new HashSet<String>(); // Since a key expression can be a calculation and the key will go into a scratch column, // we need the mapping and type information. int[] bigTableKeyColumnMap = new int[allBigTableKeyExpressionsLength]; String[] bigTableKeyColumnNames = new String[allBigTableKeyExpressionsLength]; TypeInfo[] bigTableKeyTypeInfos = new TypeInfo[allBigTableKeyExpressionsLength]; ArrayList<VectorExpression> bigTableKeyExpressionsList = new ArrayList<VectorExpression>(); VectorExpression[] bigTableKeyExpressions; for (int i = 0; i < allBigTableKeyExpressionsLength; i++) { VectorExpression ve = allBigTableKeyExpressions[i]; if (!IdentityExpression.isColumnOnly(ve)) { bigTableKeyExpressionsList.add(ve); } bigTableKeyColumnMap[i] = ve.getOutputColumn(); ExprNodeDesc exprNode = keyDesc.get(i); bigTableKeyColumnNames[i] = exprNode.toString(); TypeInfo typeInfo = exprNode.getTypeInfo(); // Verify we handle the key column types for an optimized table. This is the effectively the // same check used in HashTableLoader. if (!MapJoinKey.isSupportedField(typeInfo)) { supportsKeyTypes = false; Category category = typeInfo.getCategory(); notSupportedKeyTypes.add( (category != Category.PRIMITIVE ? 
category.toString() : ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory().toString())); } bigTableKeyTypeInfos[i] = typeInfo; } if (bigTableKeyExpressionsList.size() == 0) { bigTableKeyExpressions = null; } else { bigTableKeyExpressions = bigTableKeyExpressionsList.toArray(new VectorExpression[0]); } List<ExprNodeDesc> bigTableExprs = desc.getExprs().get(posBigTable); VectorExpression[] allBigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs); boolean isFastHashTableEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED); // Especially since LLAP is prone to turn it off in the MapJoinDesc in later // physical optimizer stages... boolean isHybridHashJoin = desc.isHybridHashJoin(); /* * Populate vectorMapJoininfo. */ /* * Similarly, we need a mapping since a value expression can be a calculation and the value * will go into a scratch column. */ int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length]; String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length]; TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length]; ArrayList<VectorExpression> bigTableValueExpressionsList = new ArrayList<VectorExpression>(); VectorExpression[] bigTableValueExpressions; for (int i = 0; i < bigTableValueColumnMap.length; i++) { VectorExpression ve = allBigTableValueExpressions[i]; if (!IdentityExpression.isColumnOnly(ve)) { bigTableValueExpressionsList.add(ve); } bigTableValueColumnMap[i] = ve.getOutputColumn(); ExprNodeDesc exprNode = bigTableExprs.get(i); bigTableValueColumnNames[i] = exprNode.toString(); bigTableValueTypeInfos[i] = exprNode.getTypeInfo(); } if (bigTableValueExpressionsList.size() == 0) { bigTableValueExpressions = null; } else { bigTableValueExpressions = bigTableValueExpressionsList.toArray(new VectorExpression[0]); } vectorMapJoinInfo.setBigTableKeyColumnMap(bigTableKeyColumnMap); vectorMapJoinInfo.setBigTableKeyColumnNames(bigTableKeyColumnNames); vectorMapJoinInfo.setBigTableKeyTypeInfos(bigTableKeyTypeInfos); vectorMapJoinInfo.setBigTableKeyExpressions(bigTableKeyExpressions); vectorMapJoinInfo.setBigTableValueColumnMap(bigTableValueColumnMap); vectorMapJoinInfo.setBigTableValueColumnNames(bigTableValueColumnNames); vectorMapJoinInfo.setBigTableValueTypeInfos(bigTableValueTypeInfos); vectorMapJoinInfo.setBigTableValueExpressions(bigTableValueExpressions); /* * Small table information. */ VectorColumnOutputMapping bigTableRetainedMapping = new VectorColumnOutputMapping("Big Table Retained Mapping"); VectorColumnOutputMapping bigTableOuterKeyMapping = new VectorColumnOutputMapping("Big Table Outer Key Mapping"); // The order of the fields in the LazyBinary small table value must be used, so // we use the source ordering flavor for the mapping. VectorColumnSourceMapping smallTableMapping = new VectorColumnSourceMapping("Small Table Mapping"); Byte[] order = desc.getTagOrder(); Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]); boolean isOuterJoin = !desc.getNoOuterJoin(); /* * Gather up big and small table output result information from the MapJoinDesc. 
*/ List<Integer> bigTableRetainList = desc.getRetainList().get(posBigTable); int bigTableRetainSize = bigTableRetainList.size(); int[] smallTableIndices; int smallTableIndicesSize; List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable); if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) { smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable); smallTableIndicesSize = smallTableIndices.length; } else { smallTableIndices = null; smallTableIndicesSize = 0; } List<Integer> smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable); int smallTableRetainSize = smallTableRetainList.size(); int smallTableResultSize = 0; if (smallTableIndicesSize > 0) { smallTableResultSize = smallTableIndicesSize; } else if (smallTableRetainSize > 0) { smallTableResultSize = smallTableRetainSize; } /* * Determine the big table retained mapping first so we can optimize out (with * projection) copying inner join big table keys in the subsequent small table results section. */ // We use a mapping object here so we can build the projection in any order and // get the ordered by 0 to n-1 output columns at the end. // // Also, to avoid copying a big table key into the small table result area for inner joins, // we reference it with the projection so there can be duplicate output columns // in the projection. VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping"); int nextOutputColumn = (order[0] == posBigTable ? 0 : smallTableResultSize); for (int i = 0; i < bigTableRetainSize; i++) { // Since bigTableValueExpressions may do a calculation and produce a scratch column, we // need to map to the right batch column. int retainColumn = bigTableRetainList.get(i); int batchColumnIndex = bigTableValueColumnMap[retainColumn]; TypeInfo typeInfo = bigTableValueTypeInfos[i]; // With this map we project the big table batch to make it look like an output batch. projectionMapping.add(nextOutputColumn, batchColumnIndex, typeInfo); // Collect columns we copy from the big table batch to the overflow batch. if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) { // Tolerate repeated use of a big table column. bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeInfo); } nextOutputColumn++; } /* * Now determine the small table results. */ boolean smallTableExprVectorizes = true; int firstSmallTableOutputColumn; firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0); int smallTableOutputCount = 0; nextOutputColumn = firstSmallTableOutputColumn; // Small table indices has more information (i.e. keys) than retain, so use it if it exists... String[] bigTableRetainedNames; if (smallTableIndicesSize > 0) { smallTableOutputCount = smallTableIndicesSize; bigTableRetainedNames = new String[smallTableOutputCount]; for (int i = 0; i < smallTableIndicesSize; i++) { if (smallTableIndices[i] >= 0) { // Zero and above numbers indicate a big table key is needed for // small table result "area". int keyIndex = smallTableIndices[i]; // Since bigTableKeyExpressions may do a calculation and produce a scratch column, we // need to map the right column. int batchKeyColumn = bigTableKeyColumnMap[keyIndex]; bigTableRetainedNames[i] = bigTableKeyColumnNames[keyIndex]; TypeInfo typeInfo = bigTableKeyTypeInfos[keyIndex]; if (!isOuterJoin) { // Optimize inner join keys of small table results. 
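// Illustrative example (hypothetical query, not from this file): for an inner join such as
// "SELECT b.key, a.val FROM a JOIN b ON a.key = b.key", the small-table key output is
// identical to the big-table key, so the projection below references the big table's key
// batch column directly (it may appear twice in the projection) instead of copying it into
// a scratch column.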
// Project the big table key into the small table result "area". projectionMapping.add(nextOutputColumn, batchKeyColumn, typeInfo); if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) { // If necessary, copy the big table key into the overflow batch's small table // result "area". bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeInfo); } } else { // For outer joins, since the small table key can be null when there is no match, // we must have a physical (scratch) column for those keys. We cannot use the // projection optimization used by inner joins above. int scratchColumn = vContext.allocateScratchColumn(typeInfo); projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeInfo); bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeInfo); } } else { // Negative numbers indicate a column to be (deserialize) read from the small table's // LazyBinary value row. int smallTableValueIndex = -smallTableIndices[i] - 1; ExprNodeDesc smallTableExprNode = smallTableExprs.get(i); if (!validateExprNodeDesc(smallTableExprNode, "Small Table")) { clearNotVectorizedReason(); smallTableExprVectorizes = false; } bigTableRetainedNames[i] = smallTableExprNode.toString(); TypeInfo typeInfo = smallTableExprNode.getTypeInfo(); // Make a new big table scratch column for the small table value. int scratchColumn = vContext.allocateScratchColumn(typeInfo); projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo); } nextOutputColumn++; } } else if (smallTableRetainSize > 0) { smallTableOutputCount = smallTableRetainSize; bigTableRetainedNames = new String[smallTableOutputCount]; // Only small table values appear in join output result. for (int i = 0; i < smallTableRetainSize; i++) { int smallTableValueIndex = smallTableRetainList.get(i); ExprNodeDesc smallTableExprNode = smallTableExprs.get(i); if (!validateExprNodeDesc(smallTableExprNode, "Small Table")) { clearNotVectorizedReason(); smallTableExprVectorizes = false; } bigTableRetainedNames[i] = smallTableExprNode.toString(); // Make a new big table scratch column for the small table value. TypeInfo typeInfo = smallTableExprNode.getTypeInfo(); int scratchColumn = vContext.allocateScratchColumn(typeInfo); projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo); smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo); nextOutputColumn++; } } else { bigTableRetainedNames = new String[0]; } boolean useOptimizedTable = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE); // Remember the condition variables for EXPLAIN regardless of whether we specialize or not. vectorDesc.setUseOptimizedTable(useOptimizedTable); vectorDesc.setIsVectorizationMapJoinNativeEnabled(isVectorizationMapJoinNativeEnabled); vectorDesc.setEngine(engine); vectorDesc.setOneMapJoinCondition(oneMapJoinCondition); vectorDesc.setHasNullSafes(hasNullSafes); vectorDesc.setSmallTableExprVectorizes(smallTableExprVectorizes); vectorDesc.setIsFastHashTableEnabled(isFastHashTableEnabled); vectorDesc.setIsHybridHashJoin(isHybridHashJoin); vectorDesc.setSupportsKeyTypes(supportsKeyTypes); if (!supportsKeyTypes) { vectorDesc.setNotSupportedKeyTypes(new ArrayList(notSupportedKeyTypes)); } // Check common conditions for both Optimized and Fast Hash Tables. boolean result = true; // Assume. 
if (!useOptimizedTable || !isVectorizationMapJoinNativeEnabled || !isTezOrSpark || !oneMapJoinCondition || hasNullSafes || !smallTableExprVectorizes) { result = false; } // supportsKeyTypes if (!isFastHashTableEnabled) { // Check optimized-only hash table restrictions. if (!supportsKeyTypes) { result = false; } } else { // With the fast hash table implementation, we currently do not support // Hybrid Grace Hash Join. if (isHybridHashJoin) { result = false; } } // Convert dynamic arrays and maps to simple arrays. bigTableRetainedMapping.finalize(); bigTableOuterKeyMapping.finalize(); smallTableMapping.finalize(); vectorMapJoinInfo.setBigTableRetainedMapping(bigTableRetainedMapping); vectorMapJoinInfo.setBigTableOuterKeyMapping(bigTableOuterKeyMapping); vectorMapJoinInfo.setSmallTableMapping(smallTableMapping); projectionMapping.finalize(); // Verify we added an entry for each output. assert projectionMapping.isSourceSequenceGood(); vectorMapJoinInfo.setProjectionMapping(projectionMapping); return result; } private Operator<? extends OperatorDesc> specializeReduceSinkOperator( Operator<? extends OperatorDesc> op, VectorizationContext vContext, ReduceSinkDesc desc, VectorReduceSinkInfo vectorReduceSinkInfo) throws HiveException { VectorReduceSinkDesc vectorDesc = (VectorReduceSinkDesc) desc.getVectorDesc(); Type[] reduceSinkKeyColumnVectorTypes = vectorReduceSinkInfo.getReduceSinkKeyColumnVectorTypes(); // By default, we can always use the multi-key class. VectorReduceSinkDesc.ReduceSinkKeyType reduceSinkKeyType = VectorReduceSinkDesc.ReduceSinkKeyType.MULTI_KEY; // Look for single column optimization. if (reduceSinkKeyColumnVectorTypes != null && reduceSinkKeyColumnVectorTypes.length == 1) { LOG.info("Vectorizer vectorizeOperator groupby typeName " + vectorReduceSinkInfo.getReduceSinkKeyTypeInfos()[0]); Type columnVectorType = reduceSinkKeyColumnVectorTypes[0]; switch (columnVectorType) { case LONG: { PrimitiveCategory primitiveCategory = ((PrimitiveTypeInfo) vectorReduceSinkInfo.getReduceSinkKeyTypeInfos()[0]).getPrimitiveCategory(); switch (primitiveCategory) { case BOOLEAN: case BYTE: case SHORT: case INT: case LONG: reduceSinkKeyType = VectorReduceSinkDesc.ReduceSinkKeyType.LONG; break; default: // Other integer types not supported yet. break; } } break; case BYTES: reduceSinkKeyType = VectorReduceSinkDesc.ReduceSinkKeyType.STRING; default: // Stay with multi-key. break; } } Class<? extends Operator<?>> opClass = null; if (vectorReduceSinkInfo.getUseUniformHash()) { if (vectorDesc.getIsEmptyKey()) { opClass = VectorReduceSinkEmptyKeyOperator.class; } else { switch (reduceSinkKeyType) { case LONG: opClass = VectorReduceSinkLongOperator.class; break; case STRING: opClass = VectorReduceSinkStringOperator.class; break; case MULTI_KEY: opClass = VectorReduceSinkMultiKeyOperator.class; break; default: throw new HiveException("Unknown reduce sink key type " + reduceSinkKeyType); } } } else { if (vectorDesc.getIsEmptyKey() && vectorDesc.getIsEmptyBuckets() && vectorDesc.getIsEmptyPartitions()) { opClass = VectorReduceSinkEmptyKeyOperator.class; } else { opClass = VectorReduceSinkObjectHashOperator.class; } } vectorDesc.setReduceSinkKeyType(reduceSinkKeyType); vectorDesc.setVectorReduceSinkInfo(vectorReduceSinkInfo); LOG.info("Vectorizer vectorizeOperator reduce sink class " + opClass.getSimpleName()); Operator<? 
extends OperatorDesc> vectorOp = null; try { vectorOp = OperatorFactory.getVectorOperator( opClass, op.getCompilationOpContext(), op.getConf(), vContext); } catch (Exception e) { LOG.info("Vectorizer vectorizeOperator reduce sink class exception " + opClass.getSimpleName() + " exception " + e); throw new HiveException(e); } return vectorOp; } private boolean canSpecializeReduceSink(ReduceSinkDesc desc, boolean isTezOrSpark, VectorizationContext vContext, VectorReduceSinkInfo vectorReduceSinkInfo) throws HiveException { // Allocate a VectorReduceSinkDesc initially with key type NONE so EXPLAIN can report this // operator was vectorized, but not native. And, the conditions. VectorReduceSinkDesc vectorDesc = new VectorReduceSinkDesc(); desc.setVectorDesc(vectorDesc); // Various restrictions. // Set this if we encounter a condition we were not expecting. boolean isUnexpectedCondition = false; boolean isVectorizationReduceSinkNativeEnabled = HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED); String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE); int limit = desc.getTopN(); float memUsage = desc.getTopNMemoryUsage(); boolean hasPTFTopN = (limit >= 0 && memUsage > 0 && desc.isPTFReduceSink()); boolean hasDistinctColumns = (desc.getDistinctColumnIndices().size() > 0); TableDesc keyTableDesc = desc.getKeySerializeInfo(); Class<? extends Deserializer> keySerializerClass = keyTableDesc.getDeserializerClass(); boolean isKeyBinarySortable = (keySerializerClass == org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class); TableDesc valueTableDesc = desc.getValueSerializeInfo(); Class<? extends Deserializer> valueDeserializerClass = valueTableDesc.getDeserializerClass(); boolean isValueLazyBinary = (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class); // We are doing work here we'd normally do in VectorGroupByCommonOperator's constructor. // So if we later decide not to specialize, we'll just waste any scratch columns allocated... List<ExprNodeDesc> keysDescs = desc.getKeyCols(); final boolean isEmptyKey = (keysDescs.size() == 0); if (!isEmptyKey) { VectorExpression[] allKeyExpressions = vContext.getVectorExpressions(keysDescs); final int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length]; final TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length]; final Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length]; final VectorExpression[] reduceSinkKeyExpressions; // Since a key expression can be a calculation and the key will go into a scratch column, // we need the mapping and type information. 
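// Illustrative example (hypothetical keys, not from this file): for key columns
// (col1, upper(col2)), col1 maps to its own batch column via an IdentityExpression, while
// upper(col2) is evaluated into a scratch column; reduceSinkKeyColumnMap records the output
// column of each key expression, and only the non-identity expressions are retained below.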
ArrayList<VectorExpression> groupByKeyExpressionsList = new ArrayList<VectorExpression>(); for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) { VectorExpression ve = allKeyExpressions[i]; reduceSinkKeyColumnMap[i] = ve.getOutputColumn(); reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo(); reduceSinkKeyColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { groupByKeyExpressionsList.add(ve); } } if (groupByKeyExpressionsList.size() == 0) { reduceSinkKeyExpressions = null; } else { reduceSinkKeyExpressions = groupByKeyExpressionsList.toArray(new VectorExpression[0]); } vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap); vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos); vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions); } ArrayList<ExprNodeDesc> valueDescs = desc.getValueCols(); final boolean isEmptyValue = (valueDescs.size() == 0); if (!isEmptyValue) { VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs); final int[] reduceSinkValueColumnMap = new int[allValueExpressions.length]; final TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[allValueExpressions.length]; final Type[] reduceSinkValueColumnVectorTypes = new Type[allValueExpressions.length]; VectorExpression[] reduceSinkValueExpressions; ArrayList<VectorExpression> reduceSinkValueExpressionsList = new ArrayList<VectorExpression>(); for (int i = 0; i < valueDescs.size(); ++i) { VectorExpression ve = allValueExpressions[i]; reduceSinkValueColumnMap[i] = ve.getOutputColumn(); reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo(); reduceSinkValueColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { reduceSinkValueExpressionsList.add(ve); } } if (reduceSinkValueExpressionsList.size() == 0) { reduceSinkValueExpressions = null; } else { reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]); } vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap); vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos); vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions); } boolean useUniformHash = desc.getReducerTraits().contains(UNIFORM); vectorReduceSinkInfo.setUseUniformHash(useUniformHash); List<ExprNodeDesc> bucketDescs = desc.getBucketCols(); final boolean isEmptyBuckets = (bucketDescs == null || bucketDescs.size() == 0); List<ExprNodeDesc> partitionDescs = desc.getPartitionCols(); final boolean isEmptyPartitions = (partitionDescs == null || partitionDescs.size() == 0); if (useUniformHash || (isEmptyKey && isEmptyBuckets && isEmptyPartitions)) { // NOTE: For Uniform Hash or no buckets/partitions, when the key is empty, we will use the VectorReduceSinkEmptyKeyOperator instead. } else { // Collect bucket and/or partition information for object hashing. 
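// Illustrative example (assumption about typical plans, not from this file): a ReduceSink
// whose reducer traits are not UNIFORM (for instance, one produced for an insert into a
// bucketed table) takes this branch; its bucket and/or partition expressions are vectorized
// here so VectorReduceSinkObjectHashOperator can compute the hash on vectorized columns.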
int[] reduceSinkBucketColumnMap = null; TypeInfo[] reduceSinkBucketTypeInfos = null; Type[] reduceSinkBucketColumnVectorTypes = null; VectorExpression[] reduceSinkBucketExpressions = null; if (!isEmptyBuckets) { VectorExpression[] allBucketExpressions = vContext.getVectorExpressions(bucketDescs); reduceSinkBucketColumnMap = new int[bucketDescs.size()]; reduceSinkBucketTypeInfos = new TypeInfo[bucketDescs.size()]; reduceSinkBucketColumnVectorTypes = new Type[bucketDescs.size()]; ArrayList<VectorExpression> reduceSinkBucketExpressionsList = new ArrayList<VectorExpression>(); for (int i = 0; i < bucketDescs.size(); ++i) { VectorExpression ve = allBucketExpressions[i]; reduceSinkBucketColumnMap[i] = ve.getOutputColumn(); reduceSinkBucketTypeInfos[i] = bucketDescs.get(i).getTypeInfo(); reduceSinkBucketColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkBucketTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { reduceSinkBucketExpressionsList.add(ve); } } if (reduceSinkBucketExpressionsList.size() == 0) { reduceSinkBucketExpressions = null; } else { reduceSinkBucketExpressions = reduceSinkBucketExpressionsList.toArray(new VectorExpression[0]); } } int[] reduceSinkPartitionColumnMap = null; TypeInfo[] reduceSinkPartitionTypeInfos = null; Type[] reduceSinkPartitionColumnVectorTypes = null; VectorExpression[] reduceSinkPartitionExpressions = null; if (!isEmptyPartitions) { VectorExpression[] allPartitionExpressions = vContext.getVectorExpressions(partitionDescs); reduceSinkPartitionColumnMap = new int[partitionDescs.size()]; reduceSinkPartitionTypeInfos = new TypeInfo[partitionDescs.size()]; reduceSinkPartitionColumnVectorTypes = new Type[partitionDescs.size()]; ArrayList<VectorExpression> reduceSinkPartitionExpressionsList = new ArrayList<VectorExpression>(); for (int i = 0; i < partitionDescs.size(); ++i) { VectorExpression ve = allPartitionExpressions[i]; reduceSinkPartitionColumnMap[i] = ve.getOutputColumn(); reduceSinkPartitionTypeInfos[i] = partitionDescs.get(i).getTypeInfo(); reduceSinkPartitionColumnVectorTypes[i] = VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkPartitionTypeInfos[i]); if (!IdentityExpression.isColumnOnly(ve)) { reduceSinkPartitionExpressionsList.add(ve); } } if (reduceSinkPartitionExpressionsList.size() == 0) { reduceSinkPartitionExpressions = null; } else { reduceSinkPartitionExpressions = reduceSinkPartitionExpressionsList.toArray(new VectorExpression[0]); } } vectorReduceSinkInfo.setReduceSinkBucketColumnMap(reduceSinkBucketColumnMap); vectorReduceSinkInfo.setReduceSinkBucketTypeInfos(reduceSinkBucketTypeInfos); vectorReduceSinkInfo.setReduceSinkBucketColumnVectorTypes(reduceSinkBucketColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkBucketExpressions(reduceSinkBucketExpressions); vectorReduceSinkInfo.setReduceSinkPartitionColumnMap(reduceSinkPartitionColumnMap); vectorReduceSinkInfo.setReduceSinkPartitionTypeInfos(reduceSinkPartitionTypeInfos); vectorReduceSinkInfo.setReduceSinkPartitionColumnVectorTypes(reduceSinkPartitionColumnVectorTypes); vectorReduceSinkInfo.setReduceSinkPartitionExpressions(reduceSinkPartitionExpressions); } // Remember the condition variables for EXPLAIN regardless. 
    vectorDesc.setIsVectorizationReduceSinkNativeEnabled(isVectorizationReduceSinkNativeEnabled);
    vectorDesc.setEngine(engine);
    vectorDesc.setIsEmptyKey(isEmptyKey);
    vectorDesc.setIsEmptyValue(isEmptyValue);
    vectorDesc.setIsEmptyBuckets(isEmptyBuckets);
    vectorDesc.setIsEmptyPartitions(isEmptyPartitions);
    vectorDesc.setHasPTFTopN(hasPTFTopN);
    vectorDesc.setHasDistinctColumns(hasDistinctColumns);
    vectorDesc.setIsKeyBinarySortable(isKeyBinarySortable);
    vectorDesc.setIsValueLazyBinary(isValueLazyBinary);

    // This indicates we logged an inconsistency (from our point-of-view) and will not make this
    // operator native...
    vectorDesc.setIsUnexpectedCondition(isUnexpectedCondition);

    // Many restrictions.
    if (!isVectorizationReduceSinkNativeEnabled ||
        !isTezOrSpark ||
        hasPTFTopN ||
        hasDistinctColumns ||
        !isKeyBinarySortable ||
        !isValueLazyBinary ||
        isUnexpectedCondition) {
      return false;
    }

    return true;
  }

  private boolean usesVectorUDFAdaptor(VectorExpression vecExpr) {
    if (vecExpr == null) {
      return false;
    }
    if (vecExpr instanceof VectorUDFAdaptor) {
      return true;
    }
    if (usesVectorUDFAdaptor(vecExpr.getChildExpressions())) {
      return true;
    }
    return false;
  }

  private boolean usesVectorUDFAdaptor(VectorExpression[] vecExprs) {
    if (vecExprs == null) {
      return false;
    }
    for (VectorExpression vecExpr : vecExprs) {
      if (usesVectorUDFAdaptor(vecExpr)) {
        return true;
      }
    }
    return false;
  }

  public static Operator<? extends OperatorDesc> vectorizeTableScanOperator(
      Operator<? extends OperatorDesc> tableScanOp, VectorizationContext vContext)
          throws HiveException {
    TableScanDesc tableScanDesc = (TableScanDesc) tableScanOp.getConf();
    VectorTableScanDesc vectorTableScanDesc = new VectorTableScanDesc();
    tableScanDesc.setVectorDesc(vectorTableScanDesc);
    vectorTableScanDesc.setProjectedOutputColumns(
        ArrayUtils.toPrimitive(vContext.getProjectedColumns().toArray(new Integer[0])));
    return tableScanOp;
  }

  public static Operator<? extends OperatorDesc> vectorizeFilterOperator(
      Operator<? extends OperatorDesc> filterOp, VectorizationContext vContext)
          throws HiveException {
    FilterDesc filterDesc = (FilterDesc) filterOp.getConf();
    VectorFilterDesc vectorFilterDesc = new VectorFilterDesc();
    filterDesc.setVectorDesc(vectorFilterDesc);
    ExprNodeDesc predicateExpr = filterDesc.getPredicate();
    VectorExpression vectorPredicateExpr =
        vContext.getVectorExpression(predicateExpr, VectorExpressionDescriptor.Mode.FILTER);
    vectorFilterDesc.setPredicateExpression(vectorPredicateExpr);
    return OperatorFactory.getVectorOperator(
        filterOp.getCompilationOpContext(), filterDesc, vContext);
  }

  /*
   * NOTE: The VectorGroupByDesc has already been allocated and partially populated.
   */
  public static Operator<? extends OperatorDesc> vectorizeGroupByOperator(
      Operator<? extends OperatorDesc> groupByOp, VectorizationContext vContext)
          throws HiveException {
    GroupByDesc groupByDesc = (GroupByDesc) groupByOp.getConf();
    List<ExprNodeDesc> keysDesc = groupByDesc.getKeys();
    VectorExpression[] vecKeyExpressions = vContext.getVectorExpressions(keysDesc);
    ArrayList<AggregationDesc> aggrDesc = groupByDesc.getAggregators();
    final int size = aggrDesc.size();
    VectorAggregateExpression[] vecAggregators = new VectorAggregateExpression[size];
    int[] projectedOutputColumns = new int[size];
    for (int i = 0; i < size; ++i) {
      AggregationDesc aggDesc = aggrDesc.get(i);
      vecAggregators[i] = vContext.getAggregatorExpression(aggDesc);

      // GroupBy generates a new vectorized row batch...
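      // ...so aggregation outputs are projected positionally: output column i of the new
      // batch holds the result of aggregator i.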
      projectedOutputColumns[i] = i;
    }
    VectorGroupByDesc vectorGroupByDesc = (VectorGroupByDesc) groupByDesc.getVectorDesc();
    vectorGroupByDesc.setKeyExpressions(vecKeyExpressions);
    vectorGroupByDesc.setAggregators(vecAggregators);
    vectorGroupByDesc.setProjectedOutputColumns(projectedOutputColumns);
    return OperatorFactory.getVectorOperator(
        groupByOp.getCompilationOpContext(), groupByDesc, vContext);
  }

  public static Operator<? extends OperatorDesc> vectorizeSelectOperator(
      Operator<? extends OperatorDesc> selectOp, VectorizationContext vContext)
          throws HiveException {
    SelectDesc selectDesc = (SelectDesc) selectOp.getConf();
    VectorSelectDesc vectorSelectDesc = new VectorSelectDesc();
    selectDesc.setVectorDesc(vectorSelectDesc);
    List<ExprNodeDesc> colList = selectDesc.getColList();
    int index = 0;
    final int size = colList.size();
    VectorExpression[] vectorSelectExprs = new VectorExpression[size];
    int[] projectedOutputColumns = new int[size];
    for (int i = 0; i < size; i++) {
      ExprNodeDesc expr = colList.get(i);
      VectorExpression ve = vContext.getVectorExpression(expr);
      projectedOutputColumns[i] = ve.getOutputColumn();
      if (ve instanceof IdentityExpression) {
        // Suppress useless evaluation.
        continue;
      }
      vectorSelectExprs[index++] = ve;
    }
    if (index < size) {
      vectorSelectExprs = Arrays.copyOf(vectorSelectExprs, index);
    }
    vectorSelectDesc.setSelectExpressions(vectorSelectExprs);
    vectorSelectDesc.setProjectedOutputColumns(projectedOutputColumns);
    return OperatorFactory.getVectorOperator(
        selectOp.getCompilationOpContext(), selectDesc, vContext);
  }

  public Operator<? extends OperatorDesc> vectorizeOperator(Operator<? extends OperatorDesc> op,
      VectorizationContext vContext, boolean isTezOrSpark,
      VectorTaskColumnInfo vectorTaskColumnInfo) throws HiveException {
    Operator<? extends OperatorDesc> vectorOp = null;
    boolean isNative;
    switch (op.getType()) {
      case TABLESCAN:
        vectorOp = vectorizeTableScanOperator(op, vContext);
        isNative = true;
        break;
      case MAPJOIN:
        {
          if (op instanceof MapJoinOperator) {
            VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo();
            MapJoinDesc desc = (MapJoinDesc) op.getConf();
            boolean specialize = canSpecializeMapJoin(op, desc, isTezOrSpark, vContext, vectorMapJoinInfo);

            if (!specialize) {

              Class<? extends Operator<?>> opClass = null;

              // *NON-NATIVE* vector map differences for LEFT OUTER JOIN and Filtered...

              List<ExprNodeDesc> bigTableFilters = desc.getFilters().get((byte) desc.getPosBigTable());
              boolean isOuterAndFiltered = (!desc.isNoOuterJoin() && bigTableFilters.size() > 0);
              if (!isOuterAndFiltered) {
                opClass = VectorMapJoinOperator.class;
              } else {
                opClass = VectorMapJoinOuterFilteredOperator.class;
              }

              vectorOp = OperatorFactory.getVectorOperator(
                  opClass, op.getCompilationOpContext(), op.getConf(), vContext);
              isNative = false;
            } else {

              // TEMPORARY Until Native Vector Map Join with Hybrid passes tests...
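              // (The commented-out lines below would force hybrid grace hash join off for the
              // native path; they appear to be kept only as a reminder.)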
              // HiveConf.setBoolVar(physicalContext.getConf(),
              //     HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false);

              vectorOp = specializeMapJoinOperator(op, vContext, desc, vectorMapJoinInfo);
              isNative = true;

              if (vectorTaskColumnInfo != null) {
                if (usesVectorUDFAdaptor(vectorMapJoinInfo.getBigTableKeyExpressions())) {
                  vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
                }
                if (usesVectorUDFAdaptor(vectorMapJoinInfo.getBigTableValueExpressions())) {
                  vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
                }
              }
            }
          } else {
            Preconditions.checkState(op instanceof SMBMapJoinOperator);
            SMBJoinDesc smbJoinSinkDesc = (SMBJoinDesc) op.getConf();
            VectorSMBJoinDesc vectorSMBJoinDesc = new VectorSMBJoinDesc();
            smbJoinSinkDesc.setVectorDesc(vectorSMBJoinDesc);
            vectorOp = OperatorFactory.getVectorOperator(
                op.getCompilationOpContext(), smbJoinSinkDesc, vContext);
            isNative = false;
          }
        }
        break;
      case REDUCESINK:
        {
          VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo();
          ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf();
          boolean specialize = canSpecializeReduceSink(desc, isTezOrSpark, vContext, vectorReduceSinkInfo);

          if (!specialize) {
            vectorOp = OperatorFactory.getVectorOperator(
                op.getCompilationOpContext(), op.getConf(), vContext);
            isNative = false;
          } else {
            vectorOp = specializeReduceSinkOperator(op, vContext, desc, vectorReduceSinkInfo);
            isNative = true;

            if (vectorTaskColumnInfo != null) {
              if (usesVectorUDFAdaptor(vectorReduceSinkInfo.getReduceSinkKeyExpressions())) {
                vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
              }
              if (usesVectorUDFAdaptor(vectorReduceSinkInfo.getReduceSinkValueExpressions())) {
                vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
              }
            }
          }
        }
        break;
      case FILTER:
        {
          vectorOp = vectorizeFilterOperator(op, vContext);
          isNative = true;
          if (vectorTaskColumnInfo != null) {
            VectorFilterDesc vectorFilterDesc =
                (VectorFilterDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
            VectorExpression vectorPredicateExpr = vectorFilterDesc.getPredicateExpression();
            if (usesVectorUDFAdaptor(vectorPredicateExpr)) {
              vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
            }
          }
        }
        break;
      case SELECT:
        {
          vectorOp = vectorizeSelectOperator(op, vContext);
          isNative = true;
          if (vectorTaskColumnInfo != null) {
            VectorSelectDesc vectorSelectDesc =
                (VectorSelectDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
            VectorExpression[] vectorSelectExprs = vectorSelectDesc.getSelectExpressions();
            if (usesVectorUDFAdaptor(vectorSelectExprs)) {
              vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
            }
          }
        }
        break;
      case GROUPBY:
        {
          vectorOp = vectorizeGroupByOperator(op, vContext);
          isNative = false;
          if (vectorTaskColumnInfo != null) {
            VectorGroupByDesc vectorGroupByDesc =
                (VectorGroupByDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
            if (!vectorGroupByDesc.isVectorOutput()) {
              vectorTaskColumnInfo.setGroupByVectorOutput(false);
            }
            VectorExpression[] vecKeyExpressions = vectorGroupByDesc.getKeyExpressions();
            if (usesVectorUDFAdaptor(vecKeyExpressions)) {
              vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
            }
            VectorAggregateExpression[] vecAggregators = vectorGroupByDesc.getAggregators();
            for (VectorAggregateExpression vecAggr : vecAggregators) {
              if (usesVectorUDFAdaptor(vecAggr.inputExpression())) {
                vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
              }
            }
          }
        }
        break;
      case FILESINK:
        {
          FileSinkDesc fileSinkDesc = (FileSinkDesc) op.getConf();
          VectorFileSinkDesc vectorFileSinkDesc = new VectorFileSinkDesc();
          fileSinkDesc.setVectorDesc(vectorFileSinkDesc);
          vectorOp = OperatorFactory.getVectorOperator(
              op.getCompilationOpContext(), fileSinkDesc, vContext);
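          // The FileSink gets a vectorized wrapper operator but is still recorded as non-native;
          // the !isNative bookkeeping below clears the task's all-native flag accordingly.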
          isNative = false;
        }
        break;
      case LIMIT:
        {
          LimitDesc limitDesc = (LimitDesc) op.getConf();
          VectorLimitDesc vectorLimitDesc = new VectorLimitDesc();
          limitDesc.setVectorDesc(vectorLimitDesc);
          vectorOp = OperatorFactory.getVectorOperator(
              op.getCompilationOpContext(), limitDesc, vContext);
          isNative = true;
        }
        break;
      case EVENT:
        {
          AppMasterEventDesc eventDesc = (AppMasterEventDesc) op.getConf();
          VectorAppMasterEventDesc vectorEventDesc = new VectorAppMasterEventDesc();
          eventDesc.setVectorDesc(vectorEventDesc);
          vectorOp = OperatorFactory.getVectorOperator(
              op.getCompilationOpContext(), eventDesc, vContext);
          isNative = true;
        }
        break;
      case HASHTABLESINK:
        {
          SparkHashTableSinkDesc sparkHashTableSinkDesc = (SparkHashTableSinkDesc) op.getConf();
          VectorSparkHashTableSinkDesc vectorSparkHashTableSinkDesc = new VectorSparkHashTableSinkDesc();
          sparkHashTableSinkDesc.setVectorDesc(vectorSparkHashTableSinkDesc);
          vectorOp = OperatorFactory.getVectorOperator(
              op.getCompilationOpContext(), sparkHashTableSinkDesc, vContext);
          isNative = true;
        }
        break;
      case SPARKPRUNINGSINK:
        {
          SparkPartitionPruningSinkDesc sparkPartitionPruningSinkDesc =
              (SparkPartitionPruningSinkDesc) op.getConf();
          VectorSparkPartitionPruningSinkDesc vectorSparkPartitionPruningSinkDesc =
              new VectorSparkPartitionPruningSinkDesc();
          sparkPartitionPruningSinkDesc.setVectorDesc(vectorSparkPartitionPruningSinkDesc);
          vectorOp = OperatorFactory.getVectorOperator(
              op.getCompilationOpContext(), sparkPartitionPruningSinkDesc, vContext);
          isNative = true;
        }
        break;
      default:
        // These are children of GROUP BY operators with non-vector outputs.
        isNative = false;
        vectorOp = op;
        break;
    }
    Preconditions.checkState(vectorOp != null);
    if (vectorTaskColumnInfo != null && !isNative) {
      vectorTaskColumnInfo.setAllNative(false);
    }

    LOG.debug("vectorizeOperator " + vectorOp.getClass().getName());
    LOG.debug("vectorizeOperator " + vectorOp.getConf().getClass().getName());

    if (vectorOp != op) {
      fixupParentChildOperators(op, vectorOp);
      ((AbstractOperatorDesc) vectorOp.getConf()).setVectorMode(true);
    }
    return vectorOp;
  }

  private boolean isVirtualColumn(ColumnInfo column) {

    // Not using method column.getIsVirtualCol() because partitioning columns are also
    // treated as virtual columns in ColumnInfo.
    if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(column.getInternalName())) {
      return true;
    }
    return false;
  }

  public void debugDisplayAllMaps(BaseWork work) {

    VectorizedRowBatchCtx vectorizedRowBatchCtx = work.getVectorizedRowBatchCtx();

    String[] allColumnNames = vectorizedRowBatchCtx.getRowColumnNames();
    Object[] columnTypeInfos = vectorizedRowBatchCtx.getRowColumnTypeInfos();
    int partitionColumnCount = vectorizedRowBatchCtx.getPartitionColumnCount();
    String[] scratchColumnTypeNames = vectorizedRowBatchCtx.getScratchColumnTypeNames();

    LOG.debug("debugDisplayAllMaps allColumnNames " + Arrays.toString(allColumnNames));
    LOG.debug("debugDisplayAllMaps columnTypeInfos " + Arrays.deepToString(columnTypeInfos));
    LOG.debug("debugDisplayAllMaps partitionColumnCount " + partitionColumnCount);
    LOG.debug("debugDisplayAllMaps scratchColumnTypeNames " + Arrays.toString(scratchColumnTypeNames));
  }
}