/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.physical;
import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.UNIFORM;
import java.io.Serializable;
import java.lang.annotation.Annotation;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
import org.apache.calcite.util.Pair;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.*;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey;
import org.apache.hadoop.hive.ql.exec.spark.SparkTask;
import org.apache.hadoop.hive.ql.exec.tez.TezTask;
import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerBigOnlyStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinInnerStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinLeftSemiStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.mapjoin.VectorMapJoinOuterStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkEmptyKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkLongOperator;
import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkMultiKeyOperator;
import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkObjectHashOperator;
import org.apache.hadoop.hive.ql.exec.vector.reducesink.VectorReduceSinkStringOperator;
import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
import org.apache.hadoop.hive.ql.exec.vector.VectorColumnOutputMapping;
import org.apache.hadoop.hive.ql.exec.vector.VectorColumnSourceMapping;
import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorMapJoinOuterFilteredOperator;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.HiveVectorAdaptorUsageMode;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext.InConstantType;
import org.apache.hadoop.hive.ql.exec.vector.VectorizationContextRegion;
import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker;
import org.apache.hadoop.hive.ql.lib.PreOrderWalker;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.TaskGraphWalker;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.AppMasterEventDesc;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.Explain;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.LimitDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.plan.VectorAppMasterEventDesc;
import org.apache.hadoop.hive.ql.plan.VectorFileSinkDesc;
import org.apache.hadoop.hive.ql.plan.VectorFilterDesc;
import org.apache.hadoop.hive.ql.plan.VectorTableScanDesc;
import org.apache.hadoop.hive.ql.plan.VectorizationCondition;
import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc.ProcessingMode;
import org.apache.hadoop.hive.ql.plan.VectorSparkHashTableSinkDesc;
import org.apache.hadoop.hive.ql.plan.VectorSparkPartitionPruningSinkDesc;
import org.apache.hadoop.hive.ql.plan.VectorLimitDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinInfo;
import org.apache.hadoop.hive.ql.plan.VectorSMBJoinDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.ql.plan.SparkHashTableSinkDesc;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.hive.ql.plan.VectorGroupByDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableImplementationType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKeyType;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.HashTableKind;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc.OperatorVariation;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc.VectorDeserializeType;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkInfo;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.ql.plan.VectorSelectDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.udf.UDFAcos;
import org.apache.hadoop.hive.ql.udf.UDFAsin;
import org.apache.hadoop.hive.ql.udf.UDFAtan;
import org.apache.hadoop.hive.ql.udf.UDFBin;
import org.apache.hadoop.hive.ql.udf.UDFConv;
import org.apache.hadoop.hive.ql.udf.UDFCos;
import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth;
import org.apache.hadoop.hive.ql.udf.UDFDayOfWeek;
import org.apache.hadoop.hive.ql.udf.UDFDegrees;
import org.apache.hadoop.hive.ql.udf.UDFExp;
import org.apache.hadoop.hive.ql.udf.UDFFromUnixTime;
import org.apache.hadoop.hive.ql.udf.UDFHex;
import org.apache.hadoop.hive.ql.udf.UDFHour;
import org.apache.hadoop.hive.ql.udf.UDFLike;
import org.apache.hadoop.hive.ql.udf.UDFLn;
import org.apache.hadoop.hive.ql.udf.UDFLog;
import org.apache.hadoop.hive.ql.udf.UDFLog10;
import org.apache.hadoop.hive.ql.udf.UDFLog2;
import org.apache.hadoop.hive.ql.udf.UDFMinute;
import org.apache.hadoop.hive.ql.udf.UDFMonth;
import org.apache.hadoop.hive.ql.udf.UDFRadians;
import org.apache.hadoop.hive.ql.udf.UDFRand;
import org.apache.hadoop.hive.ql.udf.UDFRegExpExtract;
import org.apache.hadoop.hive.ql.udf.UDFRegExpReplace;
import org.apache.hadoop.hive.ql.udf.UDFSecond;
import org.apache.hadoop.hive.ql.udf.UDFSign;
import org.apache.hadoop.hive.ql.udf.UDFSin;
import org.apache.hadoop.hive.ql.udf.UDFSqrt;
import org.apache.hadoop.hive.ql.udf.UDFSubstr;
import org.apache.hadoop.hive.ql.udf.UDFTan;
import org.apache.hadoop.hive.ql.udf.UDFToBoolean;
import org.apache.hadoop.hive.ql.udf.UDFToByte;
import org.apache.hadoop.hive.ql.udf.UDFToDouble;
import org.apache.hadoop.hive.ql.udf.UDFToFloat;
import org.apache.hadoop.hive.ql.udf.UDFToInteger;
import org.apache.hadoop.hive.ql.udf.UDFToLong;
import org.apache.hadoop.hive.ql.udf.UDFToShort;
import org.apache.hadoop.hive.ql.udf.UDFToString;
import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear;
import org.apache.hadoop.hive.ql.udf.UDFYear;
import org.apache.hadoop.hive.ql.udf.generic.*;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.NullStructSerDe;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hive.common.util.AnnotationUtils;
import org.apache.hadoop.util.ReflectionUtils;
import com.google.common.base.Preconditions;
public class Vectorizer implements PhysicalPlanResolver {
protected static transient final Logger LOG = LoggerFactory.getLogger(Vectorizer.class);
static Pattern supportedDataTypesPattern;
static {
StringBuilder patternBuilder = new StringBuilder();
patternBuilder.append("int");
patternBuilder.append("|smallint");
patternBuilder.append("|tinyint");
patternBuilder.append("|bigint");
patternBuilder.append("|integer");
patternBuilder.append("|long");
patternBuilder.append("|short");
patternBuilder.append("|timestamp");
patternBuilder.append("|" + serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME);
patternBuilder.append("|" + serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME);
patternBuilder.append("|boolean");
patternBuilder.append("|binary");
patternBuilder.append("|string");
patternBuilder.append("|byte");
patternBuilder.append("|float");
patternBuilder.append("|double");
patternBuilder.append("|date");
patternBuilder.append("|void");
// Decimal types can be specified with different precision and scales e.g. decimal(10,5),
// as opposed to other data types which can be represented by constant strings.
// The regex matches only the "decimal" prefix of the type.
patternBuilder.append("|decimal.*");
// CHAR and VARCHAR types can be specified with maximum length.
patternBuilder.append("|char.*");
patternBuilder.append("|varchar.*");
supportedDataTypesPattern = Pattern.compile(patternBuilder.toString());
}
private Set<Class<?>> supportedGenericUDFs = new HashSet<Class<?>>();
private Set<String> supportedAggregationUdfs = new HashSet<String>();
private HiveConf hiveConf;
private boolean useVectorizedInputFileFormat;
private boolean useVectorDeserialize;
private boolean useRowDeserialize;
private boolean isReduceVectorizationEnabled;
private boolean isSchemaEvolution;
private HiveVectorAdaptorUsageMode hiveVectorAdaptorUsageMode;
private BaseWork currentBaseWork;
private Operator<? extends OperatorDesc> currentOperator;
public void testSetCurrentBaseWork(BaseWork testBaseWork) {
currentBaseWork = testBaseWork;
}
private void setNodeIssue(String issue) {
currentBaseWork.setNotVectorizedReason(
VectorizerReason.createNodeIssue(issue));
}
private void setOperatorIssue(String issue) {
currentBaseWork.setNotVectorizedReason(
VectorizerReason.createOperatorIssue(currentOperator, issue));
}
private void setExpressionIssue(String expressionTitle, String issue) {
currentBaseWork.setNotVectorizedReason(
VectorizerReason.createExpressionIssue(currentOperator, expressionTitle, issue));
}
private void clearNotVectorizedReason() {
currentBaseWork.setNotVectorizedReason(null);
}
private long vectorizedVertexNum = -1;
public Vectorizer() {
/*
* We check UDFs against the supportedGenericUDFs when
* hive.vectorized.adaptor.usage.mode=chosen or none.
*
* We allow all UDFs for hive.vectorized.adaptor.usage.mode=all.
*/
supportedGenericUDFs.add(GenericUDFOPPlus.class);
supportedGenericUDFs.add(GenericUDFOPMinus.class);
supportedGenericUDFs.add(GenericUDFOPMultiply.class);
supportedGenericUDFs.add(GenericUDFOPDivide.class);
supportedGenericUDFs.add(GenericUDFOPMod.class);
supportedGenericUDFs.add(GenericUDFOPNegative.class);
supportedGenericUDFs.add(GenericUDFOPPositive.class);
supportedGenericUDFs.add(GenericUDFOPEqualOrLessThan.class);
supportedGenericUDFs.add(GenericUDFOPEqualOrGreaterThan.class);
supportedGenericUDFs.add(GenericUDFOPGreaterThan.class);
supportedGenericUDFs.add(GenericUDFOPLessThan.class);
supportedGenericUDFs.add(GenericUDFOPNot.class);
supportedGenericUDFs.add(GenericUDFOPNotEqual.class);
supportedGenericUDFs.add(GenericUDFOPNotNull.class);
supportedGenericUDFs.add(GenericUDFOPNull.class);
supportedGenericUDFs.add(GenericUDFOPOr.class);
supportedGenericUDFs.add(GenericUDFOPAnd.class);
supportedGenericUDFs.add(GenericUDFOPEqual.class);
supportedGenericUDFs.add(GenericUDFLength.class);
supportedGenericUDFs.add(GenericUDFCharacterLength.class);
supportedGenericUDFs.add(GenericUDFOctetLength.class);
supportedGenericUDFs.add(UDFYear.class);
supportedGenericUDFs.add(UDFMonth.class);
supportedGenericUDFs.add(UDFDayOfMonth.class);
supportedGenericUDFs.add(UDFDayOfWeek.class);
supportedGenericUDFs.add(UDFHour.class);
supportedGenericUDFs.add(UDFMinute.class);
supportedGenericUDFs.add(UDFSecond.class);
supportedGenericUDFs.add(UDFWeekOfYear.class);
supportedGenericUDFs.add(GenericUDFToUnixTimeStamp.class);
supportedGenericUDFs.add(UDFFromUnixTime.class);
supportedGenericUDFs.add(GenericUDFDateAdd.class);
supportedGenericUDFs.add(GenericUDFDateSub.class);
supportedGenericUDFs.add(GenericUDFDate.class);
supportedGenericUDFs.add(GenericUDFDateDiff.class);
supportedGenericUDFs.add(UDFLike.class);
supportedGenericUDFs.add(GenericUDFRegExp.class);
supportedGenericUDFs.add(UDFRegExpExtract.class);
supportedGenericUDFs.add(UDFRegExpReplace.class);
supportedGenericUDFs.add(UDFSubstr.class);
supportedGenericUDFs.add(GenericUDFLTrim.class);
supportedGenericUDFs.add(GenericUDFRTrim.class);
supportedGenericUDFs.add(GenericUDFTrim.class);
supportedGenericUDFs.add(UDFSin.class);
supportedGenericUDFs.add(UDFCos.class);
supportedGenericUDFs.add(UDFTan.class);
supportedGenericUDFs.add(UDFAsin.class);
supportedGenericUDFs.add(UDFAcos.class);
supportedGenericUDFs.add(UDFAtan.class);
supportedGenericUDFs.add(UDFDegrees.class);
supportedGenericUDFs.add(UDFRadians.class);
supportedGenericUDFs.add(GenericUDFFloor.class);
supportedGenericUDFs.add(GenericUDFCeil.class);
supportedGenericUDFs.add(UDFExp.class);
supportedGenericUDFs.add(UDFLn.class);
supportedGenericUDFs.add(UDFLog2.class);
supportedGenericUDFs.add(UDFLog10.class);
supportedGenericUDFs.add(UDFLog.class);
supportedGenericUDFs.add(GenericUDFPower.class);
supportedGenericUDFs.add(GenericUDFRound.class);
supportedGenericUDFs.add(GenericUDFBRound.class);
supportedGenericUDFs.add(GenericUDFPosMod.class);
supportedGenericUDFs.add(UDFSqrt.class);
supportedGenericUDFs.add(UDFSign.class);
supportedGenericUDFs.add(UDFRand.class);
supportedGenericUDFs.add(UDFBin.class);
supportedGenericUDFs.add(UDFHex.class);
supportedGenericUDFs.add(UDFConv.class);
supportedGenericUDFs.add(GenericUDFLower.class);
supportedGenericUDFs.add(GenericUDFUpper.class);
supportedGenericUDFs.add(GenericUDFConcat.class);
supportedGenericUDFs.add(GenericUDFAbs.class);
supportedGenericUDFs.add(GenericUDFBetween.class);
supportedGenericUDFs.add(GenericUDFIn.class);
supportedGenericUDFs.add(GenericUDFCase.class);
supportedGenericUDFs.add(GenericUDFWhen.class);
supportedGenericUDFs.add(GenericUDFCoalesce.class);
supportedGenericUDFs.add(GenericUDFNvl.class);
supportedGenericUDFs.add(GenericUDFElt.class);
supportedGenericUDFs.add(GenericUDFInitCap.class);
supportedGenericUDFs.add(GenericUDFInBloomFilter.class);
// For type casts
supportedGenericUDFs.add(UDFToLong.class);
supportedGenericUDFs.add(UDFToInteger.class);
supportedGenericUDFs.add(UDFToShort.class);
supportedGenericUDFs.add(UDFToByte.class);
supportedGenericUDFs.add(UDFToBoolean.class);
supportedGenericUDFs.add(UDFToFloat.class);
supportedGenericUDFs.add(UDFToDouble.class);
supportedGenericUDFs.add(UDFToString.class);
supportedGenericUDFs.add(GenericUDFTimestamp.class);
supportedGenericUDFs.add(GenericUDFToDecimal.class);
supportedGenericUDFs.add(GenericUDFToDate.class);
supportedGenericUDFs.add(GenericUDFToChar.class);
supportedGenericUDFs.add(GenericUDFToVarchar.class);
supportedGenericUDFs.add(GenericUDFToIntervalYearMonth.class);
supportedGenericUDFs.add(GenericUDFToIntervalDayTime.class);
// For conditional expressions
supportedGenericUDFs.add(GenericUDFIf.class);
supportedAggregationUdfs.add("min");
supportedAggregationUdfs.add("max");
supportedAggregationUdfs.add("count");
supportedAggregationUdfs.add("sum");
supportedAggregationUdfs.add("avg");
supportedAggregationUdfs.add("variance");
supportedAggregationUdfs.add("var_pop");
supportedAggregationUdfs.add("var_samp");
supportedAggregationUdfs.add("std");
supportedAggregationUdfs.add("stddev");
supportedAggregationUdfs.add("stddev_pop");
supportedAggregationUdfs.add("stddev_samp");
supportedAggregationUdfs.add("bloom_filter");
}
private class VectorTaskColumnInfo {
List<String> allColumnNames;
List<TypeInfo> allTypeInfos;
List<Integer> dataColumnNums;
int partitionColumnCount;
boolean useVectorizedInputFileFormat;
boolean groupByVectorOutput;
boolean allNative;
boolean usesVectorUDFAdaptor;
String[] scratchTypeNameArray;
Set<Operator<? extends OperatorDesc>> nonVectorizedOps;
String reduceColumnSortOrder;
String reduceColumnNullOrder;
VectorTaskColumnInfo() {
partitionColumnCount = 0;
}
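// Optimistic defaults; they are refined as the operators of the task are examined and vectorized.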
public void assume() {
groupByVectorOutput = true;
allNative = true;
usesVectorUDFAdaptor = false;
}
public void setAllColumnNames(List<String> allColumnNames) {
this.allColumnNames = allColumnNames;
}
public void setAllTypeInfos(List<TypeInfo> allTypeInfos) {
this.allTypeInfos = allTypeInfos;
}
public void setDataColumnNums(List<Integer> dataColumnNums) {
this.dataColumnNums = dataColumnNums;
}
public void setPartitionColumnCount(int partitionColumnCount) {
this.partitionColumnCount = partitionColumnCount;
}
public void setScratchTypeNameArray(String[] scratchTypeNameArray) {
this.scratchTypeNameArray = scratchTypeNameArray;
}
public void setGroupByVectorOutput(boolean groupByVectorOutput) {
this.groupByVectorOutput = groupByVectorOutput;
}
public void setAllNative(boolean allNative) {
this.allNative = allNative;
}
public void setUsesVectorUDFAdaptor(boolean usesVectorUDFAdaptor) {
this.usesVectorUDFAdaptor = usesVectorUDFAdaptor;
}
public void setUseVectorizedInputFileFormat(boolean useVectorizedInputFileFormat) {
this.useVectorizedInputFileFormat = useVectorizedInputFileFormat;
}
public void setNonVectorizedOps(Set<Operator<? extends OperatorDesc>> nonVectorizedOps) {
this.nonVectorizedOps = nonVectorizedOps;
}
public Set<Operator<? extends OperatorDesc>> getNonVectorizedOps() {
return nonVectorizedOps;
}
public void setReduceColumnSortOrder(String reduceColumnSortOrder) {
this.reduceColumnSortOrder = reduceColumnSortOrder;
}
public void setReduceColumnNullOrder(String reduceColumnNullOrder) {
this.reduceColumnNullOrder = reduceColumnNullOrder;
}
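// Copies the accumulated column names/types and vectorization flags into the BaseWork (including its
// VectorizedRowBatchCtx) so they are available when the task executes.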
public void transferToBaseWork(BaseWork baseWork) {
String[] allColumnNameArray = allColumnNames.toArray(new String[0]);
TypeInfo[] allTypeInfoArray = allTypeInfos.toArray(new TypeInfo[0]);
int[] dataColumnNumsArray;
if (dataColumnNums != null) {
dataColumnNumsArray = ArrayUtils.toPrimitive(dataColumnNums.toArray(new Integer[0]));
} else {
dataColumnNumsArray = null;
}
VectorizedRowBatchCtx vectorizedRowBatchCtx =
new VectorizedRowBatchCtx(
allColumnNameArray,
allTypeInfoArray,
dataColumnNumsArray,
partitionColumnCount,
scratchTypeNameArray);
baseWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx);
if (baseWork instanceof MapWork) {
MapWork mapWork = (MapWork) baseWork;
mapWork.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat);
}
if (baseWork instanceof ReduceWork) {
ReduceWork reduceWork = (ReduceWork) baseWork;
reduceWork.setVectorReduceColumnSortOrder(reduceColumnSortOrder);
reduceWork.setVectorReduceColumnNullOrder(reduceColumnNullOrder);
}
baseWork.setAllNative(allNative);
baseWork.setGroupByVectorOutput(groupByVectorOutput);
baseWork.setUsesVectorUDFAdaptor(usesVectorUDFAdaptor);
}
}
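// Dispatched for each Task in the plan: for MapReduce only the MapWork is considered for vectorization,
// while for Tez and Spark the ReduceWork may also be vectorized when reduce-side vectorization is enabled.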
class VectorizationDispatcher implements Dispatcher {
@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
throws SemanticException {
Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd;
if (currTask instanceof MapRedTask) {
MapredWork mapredWork = ((MapRedTask) currTask).getWork();
convertMapWork(mapredWork.getMapWork(), false);
ReduceWork reduceWork = mapredWork.getReduceWork();
if (reduceWork != null) {
// Always set the EXPLAIN conditions.
setReduceWorkExplainConditions(reduceWork);
// We do not vectorize MR Reduce.
}
} else if (currTask instanceof TezTask) {
TezWork work = ((TezTask) currTask).getWork();
for (BaseWork baseWork: work.getAllWork()) {
if (baseWork instanceof MapWork) {
convertMapWork((MapWork) baseWork, true);
} else if (baseWork instanceof ReduceWork) {
ReduceWork reduceWork = (ReduceWork) baseWork;
// Always set the EXPLAIN conditions.
setReduceWorkExplainConditions(reduceWork);
// We are only vectorizing Reduce under Tez/Spark.
if (isReduceVectorizationEnabled) {
convertReduceWork(reduceWork);
}
}
}
} else if (currTask instanceof SparkTask) {
SparkWork sparkWork = (SparkWork) currTask.getWork();
for (BaseWork baseWork : sparkWork.getAllWork()) {
if (baseWork instanceof MapWork) {
convertMapWork((MapWork) baseWork, true);
} else if (baseWork instanceof ReduceWork) {
ReduceWork reduceWork = (ReduceWork) baseWork;
// Always set the EXPLAIN conditions.
setReduceWorkExplainConditions(reduceWork);
if (isReduceVectorizationEnabled) {
convertReduceWork(reduceWork);
}
}
}
}
return null;
}
private void convertMapWork(MapWork mapWork, boolean isTezOrSpark) throws SemanticException {
mapWork.setVectorizationExamined(true);
// Global used when setting errors, etc.
currentBaseWork = mapWork;
VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo();
vectorTaskColumnInfo.assume();
mapWork.setVectorizedVertexNum(++vectorizedVertexNum);
boolean ret;
try {
ret = validateMapWork(mapWork, vectorTaskColumnInfo, isTezOrSpark);
} catch (Exception e) {
String issue = "exception: " + VectorizationContext.getStackTraceAsSingleLine(e);
setNodeIssue(issue);
ret = false;
}
if (ret) {
vectorizeMapWork(mapWork, vectorTaskColumnInfo, isTezOrSpark);
} else if (currentBaseWork.getVectorizationEnabled()) {
VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason();
if (notVectorizedReason == null) {
LOG.info("Cannot vectorize: unknown");
} else {
LOG.info("Cannot vectorize: " + notVectorizedReason.toString());
}
clearMapWorkVectorDescs(mapWork);
}
}
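// The map-side rules match operator paths from a TableScanOperator down to a FileSinkOperator (R1)
// or a ReduceSinkOperator (R2); the supplied NodeProcessor is invoked for nodes on matching paths.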
private void addMapWorkRules(Map<Rule, NodeProcessor> opRules, NodeProcessor np) {
opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + ".*"
+ FileSinkOperator.getOperatorName()), np);
opRules.put(new RuleRegExp("R2", TableScanOperator.getOperatorName() + ".*"
+ ReduceSinkOperator.getOperatorName()), np);
}
/*
* Determine if there is only one TableScanOperator. Currently in Map vectorization, we do not
* try to vectorize multiple input trees.
*/
private ImmutablePair<String, TableScanOperator> verifyOnlyOneTableScanOperator(MapWork mapWork) {
// Eliminate MR plans with more than one TableScanOperator.
LinkedHashMap<String, Operator<? extends OperatorDesc>> aliasToWork = mapWork.getAliasToWork();
if ((aliasToWork == null) || (aliasToWork.size() == 0)) {
setNodeIssue("Vectorized map work requires work");
return null;
}
int tableScanCount = 0;
String alias = "";
TableScanOperator tableScanOperator = null;
for (Entry<String, Operator<? extends OperatorDesc>> entry : aliasToWork.entrySet()) {
Operator<?> op = entry.getValue();
if (op == null) {
setNodeIssue("Vectorized map work requires a valid alias");
return null;
}
if (op instanceof TableScanOperator) {
tableScanCount++;
alias = entry.getKey();
tableScanOperator = (TableScanOperator) op;
}
}
if (tableScanCount > 1) {
setNodeIssue("Vectorized map work only works with 1 TableScanOperator");
return null;
}
return new ImmutablePair<String, TableScanOperator>(alias, tableScanOperator);
}
private void getTableScanOperatorSchemaInfo(TableScanOperator tableScanOperator,
List<String> logicalColumnNameList, List<TypeInfo> logicalTypeInfoList) {
// Add all non-virtual columns to make a vectorization context for
// the TableScan operator.
RowSchema rowSchema = tableScanOperator.getSchema();
for (ColumnInfo c : rowSchema.getSignature()) {
// Validation will later exclude vectorization of virtual columns usage (HIVE-5560).
if (!isVirtualColumn(c)) {
String columnName = c.getInternalName();
String typeName = c.getTypeName();
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName);
logicalColumnNameList.add(columnName);
logicalTypeInfoList.add(typeInfo);
}
}
}
private void determineDataColumnNums(TableScanOperator tableScanOperator,
List<String> allColumnNameList, int dataColumnCount, List<Integer> dataColumnNums) {
/*
* The TableScanOperator's needed columns are just the data columns.
*/
Set<String> neededColumns = new HashSet<String>(tableScanOperator.getNeededColumns());
for (int dataColumnNum = 0; dataColumnNum < dataColumnCount; dataColumnNum++) {
String columnName = allColumnNameList.get(dataColumnNum);
if (neededColumns.contains(columnName)) {
dataColumnNums.add(dataColumnNum);
}
}
}
/*
* There are 3 modes of reading for vectorization:
*
* 1) One for the Vectorized Input File Format which returns VectorizedRowBatch as the row.
*
* 2) One for using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
* Currently, these Input File Formats:
* TEXTFILE
* SEQUENCEFILE
*
* 3) And one using the regular partition deserializer to get the row object and assigning
* the row object into the VectorizedRowBatch with VectorAssignRow.
* This picks up Input File Format not supported by the other two.
*/
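// Which mode is chosen is gated, in order, by the useVectorizedInputFileFormat, useVectorDeserialize,
// and useRowDeserialize flags (see verifyAndSetVectorPartDesc below).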
private boolean verifyAndSetVectorPartDesc(PartitionDesc pd, boolean isAcidTable,
HashSet<String> inputFileFormatClassNameSet, HashSet<String> enabledConditionsMetSet,
ArrayList<String> enabledConditionsNotMetList) {
String inputFileFormatClassName = pd.getInputFileFormatClassName();
// Always collect input file formats.
inputFileFormatClassNameSet.add(inputFileFormatClassName);
boolean isInputFileFormatVectorized = Utilities.isInputFileFormatVectorized(pd);
if (isAcidTable) {
// Today, ACID tables are only ORC and that format is vectorizable. Verify these
// assumptions.
Preconditions.checkState(isInputFileFormatVectorized);
Preconditions.checkState(inputFileFormatClassName.equals(OrcInputFormat.class.getName()));
if (!useVectorizedInputFileFormat) {
enabledConditionsNotMetList.add(
"Vectorizing ACID tables requires " + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
return false;
}
pd.setVectorPartitionDesc(
VectorPartitionDesc.createVectorizedInputFileFormat(
inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)));
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
return true;
}
// Look for Pass-Thru case where InputFileFormat has VectorizedInputFormatInterface
// and reads VectorizedRowBatch as a "row".
if (useVectorizedInputFileFormat) {
if (isInputFileFormatVectorized) {
pd.setVectorPartitionDesc(
VectorPartitionDesc.createVectorizedInputFileFormat(
inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)));
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
return true;
}
// Fall through and look for other options...
}
if (!isSchemaEvolution) {
enabledConditionsNotMetList.add(
"Vectorizing tables without Schema Evolution requires " + HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
return false;
}
String deserializerClassName = pd.getDeserializerClassName();
// Look for InputFileFormat / Serde combinations we can deserialize more efficiently
// using VectorDeserializeRow and a deserialize class with the DeserializeRead interface.
//
// Do the "vectorized" row-by-row deserialization into a VectorizedRowBatch in the
// VectorMapOperator.
boolean isTextFormat = inputFileFormatClassName.equals(TextInputFormat.class.getName()) &&
deserializerClassName.equals(LazySimpleSerDe.class.getName());
boolean isSequenceFormat =
inputFileFormatClassName.equals(SequenceFileInputFormat.class.getName()) &&
deserializerClassName.equals(LazyBinarySerDe.class.getName());
boolean isVectorDeserializeEligible = isTextFormat || isSequenceFormat;
if (useVectorDeserialize) {
// Currently, we support LazySimple deserialization:
//
// org.apache.hadoop.mapred.TextInputFormat
// org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
//
// AND
//
// org.apache.hadoop.mapred.SequenceFileInputFormat
// org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
if (isTextFormat) {
Properties properties = pd.getTableDesc().getProperties();
String lastColumnTakesRestString =
properties.getProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST);
boolean lastColumnTakesRest =
(lastColumnTakesRestString != null &&
lastColumnTakesRestString.equalsIgnoreCase("true"));
if (lastColumnTakesRest) {
// If row mode will not catch this input file format, then not enabled.
if (useRowDeserialize) {
enabledConditionsNotMetList.add(
inputFileFormatClassName + " " +
serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST + " must be disabled ");
return false;
}
} else {
pd.setVectorPartitionDesc(
VectorPartitionDesc.createVectorDeserialize(
inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE));
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
return true;
}
} else if (isSequenceFormat) {
pd.setVectorPartitionDesc(
VectorPartitionDesc.createVectorDeserialize(
inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY));
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
return true;
}
// Fall through and look for other options...
}
// Otherwise, if enabled, deserialize rows using regular Serde and add the object
// inspect-able Object[] row to a VectorizedRowBatch in the VectorMapOperator.
if (useRowDeserialize) {
pd.setVectorPartitionDesc(
VectorPartitionDesc.createRowDeserialize(
inputFileFormatClassName,
Utilities.isInputFileFormatSelfDescribing(pd),
deserializerClassName));
enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname);
return true;
}
if (isInputFileFormatVectorized) {
Preconditions.checkState(!useVectorizedInputFileFormat);
enabledConditionsNotMetList.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname);
} else {
// Only offer these when the input file format is not one of the fast vectorized formats.
if (isVectorDeserializeEligible) {
Preconditions.checkState(!useVectorDeserialize);
enabledConditionsNotMetList.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname);
} else {
// Row-mode deserialization accepts any input format.
enabledConditionsNotMetList.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname);
}
}
return false;
}
private ImmutablePair<Boolean, Boolean> validateInputFormatAndSchemaEvolution(MapWork mapWork, String alias,
TableScanOperator tableScanOperator, VectorTaskColumnInfo vectorTaskColumnInfo)
throws SemanticException {
boolean isAcidTable = tableScanOperator.getConf().isAcidTable();
// These names/types are the data columns plus partition columns.
final List<String> allColumnNameList = new ArrayList<String>();
final List<TypeInfo> allTypeInfoList = new ArrayList<TypeInfo>();
getTableScanOperatorSchemaInfo(tableScanOperator, allColumnNameList, allTypeInfoList);
final List<Integer> dataColumnNums = new ArrayList<Integer>();
final int allColumnCount = allColumnNameList.size();
/*
* Validate input formats of all the partitions can be vectorized.
*/
boolean isFirst = true;
int dataColumnCount = 0;
int partitionColumnCount = 0;
List<String> tableDataColumnList = null;
List<TypeInfo> tableDataTypeInfoList = null;
LinkedHashMap<Path, ArrayList<String>> pathToAliases = mapWork.getPathToAliases();
LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = mapWork.getPathToPartitionInfo();
// Remember the input file formats we validated and why.
HashSet<String> inputFileFormatClassNameSet = new HashSet<String>();
HashSet<String> enabledConditionsMetSet = new HashSet<String>();
ArrayList<String> enabledConditionsNotMetList = new ArrayList<String>();
for (Entry<Path, ArrayList<String>> entry: pathToAliases.entrySet()) {
Path path = entry.getKey();
List<String> aliases = entry.getValue();
boolean isPresent = (aliases != null && aliases.indexOf(alias) != -1);
if (!isPresent) {
setOperatorIssue("Alias " + alias + " not present in aliases " + aliases);
return new ImmutablePair<Boolean,Boolean>(false, false);
}
PartitionDesc partDesc = pathToPartitionInfo.get(path);
if (partDesc.getVectorPartitionDesc() != null) {
// We've seen this already.
continue;
}
if (!verifyAndSetVectorPartDesc(partDesc, isAcidTable, inputFileFormatClassNameSet,
enabledConditionsMetSet, enabledConditionsNotMetList)) {
// Always set these so EXPLAIN can see.
mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet);
mapWork.setVectorizationEnabledConditionsMet(new ArrayList<String>(enabledConditionsMetSet));
mapWork.setVectorizationEnabledConditionsNotMet(enabledConditionsNotMetList);
// We consider this an enablement issue, not a not-vectorized issue.
LOG.info("Cannot enable vectorization because input file format(s) " + inputFileFormatClassNameSet +
" do not meet the conditions " + VectorizationCondition.addBooleans(enabledConditionsNotMetList, false));
return new ImmutablePair<Boolean,Boolean>(false, true);
}
VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
if (isFirst) {
// Determine the data and partition columns using the first partition descriptor.
LinkedHashMap<String, String> partSpec = partDesc.getPartSpec();
if (partSpec != null && partSpec.size() > 0) {
partitionColumnCount = partSpec.size();
dataColumnCount = allColumnCount - partitionColumnCount;
} else {
partitionColumnCount = 0;
dataColumnCount = allColumnCount;
}
determineDataColumnNums(tableScanOperator, allColumnNameList, dataColumnCount,
dataColumnNums);
tableDataColumnList = allColumnNameList.subList(0, dataColumnCount);
tableDataTypeInfoList = allTypeInfoList.subList(0, dataColumnCount);
isFirst = false;
}
// We need to get the partition's column names from the partition serde.
// (e.g. Avro provides the table schema and ignores the partition schema).
//
Deserializer deserializer;
StructObjectInspector partObjectInspector;
try {
deserializer = partDesc.getDeserializer(hiveConf);
partObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
} catch (Exception e) {
throw new SemanticException(e);
}
String nextDataColumnsString = ObjectInspectorUtils.getFieldNames(partObjectInspector);
String[] nextDataColumns = nextDataColumnsString.split(",");
List<String> nextDataColumnList = Arrays.asList(nextDataColumns);
/*
* Validate the column names that are present are the same. Missing columns will be
* implicitly defaulted to null.
*/
if (nextDataColumnList.size() > tableDataColumnList.size()) {
setOperatorIssue(
String.format(
"Could not vectorize partition %s " +
"(deserializer " + deserializer.getClass().getName() + ")" +
"The partition column names %d is greater than the number of table columns %d",
path, nextDataColumnList.size(), tableDataColumnList.size()));
return new ImmutablePair<Boolean,Boolean>(false, false);
}
if (!(deserializer instanceof NullStructSerDe)) {
// (Don't insist that NullStructSerDe produces correct column names.)
for (int i = 0; i < nextDataColumnList.size(); i++) {
String nextColumnName = nextDataColumnList.get(i);
String tableColumnName = tableDataColumnList.get(i);
if (!nextColumnName.equals(tableColumnName)) {
setOperatorIssue(
String.format(
"Could not vectorize partition %s " +
"(deserializer " + deserializer.getClass().getName() + ")" +
"The partition column name %s is does not match table column name %s",
path, nextColumnName, tableColumnName));
return new ImmutablePair<Boolean,Boolean>(false, false);
}
}
}
List<TypeInfo> nextDataTypeInfoList;
if (vectorPartDesc.getIsInputFileFormatSelfDescribing()) {
/*
* Self-Describing Input Format will convert its data to the table schema.
*/
nextDataTypeInfoList = tableDataTypeInfoList;
} else {
String nextDataTypesString = ObjectInspectorUtils.getFieldTypes(partObjectInspector);
// We convert to a list of TypeInfo objects using a library routine since it parses the information
// and can handle the use of different separators, etc. We cannot use the raw type string
// for comparison because of the different separators that may be used.
nextDataTypeInfoList =
TypeInfoUtils.getTypeInfosFromTypeString(nextDataTypesString);
}
vectorPartDesc.setDataTypeInfos(nextDataTypeInfoList);
}
vectorTaskColumnInfo.setAllColumnNames(allColumnNameList);
vectorTaskColumnInfo.setAllTypeInfos(allTypeInfoList);
vectorTaskColumnInfo.setDataColumnNums(dataColumnNums);
vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount);
vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat);
// Always set these so EXPLAIN can see.
mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet);
mapWork.setVectorizationEnabledConditionsMet(new ArrayList<String>(enabledConditionsMetSet));
mapWork.setVectorizationEnabledConditionsNotMet(enabledConditionsNotMetList);
return new ImmutablePair<Boolean,Boolean>(true, false);
}
private boolean validateMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo, boolean isTezOrSpark)
throws SemanticException {
LOG.info("Validating MapWork...");
ImmutablePair<String,TableScanOperator> onlyOneTableScanPair = verifyOnlyOneTableScanOperator(mapWork);
if (onlyOneTableScanPair == null) {
VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason();
Preconditions.checkState(notVectorizedReason != null);
mapWork.setVectorizationEnabledConditionsNotMet(Arrays.asList(new String[] {notVectorizedReason.toString()}));
return false;
}
String alias = onlyOneTableScanPair.left;
TableScanOperator tableScanOperator = onlyOneTableScanPair.right;
// This call fills in the column names, types, and partition column count in
// vectorTaskColumnInfo.
currentOperator = tableScanOperator;
ImmutablePair<Boolean, Boolean> validateInputFormatAndSchemaEvolutionPair =
validateInputFormatAndSchemaEvolution(mapWork, alias, tableScanOperator, vectorTaskColumnInfo);
if (!validateInputFormatAndSchemaEvolutionPair.left) {
// Have we already set the enabled conditions not met?
if (!validateInputFormatAndSchemaEvolutionPair.right) {
VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason();
Preconditions.checkState(notVectorizedReason != null);
mapWork.setVectorizationEnabledConditionsNotMet(Arrays.asList(new String[] {notVectorizedReason.toString()}));
}
return false;
}
// Now we are enabled and any issues found from here on out are considered
// not vectorized issues.
mapWork.setVectorizationEnabled(true);
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
MapWorkValidationNodeProcessor vnp = new MapWorkValidationNodeProcessor(mapWork, isTezOrSpark);
addMapWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new DefaultGraphWalker(disp);
// iterate over the mapper operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(mapWork.getAliasToWork().values());
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
for (Node n : nodeOutput.keySet()) {
if (nodeOutput.get(n) != null) {
if (!((Boolean)nodeOutput.get(n)).booleanValue()) {
return false;
}
}
}
vectorTaskColumnInfo.setNonVectorizedOps(vnp.getNonVectorizedOps());
return true;
}
private void vectorizeMapWork(MapWork mapWork, VectorTaskColumnInfo vectorTaskColumnInfo,
boolean isTezOrSpark) throws SemanticException {
LOG.info("Vectorizing MapWork...");
mapWork.setVectorMode(true);
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
MapWorkVectorizationNodeProcessor vnp =
new MapWorkVectorizationNodeProcessor(mapWork, isTezOrSpark, vectorTaskColumnInfo);
addMapWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new PreOrderOnceWalker(disp);
// iterate over the mapper operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(mapWork.getAliasToWork().values());
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames());
vectorTaskColumnInfo.transferToBaseWork(mapWork);
if (LOG.isDebugEnabled()) {
debugDisplayAllMaps(mapWork);
}
return;
}
private void setReduceWorkExplainConditions(ReduceWork reduceWork) {
reduceWork.setVectorizationExamined(true);
reduceWork.setReduceVectorizationEnabled(isReduceVectorizationEnabled);
reduceWork.setVectorReduceEngine(
HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE));
}
private void convertReduceWork(ReduceWork reduceWork) throws SemanticException {
// Global used when setting errors, etc.
currentBaseWork = reduceWork;
currentBaseWork.setVectorizationEnabled(true);
VectorTaskColumnInfo vectorTaskColumnInfo = new VectorTaskColumnInfo();
vectorTaskColumnInfo.assume();
reduceWork.setVectorizedVertexNum(++vectorizedVertexNum);
boolean ret;
try {
ret = validateReduceWork(reduceWork, vectorTaskColumnInfo);
} catch (Exception e) {
String issue = "exception: " + VectorizationContext.getStackTraceAsSingleLine(e);
setNodeIssue(issue);
ret = false;
}
if (ret) {
vectorizeReduceWork(reduceWork, vectorTaskColumnInfo);
} else if (currentBaseWork.getVectorizationEnabled()) {
VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason();
if (notVectorizedReason == null) {
LOG.info("Cannot vectorize: unknown");
} else {
LOG.info("Cannot vectorize: " + notVectorizedReason.toString());
}
clearReduceWorkVectorDescs(reduceWork);
}
}
private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork,
VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException {
ArrayList<String> reduceColumnNames = new ArrayList<String>();
ArrayList<TypeInfo> reduceTypeInfos = new ArrayList<TypeInfo>();
if (reduceWork.getNeedsTagging()) {
setNodeIssue("Tagging not supported");
return false;
}
String columnSortOrder;
String columnNullOrder;
try {
TableDesc keyTableDesc = reduceWork.getKeyDesc();
if (LOG.isDebugEnabled()) {
LOG.debug("Using reduce tag " + reduceWork.getTag());
}
TableDesc valueTableDesc = reduceWork.getTagToValueDesc().get(reduceWork.getTag());
Properties keyTableProperties = keyTableDesc.getProperties();
Deserializer keyDeserializer =
ReflectionUtils.newInstance(
keyTableDesc.getDeserializerClass(), null);
SerDeUtils.initializeSerDe(keyDeserializer, null, keyTableProperties, null);
ObjectInspector keyObjectInspector = keyDeserializer.getObjectInspector();
if (keyObjectInspector == null) {
setNodeIssue("Key object inspector null");
return false;
}
if (!(keyObjectInspector instanceof StructObjectInspector)) {
setNodeIssue("Key object inspector not StructObjectInspector");
return false;
}
StructObjectInspector keyStructObjectInspector = (StructObjectInspector) keyObjectInspector;
List<? extends StructField> keyFields = keyStructObjectInspector.getAllStructFieldRefs();
for (StructField field: keyFields) {
reduceColumnNames.add(Utilities.ReduceField.KEY.toString() + "." + field.getFieldName());
reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName()));
}
columnSortOrder = keyTableProperties.getProperty(serdeConstants.SERIALIZATION_SORT_ORDER);
columnNullOrder = keyTableProperties.getProperty(serdeConstants.SERIALIZATION_NULL_SORT_ORDER);
Deserializer valueDeserializer =
ReflectionUtils.newInstance(
valueTableDesc.getDeserializerClass(), null);
SerDeUtils.initializeSerDe(valueDeserializer, null, valueTableDesc.getProperties(), null);
ObjectInspector valueObjectInspector = valueDeserializer.getObjectInspector();
if (valueObjectInspector != null) {
if (!(valueObjectInspector instanceof StructObjectInspector)) {
setNodeIssue("Value object inspector not StructObjectInspector");
return false;
}
StructObjectInspector valueStructObjectInspector = (StructObjectInspector) valueObjectInspector;
List<? extends StructField> valueFields = valueStructObjectInspector.getAllStructFieldRefs();
for (StructField field: valueFields) {
reduceColumnNames.add(Utilities.ReduceField.VALUE.toString() + "." + field.getFieldName());
reduceTypeInfos.add(TypeInfoUtils.getTypeInfoFromTypeString(field.getFieldObjectInspector().getTypeName()));
}
}
} catch (Exception e) {
throw new SemanticException(e);
}
vectorTaskColumnInfo.setAllColumnNames(reduceColumnNames);
vectorTaskColumnInfo.setAllTypeInfos(reduceTypeInfos);
vectorTaskColumnInfo.setReduceColumnSortOrder(columnSortOrder);
vectorTaskColumnInfo.setReduceColumnNullOrder(columnNullOrder);
return true;
}
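// The reduce-side rules match operator paths rooted at a GroupByOperator (R1) or a SelectOperator (R2).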
private void addReduceWorkRules(Map<Rule, NodeProcessor> opRules, NodeProcessor np) {
opRules.put(new RuleRegExp("R1", GroupByOperator.getOperatorName() + ".*"), np);
opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + ".*"), np);
}
private boolean validateReduceWork(ReduceWork reduceWork,
VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException {
LOG.info("Validating ReduceWork...");
// Validate input to ReduceWork.
if (!getOnlyStructObjectInspectors(reduceWork, vectorTaskColumnInfo)) {
return false;
}
// Now check the reduce operator tree.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
ReduceWorkValidationNodeProcessor vnp = new ReduceWorkValidationNodeProcessor();
addReduceWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new DefaultGraphWalker(disp);
// iterate over the reduce operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.add(reduceWork.getReducer());
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
for (Node n : nodeOutput.keySet()) {
if (nodeOutput.get(n) != null) {
if (!((Boolean)nodeOutput.get(n)).booleanValue()) {
return false;
}
}
}
vectorTaskColumnInfo.setNonVectorizedOps(vnp.getNonVectorizedOps());
return true;
}
private void vectorizeReduceWork(ReduceWork reduceWork,
VectorTaskColumnInfo vectorTaskColumnInfo) throws SemanticException {
LOG.info("Vectorizing ReduceWork...");
reduceWork.setVectorMode(true);
// For some reason, the DefaultGraphWalker does not descend down from the reducer Operator as
// expected. We need to descend down, otherwise it breaks our algorithm that determines
// VectorizationContext, so we use PreOrderWalker instead of DefaultGraphWalker.
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
ReduceWorkVectorizationNodeProcessor vnp =
new ReduceWorkVectorizationNodeProcessor(vectorTaskColumnInfo);
addReduceWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new PreOrderWalker(disp);
// iterate over the reduce operator tree
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.add(reduceWork.getReducer());
LOG.info("vectorizeReduceWork reducer Operator: " +
reduceWork.getReducer().getName() + "...");
HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
ogw.startWalking(topNodes, nodeOutput);
// Necessary since we are vectorizing the root operator in reduce.
reduceWork.setReducer(vnp.getRootVectorOp());
vectorTaskColumnInfo.setScratchTypeNameArray(vnp.getVectorScratchColumnTypeNames());
vectorTaskColumnInfo.transferToBaseWork(reduceWork);
if (LOG.isDebugEnabled()) {
debugDisplayAllMaps(reduceWork);
}
}
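// Clears any VectorDesc previously attached to operator descriptors; used when a MapWork or ReduceWork
// turns out not to be vectorizable after all.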
class ClearVectorDescsNodeProcessor implements NodeProcessor {
public ClearVectorDescsNodeProcessor() {
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
for (Node n : stack) {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) n;
OperatorDesc desc = op.getConf();
if (desc instanceof AbstractOperatorDesc) {
AbstractOperatorDesc abstractDesc = (AbstractOperatorDesc) desc;
abstractDesc.setVectorDesc(null);
}
}
return null;
}
}
private void clearMapWorkVectorDescs(MapWork mapWork) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
ClearVectorDescsNodeProcessor vnp = new ClearVectorDescsNodeProcessor();
addMapWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new DefaultGraphWalker(disp);
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(mapWork.getAliasToWork().values());
ogw.startWalking(topNodes, null);
}
private void clearReduceWorkVectorDescs(ReduceWork reduceWork) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
ClearVectorDescsNodeProcessor vnp = new ClearVectorDescsNodeProcessor();
addReduceWorkRules(opRules, vnp);
Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
GraphWalker ogw = new DefaultGraphWalker(disp);
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.add(reduceWork.getReducer());
ogw.startWalking(topNodes, null);
}
}
class MapWorkValidationNodeProcessor implements NodeProcessor {
private final MapWork mapWork;
private final boolean isTezOrSpark;
// Children of Vectorized GROUPBY that outputs rows instead of vectorized row batches.
protected final Set<Operator<? extends OperatorDesc>> nonVectorizedOps =
new HashSet<Operator<? extends OperatorDesc>>();
public Set<Operator<? extends OperatorDesc>> getNonVectorizedOps() {
return nonVectorizedOps;
}
public MapWorkValidationNodeProcessor(MapWork mapWork, boolean isTezOrSpark) {
this.mapWork = mapWork;
this.isTezOrSpark = isTezOrSpark;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
for (Node n : stack) {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) n;
if (nonVectorizedOps.contains(op)) {
return new Boolean(true);
}
boolean ret;
currentOperator = op;
try {
ret = validateMapWorkOperator(op, mapWork, isTezOrSpark);
} catch (Exception e) {
throw new SemanticException(e);
}
if (!ret) {
return new Boolean(false);
}
// When Vectorized GROUPBY outputs rows instead of vectorized row batches, we don't
// vectorize the operators below it.
if (isVectorizedGroupByThatOutputsRows(op)) {
addOperatorChildrenToSet(op, nonVectorizedOps);
return new Boolean(true);
}
}
return new Boolean(true);
}
}
class ReduceWorkValidationNodeProcessor implements NodeProcessor {
// Children of Vectorized GROUPBY that outputs rows instead of vectorized row batches.
protected final Set<Operator<? extends OperatorDesc>> nonVectorizedOps =
new HashSet<Operator<? extends OperatorDesc>>();
public Set<Operator<? extends OperatorDesc>> getNonVectorizedOps() {
return nonVectorizedOps;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
for (Node n : stack) {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) n;
if (nonVectorizedOps.contains(op)) {
return new Boolean(true);
}
currentOperator = op;
boolean ret = validateReduceWorkOperator(op);
if (!ret) {
return new Boolean(false);
}
// When Vectorized GROUPBY outputs rows instead of vectorized row batches, we don't
// vectorize the operators below it.
if (isVectorizedGroupByThatOutputsRows(op)) {
addOperatorChildrenToSet(op, nonVectorizedOps);
return new Boolean(true);
}
}
return new Boolean(true);
}
}
// This class has common code used by both MapWorkVectorizationNodeProcessor and
// ReduceWorkVectorizationNodeProcessor.
class VectorizationNodeProcessor implements NodeProcessor {
// The vectorization context for the Map or Reduce task.
protected VectorizationContext taskVectorizationContext;
protected final VectorTaskColumnInfo vectorTaskColumnInfo;
protected final Set<Operator<? extends OperatorDesc>> nonVectorizedOps;
VectorizationNodeProcessor(VectorTaskColumnInfo vectorTaskColumnInfo,
Set<Operator<? extends OperatorDesc>> nonVectorizedOps) {
this.vectorTaskColumnInfo = vectorTaskColumnInfo;
this.nonVectorizedOps = nonVectorizedOps;
}
public String[] getVectorScratchColumnTypeNames() {
return taskVectorizationContext.getScratchColumnTypeNames();
}
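// Operators already vectorized, plus the mapping from each original operator to its vectorized
// replacement; used when walking the stack to find a parent's vectorization context.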
protected final Set<Operator<? extends OperatorDesc>> opsDone =
new HashSet<Operator<? extends OperatorDesc>>();
protected final Map<Operator<? extends OperatorDesc>, Operator<? extends OperatorDesc>> opToVectorOpMap =
new HashMap<Operator<? extends OperatorDesc>, Operator<? extends OperatorDesc>>();
public VectorizationContext walkStackToFindVectorizationContext(Stack<Node> stack,
Operator<? extends OperatorDesc> op) throws SemanticException {
VectorizationContext vContext = null;
if (stack.size() <= 1) {
throw new SemanticException(
String.format("Expected operator stack for operator %s to have at least 2 operators",
op.getName()));
}
// Walk down the stack of operators until we find one willing to give us a context.
// At the bottom will be the root operator, guaranteed to have a context
int i = stack.size() - 2;
while (vContext == null) {
if (i < 0) {
return null;
}
Operator<? extends OperatorDesc> opParent = (Operator<? extends OperatorDesc>) stack.get(i);
Operator<? extends OperatorDesc> vectorOpParent = opToVectorOpMap.get(opParent);
if (vectorOpParent != null) {
if (vectorOpParent instanceof VectorizationContextRegion) {
VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOpParent;
vContext = vcRegion.getOuputVectorizationContext();
LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " has new vectorization context " + vContext.toString());
} else {
LOG.info("walkStackToFindVectorizationContext " + vectorOpParent.getName() + " does not have new vectorization context");
}
} else {
LOG.info("walkStackToFindVectorizationContext " + opParent.getName() + " is not vectorized");
}
--i;
}
return vContext;
}
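/*
 * Illustrative sketch (hypothetical plan, not taken from this file): for an operator stack
 * such as TS -> FIL -> GBY -> SEL, a call for SEL starts at the GBY entry and walks toward
 * the root. If the vectorized GBY implements VectorizationContextRegion, its output context
 * is returned; otherwise the walk continues and the caller falls back to the task-level
 * context, roughly:
 *
 *   VectorizationContext vc = walkStackToFindVectorizationContext(stack, selectOp);
 *   if (vc == null) {
 *     vc = taskVectorizationContext;   // no operator "pushed" a new context
 *   }
 */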
public Operator<? extends OperatorDesc> doVectorize(Operator<? extends OperatorDesc> op,
VectorizationContext vContext, boolean isTezOrSpark) throws SemanticException {
Operator<? extends OperatorDesc> vectorOp = op;
try {
if (!opsDone.contains(op)) {
vectorOp = vectorizeOperator(op, vContext, isTezOrSpark, vectorTaskColumnInfo);
opsDone.add(op);
if (vectorOp != op) {
opToVectorOpMap.put(op, vectorOp);
opsDone.add(vectorOp);
}
}
} catch (HiveException e) {
throw new SemanticException(e);
}
return vectorOp;
}
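/*
 * Sketch of the caching behavior above (derived from the code, not a specification): an
 * operator is vectorized at most once. If vectorizeOperator returns a replacement, both the
 * original and the replacement are recorded in opsDone, and opToVectorOpMap lets
 * walkStackToFindVectorizationContext translate plan operators to their vectorized
 * counterparts, e.g.:
 *
 *   Operator<?> vSel = doVectorize(selectOp, vContext, true);
 *   Operator<?> again = doVectorize(selectOp, vContext, true);  // selectOp already in opsDone; returned unchanged
 */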
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
throw new SemanticException("Must be overridden");
}
}
class MapWorkVectorizationNodeProcessor extends VectorizationNodeProcessor {
private final VectorTaskColumnInfo vectorTaskColumnInfo;
private final boolean isTezOrSpark;
public MapWorkVectorizationNodeProcessor(MapWork mWork, boolean isTezOrSpark,
VectorTaskColumnInfo vectorTaskColumnInfo) {
super(vectorTaskColumnInfo, vectorTaskColumnInfo.getNonVectorizedOps());
this.vectorTaskColumnInfo = vectorTaskColumnInfo;
this.isTezOrSpark = isTezOrSpark;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
if (nonVectorizedOps.contains(op)) {
return null;
}
VectorizationContext vContext = null;
currentOperator = op;
if (op instanceof TableScanOperator) {
if (taskVectorizationContext == null) {
taskVectorizationContext = getVectorizationContext(op.getName(), vectorTaskColumnInfo);
if (LOG.isInfoEnabled()) {
LOG.info("MapWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " mapColumnNames " + vectorTaskColumnInfo.allColumnNames.toString());
LOG.info("MapWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " mapTypeInfos " + vectorTaskColumnInfo.allTypeInfos.toString());
}
}
vContext = taskVectorizationContext;
} else {
LOG.debug("MapWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName());
vContext = walkStackToFindVectorizationContext(stack, op);
if (vContext == null) {
// No operator has "pushed" a new context -- so use the task vectorization context.
vContext = taskVectorizationContext;
}
}
assert vContext != null;
if (LOG.isDebugEnabled()) {
LOG.debug("MapWorkVectorizationNodeProcessor process operator " + op.getName()
+ " using vectorization context" + vContext.toString());
}
Operator<? extends OperatorDesc> vectorOp = doVectorize(op, vContext, isTezOrSpark);
if (LOG.isDebugEnabled()) {
if (vectorOp instanceof VectorizationContextRegion) {
VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp;
VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext();
LOG.debug("Vectorized MapWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString());
}
}
return null;
}
}
class ReduceWorkVectorizationNodeProcessor extends VectorizationNodeProcessor {
private final VectorTaskColumnInfo vectorTaskColumnInfo;
private Operator<? extends OperatorDesc> rootVectorOp;
public Operator<? extends OperatorDesc> getRootVectorOp() {
return rootVectorOp;
}
public ReduceWorkVectorizationNodeProcessor(VectorTaskColumnInfo vectorTaskColumnInfo) {
super(vectorTaskColumnInfo, vectorTaskColumnInfo.getNonVectorizedOps());
this.vectorTaskColumnInfo = vectorTaskColumnInfo;
rootVectorOp = null;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
if (nonVectorizedOps.contains(op)) {
return null;
}
VectorizationContext vContext = null;
boolean saveRootVectorOp = false;
currentOperator = op;
if (op.getParentOperators().size() == 0) {
if (LOG.isInfoEnabled()) {
LOG.info("ReduceWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " reduceColumnNames " + vectorTaskColumnInfo.allColumnNames.toString());
LOG.info("ReduceWorkVectorizationNodeProcessor process vectorizedVertexNum " + vectorizedVertexNum + " reduceTypeInfos " + vectorTaskColumnInfo.allTypeInfos.toString());
}
vContext = new VectorizationContext("__Reduce_Shuffle__", vectorTaskColumnInfo.allColumnNames, hiveConf);
taskVectorizationContext = vContext;
saveRootVectorOp = true;
if (LOG.isDebugEnabled()) {
LOG.debug("Vectorized ReduceWork reduce shuffle vectorization context " + vContext.toString());
}
} else {
LOG.info("ReduceWorkVectorizationNodeProcessor process going to walk the operator stack to get vectorization context for " + op.getName());
vContext = walkStackToFindVectorizationContext(stack, op);
if (vContext == null) {
// If we didn't find a context among the operators, assume the top -- reduce shuffle's
// vectorization context.
vContext = taskVectorizationContext;
}
}
assert vContext != null;
LOG.info("ReduceWorkVectorizationNodeProcessor process operator " + op.getName() + " using vectorization context" + vContext.toString());
Operator<? extends OperatorDesc> vectorOp = doVectorize(op, vContext, true);
if (LOG.isDebugEnabled()) {
if (vectorOp instanceof VectorizationContextRegion) {
VectorizationContextRegion vcRegion = (VectorizationContextRegion) vectorOp;
VectorizationContext vNewContext = vcRegion.getOuputVectorizationContext();
LOG.debug("Vectorized ReduceWork operator " + vectorOp.getName() + " added vectorization context " + vNewContext.toString());
}
}
if (saveRootVectorOp && op != vectorOp) {
rootVectorOp = vectorOp;
}
return null;
}
}
private static class ValidatorVectorizationContext extends VectorizationContext {
private ValidatorVectorizationContext(HiveConf hiveConf) {
super("No Name", hiveConf);
}
@Override
protected int getInputColumnIndex(String name) {
return 0;
}
@Override
protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) {
return 0;
}
}
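/*
 * Note (illustrative, hedged): ValidatorVectorizationContext is only used to test whether an
 * expression *can* be vectorized. By mapping every column to index 0 it avoids needing real
 * column mappings, e.g.:
 *
 *   VectorizationContext vc = new ValidatorVectorizationContext(hiveConf);
 *   boolean ok = (vc.getVectorExpression(exprDesc, VectorExpressionDescriptor.Mode.PROJECTION) != null);
 *
 * The resulting vector expression is discarded; only success or failure matters here.
 */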
@Override
public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException {
hiveConf = physicalContext.getConf();
boolean vectorPath = HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED);
if (!vectorPath) {
LOG.info("Vectorization is disabled");
return physicalContext;
}
useVectorizedInputFileFormat =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT);
useVectorDeserialize =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE);
useRowDeserialize =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE);
// TODO: we could also vectorize some formats based on hive.llap.io.encode.formats if LLAP IO
// is enabled and we are going to run in LLAP. However, we don't know if we end up in
// LLAP or not at this stage, so don't do this now. We may need to add a 'force' option.
isReduceVectorizationEnabled =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_ENABLED);
isSchemaEvolution =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_SCHEMA_EVOLUTION);
hiveVectorAdaptorUsageMode = HiveVectorAdaptorUsageMode.getHiveConfValue(hiveConf);
// create dispatcher and graph walker
Dispatcher disp = new VectorizationDispatcher();
TaskGraphWalker ogw = new TaskGraphWalker(disp);
// get all the tasks nodes from root task
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(physicalContext.getRootTasks());
// begin to walk through the task tree.
ogw.startWalking(topNodes, null);
return physicalContext;
}
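/*
 * Configuration sketch (illustrative, not exhaustive): resolve() above is gated by the main
 * vectorization switch; the other flags it reads control input formats, reduce-side
 * vectorization and schema evolution. A session might enable the main switches with, for
 * example:
 *
 *   SET hive.vectorized.execution.enabled=true;
 *   SET hive.vectorized.execution.reduce.enabled=true;
 *
 * (These are the property names commonly behind the HiveConf.ConfVars constants referenced
 * above; consult HiveConf for the authoritative names and defaults.)
 */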
private void setOperatorNotSupported(Operator<? extends OperatorDesc> op) {
OperatorDesc desc = op.getConf();
Annotation note = AnnotationUtils.getAnnotation(desc.getClass(), Explain.class);
if (note != null) {
Explain explainNote = (Explain) note;
setNodeIssue(explainNote.displayName() + " (" + op.getType() + ") not supported");
} else {
setNodeIssue("Operator " + op.getType() + " not supported");
}
}
boolean validateMapWorkOperator(Operator<? extends OperatorDesc> op, MapWork mWork, boolean isTezOrSpark) {
boolean ret;
switch (op.getType()) {
case MAPJOIN:
if (op instanceof MapJoinOperator) {
ret = validateMapJoinOperator((MapJoinOperator) op);
} else if (op instanceof SMBMapJoinOperator) {
ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op);
} else {
setOperatorNotSupported(op);
ret = false;
}
break;
case GROUPBY:
ret = validateGroupByOperator((GroupByOperator) op, false, isTezOrSpark);
break;
case FILTER:
ret = validateFilterOperator((FilterOperator) op);
break;
case SELECT:
ret = validateSelectOperator((SelectOperator) op);
break;
case REDUCESINK:
ret = validateReduceSinkOperator((ReduceSinkOperator) op);
break;
case TABLESCAN:
ret = validateTableScanOperator((TableScanOperator) op, mWork);
break;
case FILESINK:
case LIMIT:
case EVENT:
case SPARKPRUNINGSINK:
ret = true;
break;
case HASHTABLESINK:
ret = op instanceof SparkHashTableSinkOperator &&
validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op);
break;
default:
setOperatorNotSupported(op);
ret = false;
break;
}
return ret;
}
boolean validateReduceWorkOperator(Operator<? extends OperatorDesc> op) {
boolean ret;
switch (op.getType()) {
case MAPJOIN:
// Does MAPJOIN actually get planned in Reduce?
if (op instanceof MapJoinOperator) {
ret = validateMapJoinOperator((MapJoinOperator) op);
} else if (op instanceof SMBMapJoinOperator) {
ret = validateSMBMapJoinOperator((SMBMapJoinOperator) op);
} else {
setOperatorNotSupported(op);
ret = false;
}
break;
case GROUPBY:
if (HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED)) {
ret = validateGroupByOperator((GroupByOperator) op, true, true);
} else {
setNodeIssue("Operator " + op.getType() + " not enabled (" + HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCE_GROUPBY_ENABLED.name() + "=true IS false)");
ret = false;
}
break;
case FILTER:
ret = validateFilterOperator((FilterOperator) op);
break;
case SELECT:
ret = validateSelectOperator((SelectOperator) op);
break;
case REDUCESINK:
ret = validateReduceSinkOperator((ReduceSinkOperator) op);
break;
case FILESINK:
ret = validateFileSinkOperator((FileSinkOperator) op);
break;
case LIMIT:
case EVENT:
case SPARKPRUNINGSINK:
ret = true;
break;
case HASHTABLESINK:
ret = op instanceof SparkHashTableSinkOperator &&
validateSparkHashTableSinkOperator((SparkHashTableSinkOperator) op);
break;
default:
setOperatorNotSupported(op);
ret = false;
break;
}
return ret;
}
private void addOperatorChildrenToSet(Operator<? extends OperatorDesc> op,
Set<Operator<? extends OperatorDesc>> nonVectorizedOps) {
for (Operator<? extends OperatorDesc> childOp : op.getChildOperators()) {
if (!nonVectorizedOps.contains(childOp)) {
nonVectorizedOps.add(childOp);
addOperatorChildrenToSet(childOp, nonVectorizedOps);
}
}
}
// When Vectorized GROUPBY outputs rows instead of vectorized row batches, we don't
// vectorize the operators below it.
private Boolean isVectorizedGroupByThatOutputsRows(Operator<? extends OperatorDesc> op)
throws SemanticException {
if (op.getType().equals(OperatorType.GROUPBY)) {
GroupByDesc desc = (GroupByDesc) op.getConf();
return !((VectorGroupByDesc) desc.getVectorDesc()).isVectorOutput();
}
return false;
}
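/*
 * Example (hedged): a reduce-side MERGEPARTIAL GROUP BY whose aggregate produces a
 * non-primitive output (see validateAggregationDesc below) gets isVectorOutput() == false.
 * The GROUP BY itself is still vectorized but emits plain rows, and addOperatorChildrenToSet
 * marks every downstream operator so the validation and vectorization walkers skip them.
 */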
private boolean validateSMBMapJoinOperator(SMBMapJoinOperator op) {
SMBJoinDesc desc = op.getConf();
// Validation is the same as for map join, since the 'small' tables are not vectorized
return validateMapJoinDesc(desc);
}
private boolean validateTableScanOperator(TableScanOperator op, MapWork mWork) {
TableScanDesc desc = op.getConf();
if (desc.isGatherStats()) {
setOperatorIssue("gather stats not supported");
return false;
}
return true;
}
private boolean validateMapJoinOperator(MapJoinOperator op) {
MapJoinDesc desc = op.getConf();
return validateMapJoinDesc(desc);
}
private boolean validateMapJoinDesc(MapJoinDesc desc) {
byte posBigTable = (byte) desc.getPosBigTable();
List<ExprNodeDesc> filterExprs = desc.getFilters().get(posBigTable);
if (!validateExprNodeDesc(filterExprs, "Filter", VectorExpressionDescriptor.Mode.FILTER)) {
return false;
}
List<ExprNodeDesc> keyExprs = desc.getKeys().get(posBigTable);
if (!validateExprNodeDesc(keyExprs, "Key")) {
return false;
}
List<ExprNodeDesc> valueExprs = desc.getExprs().get(posBigTable);
if (!validateExprNodeDesc(valueExprs, "Value")) {
return false;
}
Byte[] order = desc.getTagOrder();
Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]);
List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable);
if (!validateExprNodeDesc(smallTableExprs, "Small Table")) {
return false;
}
if (desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
LOG.info("Cannot vectorize outer join with complex ON clause");
return false;
}
return true;
}
private boolean validateSparkHashTableSinkOperator(SparkHashTableSinkOperator op) {
SparkHashTableSinkDesc desc = op.getConf();
byte tag = desc.getTag();
// it's essentially a MapJoinDesc
List<ExprNodeDesc> filterExprs = desc.getFilters().get(tag);
List<ExprNodeDesc> keyExprs = desc.getKeys().get(tag);
List<ExprNodeDesc> valueExprs = desc.getExprs().get(tag);
return validateExprNodeDesc(filterExprs, "Filter", VectorExpressionDescriptor.Mode.FILTER) &&
validateExprNodeDesc(keyExprs, "Key") && validateExprNodeDesc(valueExprs, "Value");
}
private boolean validateReduceSinkOperator(ReduceSinkOperator op) {
List<ExprNodeDesc> keyDescs = op.getConf().getKeyCols();
List<ExprNodeDesc> partitionDescs = op.getConf().getPartitionCols();
List<ExprNodeDesc> valueDesc = op.getConf().getValueCols();
return validateExprNodeDesc(keyDescs, "Key") && validateExprNodeDesc(partitionDescs, "Partition") &&
validateExprNodeDesc(valueDesc, "Value");
}
private boolean validateSelectOperator(SelectOperator op) {
List<ExprNodeDesc> descList = op.getConf().getColList();
for (ExprNodeDesc desc : descList) {
boolean ret = validateExprNodeDesc(desc, "Select");
if (!ret) {
return false;
}
}
return true;
}
private boolean validateFilterOperator(FilterOperator op) {
ExprNodeDesc desc = op.getConf().getPredicate();
return validateExprNodeDesc(desc, "Predicate", VectorExpressionDescriptor.Mode.FILTER);
}
private boolean validateGroupByOperator(GroupByOperator op, boolean isReduce, boolean isTezOrSpark) {
GroupByDesc desc = op.getConf();
if (desc.getMode() != GroupByDesc.Mode.HASH && desc.isDistinct()) {
setOperatorIssue("DISTINCT not supported");
return false;
}
boolean ret = validateExprNodeDesc(desc.getKeys(), "Key");
if (!ret) {
return false;
}
/**
*
* GROUP BY DEFINITIONS:
*
* GroupByDesc.Mode enumeration:
*
* The different modes of a GROUP BY operator.
*
* These descriptions are hopefully less cryptic than the comments for GroupByDesc.Mode.
*
* COMPLETE Aggregates original rows into full aggregation row(s).
*
* If the key length is 0, this is also called Global aggregation and
* 1 output row is produced.
*
* When the key length is > 0, the original rows come in ALREADY GROUPED.
*
* An example for key length > 0 is a GROUP BY being applied to the
* ALREADY GROUPED rows coming from an upstream JOIN operator. Or,
* ALREADY GROUPED rows coming from upstream MERGEPARTIAL GROUP BY
* operator.
*
* PARTIAL1 The first of 2 (or more) phases that aggregates ALREADY GROUPED
* original rows into partial aggregations.
*
* Subsequent phases PARTIAL2 (optional) and MERGEPARTIAL will merge
* the partial aggregations and output full aggregations.
*
* PARTIAL2 Accept ALREADY GROUPED partial aggregations and merge them into another
* partial aggregation. Output the merged partial aggregations.
*
* (Haven't seen this one used)
*
* PARTIALS (Behaves for non-distinct the same as PARTIAL2; and behaves for
* distinct the same as PARTIAL1.)
*
* FINAL Accept ALREADY GROUPED original rows and aggregate them into
* full aggregations.
*
* Example is a GROUP BY being applied to rows from a sorted table, where
* the group key is the table sort key (or a prefix).
*
* HASH Accept UNORDERED original rows and aggregate them into a memory table.
* Output the partial aggregations on closeOp (or low memory).
*
* Similar to PARTIAL1 except original rows are UNORDERED.
*
* Commonly used in both Mapper and Reducer nodes. Always followed by
* a Reducer with MERGEPARTIAL GROUP BY.
*
* MERGEPARTIAL Always first operator of a Reducer. Data is grouped by reduce-shuffle.
*
* (Behaves for non-distinct aggregations the same as FINAL; and behaves
* for distinct aggregations the same as COMPLETE.)
*
* The output is full aggregation(s).
*
* Used in Reducers after a stage with a HASH GROUP BY operator.
*
*
* VectorGroupByDesc.ProcessingMode for VectorGroupByOperator:
*
* GLOBAL No key. All rows --> 1 full aggregation on end of input
*
* HASH Rows aggregated in to hash table on group key -->
* 1 partial aggregation per key (normally, unless there is spilling)
*
* MERGE_PARTIAL As first operator in a REDUCER, partial aggregations come grouped from
* reduce-shuffle -->
* aggregate the partial aggregations and emit full aggregation on
* endGroup / closeOp
*
* STREAMING Rows come from PARENT operator ALREADY GROUPED -->
* aggregate the rows and emit full aggregation on key change / closeOp
*
* NOTE: Hash can spill partial result rows prematurely if it runs low on memory.
* NOTE: Streaming has to compare keys where MergePartial gets an endGroup call.
*
*
* DECIDER: Which VectorGroupByDesc.ProcessingMode for VectorGroupByOperator?
*
* Decides using GroupByDesc.Mode and whether there are keys with the
* VectorGroupByDesc.groupByDescModeToVectorProcessingMode method.
*
* Mode.COMPLETE --> (numKeys == 0 ? ProcessingMode.GLOBAL : ProcessingMode.STREAMING)
*
* Mode.HASH --> ProcessingMode.HASH
*
* Mode.MERGEPARTIAL --> (numKeys == 0 ? ProcessingMode.GLOBAL : ProcessingMode.MERGE_PARTIAL)
*
* Mode.PARTIAL1,
* Mode.PARTIAL2,
* Mode.PARTIALS,
* Mode.FINAL --> ProcessingMode.STREAMING
*
*/
boolean hasKeys = (desc.getKeys().size() > 0);
ProcessingMode processingMode =
VectorGroupByDesc.groupByDescModeToVectorProcessingMode(desc.getMode(), hasKeys);
if (desc.isGroupingSetsPresent() &&
(processingMode != ProcessingMode.HASH && processingMode != ProcessingMode.STREAMING)) {
LOG.info("Vectorized GROUPING SETS only expected for HASH and STREAMING processing modes");
return false;
}
Pair<Boolean,Boolean> retPair =
validateAggregationDescs(desc.getAggregators(), processingMode, hasKeys);
if (!retPair.left) {
return false;
}
// If all the aggregation outputs are primitive, we can output VectorizedRowBatch.
// Otherwise, the rest of the operator tree will run in row mode.
VectorGroupByDesc vectorDesc = new VectorGroupByDesc();
desc.setVectorDesc(vectorDesc);
vectorDesc.setVectorOutput(retPair.right);
vectorDesc.setProcessingMode(processingMode);
LOG.info("Vector GROUP BY operator will use processing mode " + processingMode.name() +
", isVectorOutput " + vectorDesc.isVectorOutput());
return true;
}
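/*
 * Worked example (illustrative, following the DECIDER comment above): for a hypothetical query
 *
 *   SELECT key, COUNT(*) FROM t GROUP BY key
 *
 * the map-side GROUP BY is typically Mode.HASH with keys, so groupByDescModeToVectorProcessingMode
 * picks ProcessingMode.HASH; the reduce-side GROUP BY is Mode.MERGEPARTIAL with keys, so it picks
 * ProcessingMode.MERGE_PARTIAL. A global aggregation (no GROUP BY key) on the reduce side would
 * instead map Mode.MERGEPARTIAL to ProcessingMode.GLOBAL.
 */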
private boolean validateFileSinkOperator(FileSinkOperator op) {
return true;
}
private boolean validateExprNodeDesc(List<ExprNodeDesc> descs, String expressionTitle) {
return validateExprNodeDesc(descs, expressionTitle, VectorExpressionDescriptor.Mode.PROJECTION);
}
private boolean validateExprNodeDesc(List<ExprNodeDesc> descs,
String expressionTitle,
VectorExpressionDescriptor.Mode mode) {
for (ExprNodeDesc d : descs) {
boolean ret = validateExprNodeDesc(d, expressionTitle, mode);
if (!ret) {
return false;
}
}
return true;
}
private Pair<Boolean,Boolean> validateAggregationDescs(List<AggregationDesc> descs,
ProcessingMode processingMode, boolean hasKeys) {
boolean outputIsPrimitive = true;
for (AggregationDesc d : descs) {
Pair<Boolean,Boolean> retPair = validateAggregationDesc(d, processingMode, hasKeys);
if (!retPair.left) {
return retPair;
}
if (!retPair.right) {
outputIsPrimitive = false;
}
}
return new Pair<Boolean, Boolean>(true, outputIsPrimitive);
}
private boolean validateExprNodeDescRecursive(ExprNodeDesc desc, String expressionTitle,
VectorExpressionDescriptor.Mode mode) {
if (desc instanceof ExprNodeColumnDesc) {
ExprNodeColumnDesc c = (ExprNodeColumnDesc) desc;
// Currently, we do not support vectorized virtual columns (see HIVE-5570).
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(c.getColumn())) {
setExpressionIssue(expressionTitle, "Virtual columns not supported (" + c.getColumn() + ")");
return false;
}
}
String typeName = desc.getTypeInfo().getTypeName();
boolean ret = validateDataType(typeName, mode);
if (!ret) {
setExpressionIssue(expressionTitle, "Data type " + typeName + " of " + desc.toString() + " not supported");
return false;
}
boolean isInExpression = false;
if (desc instanceof ExprNodeGenericFuncDesc) {
ExprNodeGenericFuncDesc d = (ExprNodeGenericFuncDesc) desc;
boolean r = validateGenericUdf(d);
if (!r) {
setExpressionIssue(expressionTitle, "UDF " + d + " not supported");
return false;
}
GenericUDF genericUDF = d.getGenericUDF();
isInExpression = (genericUDF instanceof GenericUDFIn);
}
if (desc.getChildren() != null) {
if (isInExpression
&& desc.getChildren().get(0).getTypeInfo().getCategory() == Category.STRUCT) {
// Don't restrict child expressions for projection.
// Always use loose FILTER mode.
if (!validateStructInExpression(desc, expressionTitle, VectorExpressionDescriptor.Mode.FILTER)) {
return false;
}
} else {
for (ExprNodeDesc d : desc.getChildren()) {
// Don't restrict child expressions for projection.
// Always use loose FILTER mode.
if (!validateExprNodeDescRecursive(d, expressionTitle, VectorExpressionDescriptor.Mode.FILTER)) {
return false;
}
}
}
}
return true;
}
private boolean validateStructInExpression(ExprNodeDesc desc,
String expressionTitle, VectorExpressionDescriptor.Mode mode) {
for (ExprNodeDesc d : desc.getChildren()) {
TypeInfo typeInfo = d.getTypeInfo();
if (typeInfo.getCategory() != Category.STRUCT) {
return false;
}
StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
ArrayList<TypeInfo> fieldTypeInfos = structTypeInfo
.getAllStructFieldTypeInfos();
ArrayList<String> fieldNames = structTypeInfo.getAllStructFieldNames();
final int fieldCount = fieldTypeInfos.size();
for (int f = 0; f < fieldCount; f++) {
TypeInfo fieldTypeInfo = fieldTypeInfos.get(f);
Category category = fieldTypeInfo.getCategory();
if (category != Category.PRIMITIVE) {
setExpressionIssue(expressionTitle,
"Cannot vectorize struct field " + fieldNames.get(f)
+ " of type " + fieldTypeInfo.getTypeName());
return false;
}
PrimitiveTypeInfo fieldPrimitiveTypeInfo = (PrimitiveTypeInfo) fieldTypeInfo;
InConstantType inConstantType = VectorizationContext
.getInConstantTypeFromPrimitiveCategory(fieldPrimitiveTypeInfo
.getPrimitiveCategory());
// For now, limit the data types we support for Vectorized Struct IN().
if (inConstantType != InConstantType.INT_FAMILY
&& inConstantType != InConstantType.FLOAT_FAMILY
&& inConstantType != InConstantType.STRING_FAMILY) {
setExpressionIssue(expressionTitle,
"Cannot vectorize struct field " + fieldNames.get(f)
+ " of type " + fieldTypeInfo.getTypeName());
return false;
}
}
}
return true;
}
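/*
 * Example (hedged): a filter like (col1, col2) IN (STRUCT(1, 'a'), STRUCT(2, 'b')) passes this
 * check when every struct field falls in the INT, FLOAT or STRING constant families; a struct
 * containing, say, a DECIMAL or TIMESTAMP field is rejected here and the expression is left
 * unvectorized.
 */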
private boolean validateExprNodeDesc(ExprNodeDesc desc, String expressionTitle) {
return validateExprNodeDesc(desc, expressionTitle, VectorExpressionDescriptor.Mode.PROJECTION);
}
boolean validateExprNodeDesc(ExprNodeDesc desc, String expressionTitle,
VectorExpressionDescriptor.Mode mode) {
if (!validateExprNodeDescRecursive(desc, expressionTitle, mode)) {
return false;
}
try {
VectorizationContext vc = new ValidatorVectorizationContext(hiveConf);
if (vc.getVectorExpression(desc, mode) == null) {
// TODO: this cannot happen - VectorizationContext throws in such cases.
setExpressionIssue(expressionTitle, "getVectorExpression returned null");
return false;
}
} catch (Exception e) {
if (e instanceof HiveException) {
setExpressionIssue(expressionTitle, e.getMessage());
} else {
String issue = "exception: " + VectorizationContext.getStackTraceAsSingleLine(e);
setExpressionIssue(expressionTitle, issue);
}
return false;
}
return true;
}
private boolean validateGenericUdf(ExprNodeGenericFuncDesc genericUDFExpr) {
if (VectorizationContext.isCustomUDF(genericUDFExpr)) {
return true;
}
if (hiveVectorAdaptorUsageMode == HiveVectorAdaptorUsageMode.NONE ||
hiveVectorAdaptorUsageMode == HiveVectorAdaptorUsageMode.CHOSEN) {
GenericUDF genericUDF = genericUDFExpr.getGenericUDF();
if (genericUDF instanceof GenericUDFBridge) {
Class<? extends UDF> udf = ((GenericUDFBridge) genericUDF).getUdfClass();
return supportedGenericUDFs.contains(udf);
} else {
return supportedGenericUDFs.contains(genericUDF.getClass());
}
}
return true;
}
public static ObjectInspector.Category aggregationOutputCategory(VectorAggregateExpression vectorAggrExpr) {
ObjectInspector outputObjInspector = vectorAggrExpr.getOutputObjectInspector();
return outputObjInspector.getCategory();
}
private Pair<Boolean,Boolean> validateAggregationDesc(AggregationDesc aggDesc, ProcessingMode processingMode,
boolean hasKeys) {
String udfName = aggDesc.getGenericUDAFName().toLowerCase();
if (!supportedAggregationUdfs.contains(udfName)) {
setExpressionIssue("Aggregation Function", "UDF " + udfName + " not supported");
return new Pair<Boolean,Boolean>(false, false);
}
/*
if (aggDesc.getDistinct()) {
setExpressionIssue("Aggregation Function", "DISTINCT not supported");
return new Pair<Boolean,Boolean>(false, false);
}
*/
if (aggDesc.getParameters() != null && !validateExprNodeDesc(aggDesc.getParameters(), "Aggregation Function UDF " + udfName + " parameter")) {
return new Pair<Boolean,Boolean>(false, false);
}
// See if we can vectorize the aggregation.
VectorizationContext vc = new ValidatorVectorizationContext(hiveConf);
VectorAggregateExpression vectorAggrExpr;
try {
vectorAggrExpr = vc.getAggregatorExpression(aggDesc);
} catch (Exception e) {
// Building the aggregator expression failed even though the parameters validated;
// record the failure as an issue.
if (LOG.isDebugEnabled()) {
LOG.debug("Vectorization of aggregation should have succeeded ", e);
}
setExpressionIssue("Aggregation Function", "Vectorization of aggregation should have succeeded " + e);
return new Pair<Boolean,Boolean>(false, false);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Aggregation " + aggDesc.getExprString() + " --> " +
" vector expression " + vectorAggrExpr.toString());
}
ObjectInspector.Category outputCategory = aggregationOutputCategory(vectorAggrExpr);
boolean outputIsPrimitive = (outputCategory == ObjectInspector.Category.PRIMITIVE);
if (processingMode == ProcessingMode.MERGE_PARTIAL &&
hasKeys &&
!outputIsPrimitive) {
setOperatorIssue("Vectorized Reduce MergePartial GROUP BY keys can only handle aggregate outputs that are primitive types");
return new Pair<Boolean,Boolean>(false, false);
}
return new Pair<Boolean,Boolean>(true, outputIsPrimitive);
}
public static boolean validateDataType(String type, VectorExpressionDescriptor.Mode mode) {
type = type.toLowerCase();
boolean result = supportedDataTypesPattern.matcher(type).matches();
if (result && mode == VectorExpressionDescriptor.Mode.PROJECTION && type.equals("void")) {
return false;
}
return result;
}
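/*
 * Usage sketch (illustrative): validateDataType("int", Mode.PROJECTION) returns true when "int"
 * matches supportedDataTypesPattern, while validateDataType("void", Mode.PROJECTION) is explicitly
 * rejected by the extra check above; in FILTER mode "void" is only subject to the pattern match.
 * Type names are lower-cased first, so "INT" and "int" behave identically.
 */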
private VectorizationContext getVectorizationContext(String contextName,
VectorTaskColumnInfo vectorTaskColumnInfo) {
VectorizationContext vContext =
new VectorizationContext(contextName, vectorTaskColumnInfo.allColumnNames, hiveConf);
return vContext;
}
private void fixupParentChildOperators(Operator<? extends OperatorDesc> op,
Operator<? extends OperatorDesc> vectorOp) {
if (op.getParentOperators() != null) {
vectorOp.setParentOperators(op.getParentOperators());
for (Operator<? extends OperatorDesc> p : op.getParentOperators()) {
p.replaceChild(op, vectorOp);
}
}
if (op.getChildOperators() != null) {
vectorOp.setChildOperators(op.getChildOperators());
for (Operator<? extends OperatorDesc> c : op.getChildOperators()) {
c.replaceParent(op, vectorOp);
}
}
}
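/*
 * Rewiring sketch (illustrative): given a plan fragment P -> op -> C, fixupParentChildOperators
 * splices the vectorized replacement in place of op, producing P -> vectorOp -> C:
 *
 *   p.replaceChild(op, vectorOp);    // parent now points at vectorOp
 *   c.replaceParent(op, vectorOp);   // child now points back at vectorOp
 *
 * op itself still references its old parents and children; it is simply no longer reachable
 * from the plan graph.
 */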
private boolean isBigTableOnlyResults(MapJoinDesc desc) {
Byte[] order = desc.getTagOrder();
byte posBigTable = (byte) desc.getPosBigTable();
Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]);
int[] smallTableIndices;
int smallTableIndicesSize;
if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) {
smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable);
LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices " + Arrays.toString(smallTableIndices));
smallTableIndicesSize = smallTableIndices.length;
} else {
smallTableIndices = null;
LOG.info("Vectorizer isBigTableOnlyResults smallTableIndices EMPTY");
smallTableIndicesSize = 0;
}
List<Integer> smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable);
LOG.info("Vectorizer isBigTableOnlyResults smallTableRetainList " + smallTableRetainList);
int smallTableRetainSize = smallTableRetainList.size();
if (smallTableIndicesSize > 0) {
// Small table indices have priority over retain.
for (int i = 0; i < smallTableIndicesSize; i++) {
if (smallTableIndices[i] < 0) {
// Negative numbers indicate a column to be deserialized (read) from the small table's
// LazyBinary value row.
setOperatorIssue("Vectorizer isBigTableOnlyResults smallTableIndices[i] < 0 returning false");
return false;
}
}
} else if (smallTableRetainSize > 0) {
setOperatorIssue("Vectorizer isBigTableOnlyResults smallTableRetainSize > 0 returning false");
return false;
}
LOG.info("Vectorizer isBigTableOnlyResults returning true");
return true;
}
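/*
 * Example (hedged): for an inner join whose output only references big-table columns (and, at
 * most, join keys), the small table's value indices are all non-negative key references, so this
 * returns true and the INNER_BIG_ONLY variation (hash multi-set) can be used. A negative index,
 * i.e. a value that must be deserialized from the small table's LazyBinary row, or a non-empty
 * retain list with no indices, forces the regular INNER variation instead.
 */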
Operator<? extends OperatorDesc> specializeMapJoinOperator(Operator<? extends OperatorDesc> op,
VectorizationContext vContext, MapJoinDesc desc, VectorMapJoinInfo vectorMapJoinInfo)
throws HiveException {
Operator<? extends OperatorDesc> vectorOp = null;
Class<? extends Operator<?>> opClass = null;
VectorMapJoinDesc vectorDesc = (VectorMapJoinDesc) desc.getVectorDesc();
HashTableImplementationType hashTableImplementationType = HashTableImplementationType.NONE;
HashTableKind hashTableKind = HashTableKind.NONE;
HashTableKeyType hashTableKeyType = HashTableKeyType.NONE;
OperatorVariation operatorVariation = OperatorVariation.NONE;
if (vectorDesc.getIsFastHashTableEnabled()) {
hashTableImplementationType = HashTableImplementationType.FAST;
} else {
hashTableImplementationType = HashTableImplementationType.OPTIMIZED;
}
int joinType = desc.getConds()[0].getType();
boolean isInnerBigOnly = false;
if (joinType == JoinDesc.INNER_JOIN && isBigTableOnlyResults(desc)) {
isInnerBigOnly = true;
}
// By default, we can always use the multi-key class.
hashTableKeyType = HashTableKeyType.MULTI_KEY;
if (!HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MULTIKEY_ONLY_ENABLED)) {
// Look for single column optimization.
byte posBigTable = (byte) desc.getPosBigTable();
Map<Byte, List<ExprNodeDesc>> keyExprs = desc.getKeys();
List<ExprNodeDesc> bigTableKeyExprs = keyExprs.get(posBigTable);
if (bigTableKeyExprs.size() == 1) {
TypeInfo typeInfo = bigTableKeyExprs.get(0).getTypeInfo();
LOG.info("Vectorizer vectorizeOperator map join typeName " + typeInfo.getTypeName());
switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) {
case BOOLEAN:
hashTableKeyType = HashTableKeyType.BOOLEAN;
break;
case BYTE:
hashTableKeyType = HashTableKeyType.BYTE;
break;
case SHORT:
hashTableKeyType = HashTableKeyType.SHORT;
break;
case INT:
hashTableKeyType = HashTableKeyType.INT;
break;
case LONG:
hashTableKeyType = HashTableKeyType.LONG;
break;
case STRING:
case CHAR:
case VARCHAR:
case BINARY:
hashTableKeyType = HashTableKeyType.STRING;
break;
default:
// Stay with multi-key.
}
}
}
switch (joinType) {
case JoinDesc.INNER_JOIN:
if (!isInnerBigOnly) {
operatorVariation = OperatorVariation.INNER;
hashTableKind = HashTableKind.HASH_MAP;
} else {
operatorVariation = OperatorVariation.INNER_BIG_ONLY;
hashTableKind = HashTableKind.HASH_MULTISET;
}
break;
case JoinDesc.LEFT_OUTER_JOIN:
case JoinDesc.RIGHT_OUTER_JOIN:
operatorVariation = OperatorVariation.OUTER;
hashTableKind = HashTableKind.HASH_MAP;
break;
case JoinDesc.LEFT_SEMI_JOIN:
operatorVariation = OperatorVariation.LEFT_SEMI;
hashTableKind = HashTableKind.HASH_SET;
break;
default:
throw new HiveException("Unknown join type " + joinType);
}
LOG.info("Vectorizer vectorizeOperator map join hashTableKind " + hashTableKind.name() + " hashTableKeyType " + hashTableKeyType.name());
switch (hashTableKeyType) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
switch (operatorVariation) {
case INNER:
opClass = VectorMapJoinInnerLongOperator.class;
break;
case INNER_BIG_ONLY:
opClass = VectorMapJoinInnerBigOnlyLongOperator.class;
break;
case LEFT_SEMI:
opClass = VectorMapJoinLeftSemiLongOperator.class;
break;
case OUTER:
opClass = VectorMapJoinOuterLongOperator.class;
break;
default:
throw new HiveException("Unknown operator variation " + operatorVariation);
}
break;
case STRING:
switch (operatorVariation) {
case INNER:
opClass = VectorMapJoinInnerStringOperator.class;
break;
case INNER_BIG_ONLY:
opClass = VectorMapJoinInnerBigOnlyStringOperator.class;
break;
case LEFT_SEMI:
opClass = VectorMapJoinLeftSemiStringOperator.class;
break;
case OUTER:
opClass = VectorMapJoinOuterStringOperator.class;
break;
default:
throw new HiveException("Unknown operator variation " + operatorVariation);
}
break;
case MULTI_KEY:
switch (operatorVariation) {
case INNER:
opClass = VectorMapJoinInnerMultiKeyOperator.class;
break;
case INNER_BIG_ONLY:
opClass = VectorMapJoinInnerBigOnlyMultiKeyOperator.class;
break;
case LEFT_SEMI:
opClass = VectorMapJoinLeftSemiMultiKeyOperator.class;
break;
case OUTER:
opClass = VectorMapJoinOuterMultiKeyOperator.class;
break;
default:
throw new HiveException("Unknown operator variation " + operatorVariation);
}
break;
default:
throw new RuntimeException("Unexpected hash table key type " + hashTableKeyType.name());
}
boolean minMaxEnabled = HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_MINMAX_ENABLED);
vectorDesc.setHashTableImplementationType(hashTableImplementationType);
vectorDesc.setHashTableKind(hashTableKind);
vectorDesc.setHashTableKeyType(hashTableKeyType);
vectorDesc.setOperatorVariation(operatorVariation);
vectorDesc.setMinMaxEnabled(minMaxEnabled);
vectorDesc.setVectorMapJoinInfo(vectorMapJoinInfo);
vectorOp = OperatorFactory.getVectorOperator(
opClass, op.getCompilationOpContext(), op.getConf(), vContext);
LOG.info("Vectorizer vectorizeOperator map join class " + vectorOp.getClass().getSimpleName());
return vectorOp;
}
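/*
 * Selection summary (illustrative, derived from the switches above): the specialized operator
 * class is chosen by (key type, variation):
 *
 *   LONG family  : INNER -> VectorMapJoinInnerLongOperator, INNER_BIG_ONLY -> VectorMapJoinInnerBigOnlyLongOperator,
 *                  LEFT_SEMI -> VectorMapJoinLeftSemiLongOperator, OUTER -> VectorMapJoinOuterLongOperator
 *   STRING       : the corresponding *String* operators
 *   MULTI_KEY    : the corresponding *MultiKey* operators
 *
 * Single-column BOOLEAN/BYTE/SHORT/INT/LONG keys collapse to the LONG family; STRING/CHAR/
 * VARCHAR/BINARY collapse to STRING; everything else stays MULTI_KEY.
 */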
public static boolean onExpressionHasNullSafes(MapJoinDesc desc) {
boolean[] nullSafes = desc.getNullSafes();
if (nullSafes == null) {
return false;
}
for (boolean nullSafe : nullSafes) {
if (nullSafe) {
return true;
}
}
return false;
}
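/*
 * Example (hedged): a join written with the null-safe operator, e.g. ON a.k <=> b.k, sets the
 * corresponding nullSafes flag in MapJoinDesc, so this returns true and canSpecializeMapJoin
 * below refuses to use a native vector map join for that query.
 */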
private boolean canSpecializeMapJoin(Operator<? extends OperatorDesc> op, MapJoinDesc desc,
boolean isTezOrSpark, VectorizationContext vContext, VectorMapJoinInfo vectorMapJoinInfo)
throws HiveException {
Preconditions.checkState(op instanceof MapJoinOperator);
// Allocate a VectorMapJoinDesc initially with implementation type NONE so EXPLAIN
// can report this operator was vectorized, but not native, along with the conditions checked.
VectorMapJoinDesc vectorDesc = new VectorMapJoinDesc();
desc.setVectorDesc(vectorDesc);
boolean isVectorizationMapJoinNativeEnabled = HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_ENABLED);
String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
boolean oneMapJoinCondition = (desc.getConds().length == 1);
boolean hasNullSafes = onExpressionHasNullSafes(desc);
byte posBigTable = (byte) desc.getPosBigTable();
// Since we want to display all the met and not-met conditions in EXPLAIN, we determine all
// the information first.
List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
VectorExpression[] allBigTableKeyExpressions = vContext.getVectorExpressions(keyDesc);
final int allBigTableKeyExpressionsLength = allBigTableKeyExpressions.length;
boolean supportsKeyTypes = true; // Assume.
HashSet<String> notSupportedKeyTypes = new HashSet<String>();
// Since a key expression can be a calculation and the key will go into a scratch column,
// we need the mapping and type information.
int[] bigTableKeyColumnMap = new int[allBigTableKeyExpressionsLength];
String[] bigTableKeyColumnNames = new String[allBigTableKeyExpressionsLength];
TypeInfo[] bigTableKeyTypeInfos = new TypeInfo[allBigTableKeyExpressionsLength];
ArrayList<VectorExpression> bigTableKeyExpressionsList = new ArrayList<VectorExpression>();
VectorExpression[] bigTableKeyExpressions;
for (int i = 0; i < allBigTableKeyExpressionsLength; i++) {
VectorExpression ve = allBigTableKeyExpressions[i];
if (!IdentityExpression.isColumnOnly(ve)) {
bigTableKeyExpressionsList.add(ve);
}
bigTableKeyColumnMap[i] = ve.getOutputColumn();
ExprNodeDesc exprNode = keyDesc.get(i);
bigTableKeyColumnNames[i] = exprNode.toString();
TypeInfo typeInfo = exprNode.getTypeInfo();
// Verify we handle the key column types for an optimized table. This is effectively the
// same check used in HashTableLoader.
if (!MapJoinKey.isSupportedField(typeInfo)) {
supportsKeyTypes = false;
Category category = typeInfo.getCategory();
notSupportedKeyTypes.add(
(category != Category.PRIMITIVE ? category.toString() :
((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory().toString()));
}
bigTableKeyTypeInfos[i] = typeInfo;
}
if (bigTableKeyExpressionsList.size() == 0) {
bigTableKeyExpressions = null;
} else {
bigTableKeyExpressions = bigTableKeyExpressionsList.toArray(new VectorExpression[0]);
}
List<ExprNodeDesc> bigTableExprs = desc.getExprs().get(posBigTable);
VectorExpression[] allBigTableValueExpressions = vContext.getVectorExpressions(bigTableExprs);
boolean isFastHashTableEnabled =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED);
// Capture Hybrid Grace Hash Join here, especially since LLAP is prone to turn it off in the
// MapJoinDesc in later physical optimizer stages...
boolean isHybridHashJoin = desc.isHybridHashJoin();
/*
* Populate vectorMapJoinInfo.
*/
/*
* Similarly, we need a mapping since a value expression can be a calculation and the value
* will go into a scratch column.
*/
int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length];
String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length];
TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length];
ArrayList<VectorExpression> bigTableValueExpressionsList = new ArrayList<VectorExpression>();
VectorExpression[] bigTableValueExpressions;
for (int i = 0; i < bigTableValueColumnMap.length; i++) {
VectorExpression ve = allBigTableValueExpressions[i];
if (!IdentityExpression.isColumnOnly(ve)) {
bigTableValueExpressionsList.add(ve);
}
bigTableValueColumnMap[i] = ve.getOutputColumn();
ExprNodeDesc exprNode = bigTableExprs.get(i);
bigTableValueColumnNames[i] = exprNode.toString();
bigTableValueTypeInfos[i] = exprNode.getTypeInfo();
}
if (bigTableValueExpressionsList.size() == 0) {
bigTableValueExpressions = null;
} else {
bigTableValueExpressions = bigTableValueExpressionsList.toArray(new VectorExpression[0]);
}
vectorMapJoinInfo.setBigTableKeyColumnMap(bigTableKeyColumnMap);
vectorMapJoinInfo.setBigTableKeyColumnNames(bigTableKeyColumnNames);
vectorMapJoinInfo.setBigTableKeyTypeInfos(bigTableKeyTypeInfos);
vectorMapJoinInfo.setBigTableKeyExpressions(bigTableKeyExpressions);
vectorMapJoinInfo.setBigTableValueColumnMap(bigTableValueColumnMap);
vectorMapJoinInfo.setBigTableValueColumnNames(bigTableValueColumnNames);
vectorMapJoinInfo.setBigTableValueTypeInfos(bigTableValueTypeInfos);
vectorMapJoinInfo.setBigTableValueExpressions(bigTableValueExpressions);
/*
* Small table information.
*/
VectorColumnOutputMapping bigTableRetainedMapping =
new VectorColumnOutputMapping("Big Table Retained Mapping");
VectorColumnOutputMapping bigTableOuterKeyMapping =
new VectorColumnOutputMapping("Big Table Outer Key Mapping");
// The order of the fields in the LazyBinary small table value must be used, so
// we use the source ordering flavor for the mapping.
VectorColumnSourceMapping smallTableMapping =
new VectorColumnSourceMapping("Small Table Mapping");
Byte[] order = desc.getTagOrder();
Byte posSingleVectorMapJoinSmallTable = (order[0] == posBigTable ? order[1] : order[0]);
boolean isOuterJoin = !desc.getNoOuterJoin();
/*
* Gather up big and small table output result information from the MapJoinDesc.
*/
List<Integer> bigTableRetainList = desc.getRetainList().get(posBigTable);
int bigTableRetainSize = bigTableRetainList.size();
int[] smallTableIndices;
int smallTableIndicesSize;
List<ExprNodeDesc> smallTableExprs = desc.getExprs().get(posSingleVectorMapJoinSmallTable);
if (desc.getValueIndices() != null && desc.getValueIndices().get(posSingleVectorMapJoinSmallTable) != null) {
smallTableIndices = desc.getValueIndices().get(posSingleVectorMapJoinSmallTable);
smallTableIndicesSize = smallTableIndices.length;
} else {
smallTableIndices = null;
smallTableIndicesSize = 0;
}
List<Integer> smallTableRetainList = desc.getRetainList().get(posSingleVectorMapJoinSmallTable);
int smallTableRetainSize = smallTableRetainList.size();
int smallTableResultSize = 0;
if (smallTableIndicesSize > 0) {
smallTableResultSize = smallTableIndicesSize;
} else if (smallTableRetainSize > 0) {
smallTableResultSize = smallTableRetainSize;
}
/*
* Determine the big table retained mapping first so we can optimize out (with
* projection) copying inner join big table keys in the subsequent small table results section.
*/
// We use a mapping object here so we can build the projection in any order and
// get the ordered by 0 to n-1 output columns at the end.
//
// Also, to avoid copying a big table key into the small table result area for inner joins,
// we reference it with the projection so there can be duplicate output columns
// in the projection.
VectorColumnSourceMapping projectionMapping = new VectorColumnSourceMapping("Projection Mapping");
int nextOutputColumn = (order[0] == posBigTable ? 0 : smallTableResultSize);
for (int i = 0; i < bigTableRetainSize; i++) {
// Since bigTableValueExpressions may do a calculation and produce a scratch column, we
// need to map to the right batch column.
int retainColumn = bigTableRetainList.get(i);
int batchColumnIndex = bigTableValueColumnMap[retainColumn];
TypeInfo typeInfo = bigTableValueTypeInfos[i];
// With this map we project the big table batch to make it look like an output batch.
projectionMapping.add(nextOutputColumn, batchColumnIndex, typeInfo);
// Collect columns we copy from the big table batch to the overflow batch.
if (!bigTableRetainedMapping.containsOutputColumn(batchColumnIndex)) {
// Tolerate repeated use of a big table column.
bigTableRetainedMapping.add(batchColumnIndex, batchColumnIndex, typeInfo);
}
nextOutputColumn++;
}
/*
* Now determine the small table results.
*/
boolean smallTableExprVectorizes = true;
int firstSmallTableOutputColumn;
firstSmallTableOutputColumn = (order[0] == posBigTable ? bigTableRetainSize : 0);
int smallTableOutputCount = 0;
nextOutputColumn = firstSmallTableOutputColumn;
// Small table indices have more information (i.e. keys) than retain, so use them if they exist...
String[] bigTableRetainedNames;
if (smallTableIndicesSize > 0) {
smallTableOutputCount = smallTableIndicesSize;
bigTableRetainedNames = new String[smallTableOutputCount];
for (int i = 0; i < smallTableIndicesSize; i++) {
if (smallTableIndices[i] >= 0) {
// Non-negative numbers indicate a big table key is needed for the
// small table result "area".
int keyIndex = smallTableIndices[i];
// Since bigTableKeyExpressions may do a calculation and produce a scratch column, we
// need to map the right column.
int batchKeyColumn = bigTableKeyColumnMap[keyIndex];
bigTableRetainedNames[i] = bigTableKeyColumnNames[keyIndex];
TypeInfo typeInfo = bigTableKeyTypeInfos[keyIndex];
if (!isOuterJoin) {
// Optimize inner join keys of small table results.
// Project the big table key into the small table result "area".
projectionMapping.add(nextOutputColumn, batchKeyColumn, typeInfo);
if (!bigTableRetainedMapping.containsOutputColumn(batchKeyColumn)) {
// If necessary, copy the big table key into the overflow batch's small table
// result "area".
bigTableRetainedMapping.add(batchKeyColumn, batchKeyColumn, typeInfo);
}
} else {
// For outer joins, since the small table key can be null when there is no match,
// we must have a physical (scratch) column for those keys. We cannot use the
// projection optimization used by inner joins above.
int scratchColumn = vContext.allocateScratchColumn(typeInfo);
projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo);
bigTableRetainedMapping.add(batchKeyColumn, scratchColumn, typeInfo);
bigTableOuterKeyMapping.add(batchKeyColumn, scratchColumn, typeInfo);
}
} else {
// Negative numbers indicate a column to be deserialized (read) from the small table's
// LazyBinary value row.
int smallTableValueIndex = -smallTableIndices[i] - 1;
ExprNodeDesc smallTableExprNode = smallTableExprs.get(i);
if (!validateExprNodeDesc(smallTableExprNode, "Small Table")) {
clearNotVectorizedReason();
smallTableExprVectorizes = false;
}
bigTableRetainedNames[i] = smallTableExprNode.toString();
TypeInfo typeInfo = smallTableExprNode.getTypeInfo();
// Make a new big table scratch column for the small table value.
int scratchColumn = vContext.allocateScratchColumn(typeInfo);
projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo);
smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo);
}
nextOutputColumn++;
}
} else if (smallTableRetainSize > 0) {
smallTableOutputCount = smallTableRetainSize;
bigTableRetainedNames = new String[smallTableOutputCount];
// Only small table values appear in join output result.
for (int i = 0; i < smallTableRetainSize; i++) {
int smallTableValueIndex = smallTableRetainList.get(i);
ExprNodeDesc smallTableExprNode = smallTableExprs.get(i);
if (!validateExprNodeDesc(smallTableExprNode, "Small Table")) {
clearNotVectorizedReason();
smallTableExprVectorizes = false;
}
bigTableRetainedNames[i] = smallTableExprNode.toString();
// Make a new big table scratch column for the small table value.
TypeInfo typeInfo = smallTableExprNode.getTypeInfo();
int scratchColumn = vContext.allocateScratchColumn(typeInfo);
projectionMapping.add(nextOutputColumn, scratchColumn, typeInfo);
smallTableMapping.add(smallTableValueIndex, scratchColumn, typeInfo);
nextOutputColumn++;
}
} else {
bigTableRetainedNames = new String[0];
}
boolean useOptimizedTable =
HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
// Remember the condition variables for EXPLAIN regardless of whether we specialize or not.
vectorDesc.setUseOptimizedTable(useOptimizedTable);
vectorDesc.setIsVectorizationMapJoinNativeEnabled(isVectorizationMapJoinNativeEnabled);
vectorDesc.setEngine(engine);
vectorDesc.setOneMapJoinCondition(oneMapJoinCondition);
vectorDesc.setHasNullSafes(hasNullSafes);
vectorDesc.setSmallTableExprVectorizes(smallTableExprVectorizes);
vectorDesc.setIsFastHashTableEnabled(isFastHashTableEnabled);
vectorDesc.setIsHybridHashJoin(isHybridHashJoin);
vectorDesc.setSupportsKeyTypes(supportsKeyTypes);
if (!supportsKeyTypes) {
vectorDesc.setNotSupportedKeyTypes(new ArrayList<String>(notSupportedKeyTypes));
}
// Check common conditions for both Optimized and Fast Hash Tables.
boolean result = true; // Assume.
if (!useOptimizedTable ||
!isVectorizationMapJoinNativeEnabled ||
!isTezOrSpark ||
!oneMapJoinCondition ||
hasNullSafes ||
!smallTableExprVectorizes) {
result = false;
}
// supportsKeyTypes
if (!isFastHashTableEnabled) {
// Check optimized-only hash table restrictions.
if (!supportsKeyTypes) {
result = false;
}
} else {
// With the fast hash table implementation, we currently do not support
// Hybrid Grace Hash Join.
if (isHybridHashJoin) {
result = false;
}
}
// Convert dynamic arrays and maps to simple arrays.
bigTableRetainedMapping.finalize();
bigTableOuterKeyMapping.finalize();
smallTableMapping.finalize();
vectorMapJoinInfo.setBigTableRetainedMapping(bigTableRetainedMapping);
vectorMapJoinInfo.setBigTableOuterKeyMapping(bigTableOuterKeyMapping);
vectorMapJoinInfo.setSmallTableMapping(smallTableMapping);
projectionMapping.finalize();
// Verify we added an entry for each output.
assert projectionMapping.isSourceSequenceGood();
vectorMapJoinInfo.setProjectionMapping(projectionMapping);
return result;
}
private Operator<? extends OperatorDesc> specializeReduceSinkOperator(
Operator<? extends OperatorDesc> op, VectorizationContext vContext, ReduceSinkDesc desc,
VectorReduceSinkInfo vectorReduceSinkInfo) throws HiveException {
VectorReduceSinkDesc vectorDesc = (VectorReduceSinkDesc) desc.getVectorDesc();
Type[] reduceSinkKeyColumnVectorTypes = vectorReduceSinkInfo.getReduceSinkKeyColumnVectorTypes();
// By default, we can always use the multi-key class.
VectorReduceSinkDesc.ReduceSinkKeyType reduceSinkKeyType = VectorReduceSinkDesc.ReduceSinkKeyType.MULTI_KEY;
// Look for single column optimization.
if (reduceSinkKeyColumnVectorTypes != null && reduceSinkKeyColumnVectorTypes.length == 1) {
LOG.info("Vectorizer vectorizeOperator groupby typeName " + vectorReduceSinkInfo.getReduceSinkKeyTypeInfos()[0]);
Type columnVectorType = reduceSinkKeyColumnVectorTypes[0];
switch (columnVectorType) {
case LONG:
{
PrimitiveCategory primitiveCategory =
((PrimitiveTypeInfo) vectorReduceSinkInfo.getReduceSinkKeyTypeInfos()[0]).getPrimitiveCategory();
switch (primitiveCategory) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
reduceSinkKeyType = VectorReduceSinkDesc.ReduceSinkKeyType.LONG;
break;
default:
// Other integer types not supported yet.
break;
}
}
break;
case BYTES:
reduceSinkKeyType = VectorReduceSinkDesc.ReduceSinkKeyType.STRING;
break;
default:
// Stay with multi-key.
break;
}
}
Class<? extends Operator<?>> opClass = null;
if (vectorReduceSinkInfo.getUseUniformHash()) {
if (vectorDesc.getIsEmptyKey()) {
opClass = VectorReduceSinkEmptyKeyOperator.class;
} else {
switch (reduceSinkKeyType) {
case LONG:
opClass = VectorReduceSinkLongOperator.class;
break;
case STRING:
opClass = VectorReduceSinkStringOperator.class;
break;
case MULTI_KEY:
opClass = VectorReduceSinkMultiKeyOperator.class;
break;
default:
throw new HiveException("Unknown reduce sink key type " + reduceSinkKeyType);
}
}
} else {
if (vectorDesc.getIsEmptyKey() && vectorDesc.getIsEmptyBuckets() && vectorDesc.getIsEmptyPartitions()) {
opClass = VectorReduceSinkEmptyKeyOperator.class;
} else {
opClass = VectorReduceSinkObjectHashOperator.class;
}
}
vectorDesc.setReduceSinkKeyType(reduceSinkKeyType);
vectorDesc.setVectorReduceSinkInfo(vectorReduceSinkInfo);
LOG.info("Vectorizer vectorizeOperator reduce sink class " + opClass.getSimpleName());
Operator<? extends OperatorDesc> vectorOp = null;
try {
vectorOp = OperatorFactory.getVectorOperator(
opClass, op.getCompilationOpContext(), op.getConf(), vContext);
} catch (Exception e) {
LOG.info("Vectorizer vectorizeOperator reduce sink class exception " + opClass.getSimpleName() +
" exception " + e);
throw new HiveException(e);
}
return vectorOp;
}
private boolean canSpecializeReduceSink(ReduceSinkDesc desc,
boolean isTezOrSpark, VectorizationContext vContext,
VectorReduceSinkInfo vectorReduceSinkInfo) throws HiveException {
// Allocate a VectorReduceSinkDesc initially with key type NONE so EXPLAIN can report this
// operator was vectorized, but not native, along with the conditions checked.
VectorReduceSinkDesc vectorDesc = new VectorReduceSinkDesc();
desc.setVectorDesc(vectorDesc);
// Various restrictions.
// Set this if we encounter a condition we were not expecting.
boolean isUnexpectedCondition = false;
boolean isVectorizationReduceSinkNativeEnabled =
HiveConf.getBoolVar(hiveConf,
HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED);
String engine = HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
int limit = desc.getTopN();
float memUsage = desc.getTopNMemoryUsage();
boolean hasPTFTopN = (limit >= 0 && memUsage > 0 && desc.isPTFReduceSink());
boolean hasDistinctColumns = (desc.getDistinctColumnIndices().size() > 0);
TableDesc keyTableDesc = desc.getKeySerializeInfo();
Class<? extends Deserializer> keySerializerClass = keyTableDesc.getDeserializerClass();
boolean isKeyBinarySortable = (keySerializerClass == org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe.class);
TableDesc valueTableDesc = desc.getValueSerializeInfo();
Class<? extends Deserializer> valueDeserializerClass = valueTableDesc.getDeserializerClass();
boolean isValueLazyBinary = (valueDeserializerClass == org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class);
// We are doing work here we'd normally do in VectorReduceSinkCommonOperator's constructor.
// So if we later decide not to specialize, we'll just waste any scratch columns allocated...
List<ExprNodeDesc> keysDescs = desc.getKeyCols();
final boolean isEmptyKey = (keysDescs.size() == 0);
if (!isEmptyKey) {
VectorExpression[] allKeyExpressions = vContext.getVectorExpressions(keysDescs);
final int[] reduceSinkKeyColumnMap = new int[allKeyExpressions.length];
final TypeInfo[] reduceSinkKeyTypeInfos = new TypeInfo[allKeyExpressions.length];
final Type[] reduceSinkKeyColumnVectorTypes = new Type[allKeyExpressions.length];
final VectorExpression[] reduceSinkKeyExpressions;
// Since a key expression can be a calculation and the key will go into a scratch column,
// we need the mapping and type information.
ArrayList<VectorExpression> groupByKeyExpressionsList = new ArrayList<VectorExpression>();
for (int i = 0; i < reduceSinkKeyColumnMap.length; i++) {
VectorExpression ve = allKeyExpressions[i];
reduceSinkKeyColumnMap[i] = ve.getOutputColumn();
reduceSinkKeyTypeInfos[i] = keysDescs.get(i).getTypeInfo();
reduceSinkKeyColumnVectorTypes[i] =
VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkKeyTypeInfos[i]);
if (!IdentityExpression.isColumnOnly(ve)) {
groupByKeyExpressionsList.add(ve);
}
}
if (groupByKeyExpressionsList.size() == 0) {
reduceSinkKeyExpressions = null;
} else {
reduceSinkKeyExpressions = groupByKeyExpressionsList.toArray(new VectorExpression[0]);
}
vectorReduceSinkInfo.setReduceSinkKeyColumnMap(reduceSinkKeyColumnMap);
vectorReduceSinkInfo.setReduceSinkKeyTypeInfos(reduceSinkKeyTypeInfos);
vectorReduceSinkInfo.setReduceSinkKeyColumnVectorTypes(reduceSinkKeyColumnVectorTypes);
vectorReduceSinkInfo.setReduceSinkKeyExpressions(reduceSinkKeyExpressions);
}
ArrayList<ExprNodeDesc> valueDescs = desc.getValueCols();
final boolean isEmptyValue = (valueDescs.size() == 0);
if (!isEmptyValue) {
VectorExpression[] allValueExpressions = vContext.getVectorExpressions(valueDescs);
final int[] reduceSinkValueColumnMap = new int[allValueExpressions.length];
final TypeInfo[] reduceSinkValueTypeInfos = new TypeInfo[allValueExpressions.length];
final Type[] reduceSinkValueColumnVectorTypes = new Type[allValueExpressions.length];
VectorExpression[] reduceSinkValueExpressions;
ArrayList<VectorExpression> reduceSinkValueExpressionsList = new ArrayList<VectorExpression>();
for (int i = 0; i < valueDescs.size(); ++i) {
VectorExpression ve = allValueExpressions[i];
reduceSinkValueColumnMap[i] = ve.getOutputColumn();
reduceSinkValueTypeInfos[i] = valueDescs.get(i).getTypeInfo();
reduceSinkValueColumnVectorTypes[i] =
VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkValueTypeInfos[i]);
if (!IdentityExpression.isColumnOnly(ve)) {
reduceSinkValueExpressionsList.add(ve);
}
}
if (reduceSinkValueExpressionsList.size() == 0) {
reduceSinkValueExpressions = null;
} else {
reduceSinkValueExpressions = reduceSinkValueExpressionsList.toArray(new VectorExpression[0]);
}
vectorReduceSinkInfo.setReduceSinkValueColumnMap(reduceSinkValueColumnMap);
vectorReduceSinkInfo.setReduceSinkValueTypeInfos(reduceSinkValueTypeInfos);
vectorReduceSinkInfo.setReduceSinkValueColumnVectorTypes(reduceSinkValueColumnVectorTypes);
vectorReduceSinkInfo.setReduceSinkValueExpressions(reduceSinkValueExpressions);
}
boolean useUniformHash = desc.getReducerTraits().contains(UNIFORM);
vectorReduceSinkInfo.setUseUniformHash(useUniformHash);
List<ExprNodeDesc> bucketDescs = desc.getBucketCols();
final boolean isEmptyBuckets = (bucketDescs == null || bucketDescs.size() == 0);
List<ExprNodeDesc> partitionDescs = desc.getPartitionCols();
final boolean isEmptyPartitions = (partitionDescs == null || partitionDescs.size() == 0);
if (useUniformHash || (isEmptyKey && isEmptyBuckets && isEmptyPartitions)) {
      // Nothing more to collect: with the UNIFORM trait the key hash distributes the rows, and
      // when the key, bucket, and partition columns are all empty the
      // VectorReduceSinkEmptyKeyOperator is used instead.
} else {
// Collect bucket and/or partition information for object hashing.
int[] reduceSinkBucketColumnMap = null;
TypeInfo[] reduceSinkBucketTypeInfos = null;
Type[] reduceSinkBucketColumnVectorTypes = null;
VectorExpression[] reduceSinkBucketExpressions = null;
if (!isEmptyBuckets) {
VectorExpression[] allBucketExpressions = vContext.getVectorExpressions(bucketDescs);
reduceSinkBucketColumnMap = new int[bucketDescs.size()];
reduceSinkBucketTypeInfos = new TypeInfo[bucketDescs.size()];
reduceSinkBucketColumnVectorTypes = new Type[bucketDescs.size()];
ArrayList<VectorExpression> reduceSinkBucketExpressionsList = new ArrayList<VectorExpression>();
for (int i = 0; i < bucketDescs.size(); ++i) {
VectorExpression ve = allBucketExpressions[i];
reduceSinkBucketColumnMap[i] = ve.getOutputColumn();
reduceSinkBucketTypeInfos[i] = bucketDescs.get(i).getTypeInfo();
reduceSinkBucketColumnVectorTypes[i] =
VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkBucketTypeInfos[i]);
if (!IdentityExpression.isColumnOnly(ve)) {
reduceSinkBucketExpressionsList.add(ve);
}
}
if (reduceSinkBucketExpressionsList.size() == 0) {
reduceSinkBucketExpressions = null;
} else {
reduceSinkBucketExpressions = reduceSinkBucketExpressionsList.toArray(new VectorExpression[0]);
}
}
int[] reduceSinkPartitionColumnMap = null;
TypeInfo[] reduceSinkPartitionTypeInfos = null;
Type[] reduceSinkPartitionColumnVectorTypes = null;
VectorExpression[] reduceSinkPartitionExpressions = null;
if (!isEmptyPartitions) {
VectorExpression[] allPartitionExpressions = vContext.getVectorExpressions(partitionDescs);
reduceSinkPartitionColumnMap = new int[partitionDescs.size()];
reduceSinkPartitionTypeInfos = new TypeInfo[partitionDescs.size()];
reduceSinkPartitionColumnVectorTypes = new Type[partitionDescs.size()];
ArrayList<VectorExpression> reduceSinkPartitionExpressionsList = new ArrayList<VectorExpression>();
for (int i = 0; i < partitionDescs.size(); ++i) {
VectorExpression ve = allPartitionExpressions[i];
reduceSinkPartitionColumnMap[i] = ve.getOutputColumn();
reduceSinkPartitionTypeInfos[i] = partitionDescs.get(i).getTypeInfo();
reduceSinkPartitionColumnVectorTypes[i] =
VectorizationContext.getColumnVectorTypeFromTypeInfo(reduceSinkPartitionTypeInfos[i]);
if (!IdentityExpression.isColumnOnly(ve)) {
reduceSinkPartitionExpressionsList.add(ve);
}
}
if (reduceSinkPartitionExpressionsList.size() == 0) {
reduceSinkPartitionExpressions = null;
} else {
reduceSinkPartitionExpressions = reduceSinkPartitionExpressionsList.toArray(new VectorExpression[0]);
}
}
vectorReduceSinkInfo.setReduceSinkBucketColumnMap(reduceSinkBucketColumnMap);
vectorReduceSinkInfo.setReduceSinkBucketTypeInfos(reduceSinkBucketTypeInfos);
vectorReduceSinkInfo.setReduceSinkBucketColumnVectorTypes(reduceSinkBucketColumnVectorTypes);
vectorReduceSinkInfo.setReduceSinkBucketExpressions(reduceSinkBucketExpressions);
vectorReduceSinkInfo.setReduceSinkPartitionColumnMap(reduceSinkPartitionColumnMap);
vectorReduceSinkInfo.setReduceSinkPartitionTypeInfos(reduceSinkPartitionTypeInfos);
vectorReduceSinkInfo.setReduceSinkPartitionColumnVectorTypes(reduceSinkPartitionColumnVectorTypes);
vectorReduceSinkInfo.setReduceSinkPartitionExpressions(reduceSinkPartitionExpressions);
}
    // Record the condition variables for EXPLAIN output regardless of whether specialization succeeds.
vectorDesc.setIsVectorizationReduceSinkNativeEnabled(isVectorizationReduceSinkNativeEnabled);
vectorDesc.setEngine(engine);
vectorDesc.setIsEmptyKey(isEmptyKey);
vectorDesc.setIsEmptyValue(isEmptyValue);
vectorDesc.setIsEmptyBuckets(isEmptyBuckets);
vectorDesc.setIsEmptyPartitions(isEmptyPartitions);
vectorDesc.setHasPTFTopN(hasPTFTopN);
vectorDesc.setHasDistinctColumns(hasDistinctColumns);
vectorDesc.setIsKeyBinarySortable(isKeyBinarySortable);
vectorDesc.setIsValueLazyBinary(isValueLazyBinary);
    // Indicates that we logged an inconsistency (from our point of view) and therefore will not
    // make this operator native.
vectorDesc.setIsUnexpectedCondition(isUnexpectedCondition);
    // The ReduceSink can only be made native when none of the following restrictions apply.
if (!isVectorizationReduceSinkNativeEnabled ||
!isTezOrSpark ||
hasPTFTopN ||
hasDistinctColumns ||
!isKeyBinarySortable ||
!isValueLazyBinary ||
isUnexpectedCondition) {
return false;
}
return true;
}
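  /**
   * Returns true if the given vector expression, or any expression in its child subtree, is a
   * VectorUDFAdaptor (a row-mode UDF wrapped for use in vectorized execution).
   */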
private boolean usesVectorUDFAdaptor(VectorExpression vecExpr) {
if (vecExpr == null) {
return false;
}
if (vecExpr instanceof VectorUDFAdaptor) {
return true;
}
    return usesVectorUDFAdaptor(vecExpr.getChildExpressions());
}
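  /**
   * Array variant: returns true if any of the given vector expressions (or their children) uses
   * the VectorUDFAdaptor.
   */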
private boolean usesVectorUDFAdaptor(VectorExpression[] vecExprs) {
if (vecExprs == null) {
return false;
}
for (VectorExpression vecExpr : vecExprs) {
if (usesVectorUDFAdaptor(vecExpr)) {
return true;
}
}
return false;
}
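  /**
   * Marks a TableScanOperator as vectorized by attaching a VectorTableScanDesc that records the
   * projected output columns; the original operator instance is reused rather than replaced.
   */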
public static Operator<? extends OperatorDesc> vectorizeTableScanOperator(
Operator<? extends OperatorDesc> tableScanOp, VectorizationContext vContext)
throws HiveException {
TableScanDesc tableScanDesc = (TableScanDesc) tableScanOp.getConf();
VectorTableScanDesc vectorTableScanDesc = new VectorTableScanDesc();
tableScanDesc.setVectorDesc(vectorTableScanDesc);
vectorTableScanDesc.setProjectedOutputColumns(
ArrayUtils.toPrimitive(vContext.getProjectedColumns().toArray(new Integer[0])));
return tableScanOp;
}
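  /**
   * Vectorizes a FilterOperator: the filter predicate is compiled into a VectorExpression in
   * FILTER mode and a vector operator is created from the same FilterDesc.
   */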
public static Operator<? extends OperatorDesc> vectorizeFilterOperator(
Operator<? extends OperatorDesc> filterOp, VectorizationContext vContext)
throws HiveException {
FilterDesc filterDesc = (FilterDesc) filterOp.getConf();
VectorFilterDesc vectorFilterDesc = new VectorFilterDesc();
filterDesc.setVectorDesc(vectorFilterDesc);
ExprNodeDesc predicateExpr = filterDesc.getPredicate();
VectorExpression vectorPredicateExpr =
vContext.getVectorExpression(predicateExpr, VectorExpressionDescriptor.Mode.FILTER);
vectorFilterDesc.setPredicateExpression(vectorPredicateExpr);
return OperatorFactory.getVectorOperator(
filterOp.getCompilationOpContext(), filterDesc, vContext);
}
/*
* NOTE: The VectorGroupByDesc has already been allocated and partially populated.
*/
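  /**
   * Vectorizes a GroupByOperator: the grouping keys and aggregators are compiled into vector
   * expressions, and the aggregation outputs are projected positionally into the output batch.
   */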
public static Operator<? extends OperatorDesc> vectorizeGroupByOperator(
Operator<? extends OperatorDesc> groupByOp, VectorizationContext vContext)
throws HiveException {
GroupByDesc groupByDesc = (GroupByDesc) groupByOp.getConf();
List<ExprNodeDesc> keysDesc = groupByDesc.getKeys();
VectorExpression[] vecKeyExpressions = vContext.getVectorExpressions(keysDesc);
ArrayList<AggregationDesc> aggrDesc = groupByDesc.getAggregators();
final int size = aggrDesc.size();
VectorAggregateExpression[] vecAggregators = new VectorAggregateExpression[size];
int[] projectedOutputColumns = new int[size];
for (int i = 0; i < size; ++i) {
AggregationDesc aggDesc = aggrDesc.get(i);
vecAggregators[i] = vContext.getAggregatorExpression(aggDesc);
// GroupBy generates a new vectorized row batch...
projectedOutputColumns[i] = i;
}
VectorGroupByDesc vectorGroupByDesc = (VectorGroupByDesc) groupByDesc.getVectorDesc();
vectorGroupByDesc.setKeyExpressions(vecKeyExpressions);
vectorGroupByDesc.setAggregators(vecAggregators);
vectorGroupByDesc.setProjectedOutputColumns(projectedOutputColumns);
return OperatorFactory.getVectorOperator(
groupByOp.getCompilationOpContext(), groupByDesc, vContext);
}
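  /**
   * Vectorizes a SelectOperator. Identity expressions (plain column references) are projected
   * directly; only real calculations are kept as select expressions to evaluate.
   */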
public static Operator<? extends OperatorDesc> vectorizeSelectOperator(
Operator<? extends OperatorDesc> selectOp, VectorizationContext vContext)
throws HiveException {
SelectDesc selectDesc = (SelectDesc) selectOp.getConf();
VectorSelectDesc vectorSelectDesc = new VectorSelectDesc();
selectDesc.setVectorDesc(vectorSelectDesc);
List<ExprNodeDesc> colList = selectDesc.getColList();
int index = 0;
final int size = colList.size();
VectorExpression[] vectorSelectExprs = new VectorExpression[size];
int[] projectedOutputColumns = new int[size];
for (int i = 0; i < size; i++) {
ExprNodeDesc expr = colList.get(i);
VectorExpression ve = vContext.getVectorExpression(expr);
projectedOutputColumns[i] = ve.getOutputColumn();
if (ve instanceof IdentityExpression) {
// Suppress useless evaluation.
continue;
}
vectorSelectExprs[index++] = ve;
}
if (index < size) {
vectorSelectExprs = Arrays.copyOf(vectorSelectExprs, index);
}
vectorSelectDesc.setSelectExpressions(vectorSelectExprs);
vectorSelectDesc.setProjectedOutputColumns(projectedOutputColumns);
return OperatorFactory.getVectorOperator(
selectOp.getCompilationOpContext(), selectDesc, vContext);
}
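  /**
   * Dispatches on the operator type to produce the vectorized counterpart of a row-mode operator,
   * wrapping or reusing it where no specialized class exists, and records in vectorTaskColumnInfo
   * whether the whole task stays native and whether any expression falls back to the
   * VectorUDFAdaptor.
   *
   * A minimal usage sketch (hypothetical call site; in practice this is invoked while walking the
   * operator tree during physical optimization):
   * <pre>
   *   Operator&lt;? extends OperatorDesc&gt; vectorOp =
   *       vectorizeOperator(op, vContext, isTezOrSpark, vectorTaskColumnInfo);
   * </pre>
   */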
public Operator<? extends OperatorDesc> vectorizeOperator(Operator<? extends OperatorDesc> op,
VectorizationContext vContext, boolean isTezOrSpark, VectorTaskColumnInfo vectorTaskColumnInfo)
throws HiveException {
Operator<? extends OperatorDesc> vectorOp = null;
boolean isNative;
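    // isNative == true means the resulting operator works directly on VectorizedRowBatch without
    // falling back to row-mode processing.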
switch (op.getType()) {
case TABLESCAN:
vectorOp = vectorizeTableScanOperator(op, vContext);
isNative = true;
break;
case MAPJOIN:
{
if (op instanceof MapJoinOperator) {
VectorMapJoinInfo vectorMapJoinInfo = new VectorMapJoinInfo();
MapJoinDesc desc = (MapJoinDesc) op.getConf();
boolean specialize = canSpecializeMapJoin(op, desc, isTezOrSpark, vContext, vectorMapJoinInfo);
if (!specialize) {
Class<? extends Operator<?>> opClass = null;
          // *NON-NATIVE* vector map join: choose the operator class based on whether this is an
          // outer join with filters on the big table.
List<ExprNodeDesc> bigTableFilters = desc.getFilters().get((byte) desc.getPosBigTable());
boolean isOuterAndFiltered = (!desc.isNoOuterJoin() && bigTableFilters.size() > 0);
if (!isOuterAndFiltered) {
opClass = VectorMapJoinOperator.class;
} else {
opClass = VectorMapJoinOuterFilteredOperator.class;
}
vectorOp = OperatorFactory.getVectorOperator(
opClass, op.getCompilationOpContext(), op.getConf(), vContext);
isNative = false;
} else {
// TEMPORARY Until Native Vector Map Join with Hybrid passes tests...
// HiveConf.setBoolVar(physicalContext.getConf(),
// HiveConf.ConfVars.HIVEUSEHYBRIDGRACEHASHJOIN, false);
vectorOp = specializeMapJoinOperator(op, vContext, desc, vectorMapJoinInfo);
isNative = true;
if (vectorTaskColumnInfo != null) {
if (usesVectorUDFAdaptor(vectorMapJoinInfo.getBigTableKeyExpressions())) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
if (usesVectorUDFAdaptor(vectorMapJoinInfo.getBigTableValueExpressions())) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
}
}
} else {
Preconditions.checkState(op instanceof SMBMapJoinOperator);
SMBJoinDesc smbJoinSinkDesc = (SMBJoinDesc) op.getConf();
VectorSMBJoinDesc vectorSMBJoinDesc = new VectorSMBJoinDesc();
smbJoinSinkDesc.setVectorDesc(vectorSMBJoinDesc);
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), smbJoinSinkDesc, vContext);
isNative = false;
}
}
break;
case REDUCESINK:
{
VectorReduceSinkInfo vectorReduceSinkInfo = new VectorReduceSinkInfo();
ReduceSinkDesc desc = (ReduceSinkDesc) op.getConf();
boolean specialize = canSpecializeReduceSink(desc, isTezOrSpark, vContext, vectorReduceSinkInfo);
if (!specialize) {
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), op.getConf(), vContext);
isNative = false;
} else {
vectorOp = specializeReduceSinkOperator(op, vContext, desc, vectorReduceSinkInfo);
isNative = true;
if (vectorTaskColumnInfo != null) {
if (usesVectorUDFAdaptor(vectorReduceSinkInfo.getReduceSinkKeyExpressions())) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
if (usesVectorUDFAdaptor(vectorReduceSinkInfo.getReduceSinkValueExpressions())) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
}
}
}
break;
case FILTER:
{
vectorOp = vectorizeFilterOperator(op, vContext);
isNative = true;
if (vectorTaskColumnInfo != null) {
VectorFilterDesc vectorFilterDesc =
(VectorFilterDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
VectorExpression vectorPredicateExpr = vectorFilterDesc.getPredicateExpression();
if (usesVectorUDFAdaptor(vectorPredicateExpr)) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
}
}
break;
case SELECT:
{
vectorOp = vectorizeSelectOperator(op, vContext);
isNative = true;
if (vectorTaskColumnInfo != null) {
VectorSelectDesc vectorSelectDesc =
(VectorSelectDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
VectorExpression[] vectorSelectExprs = vectorSelectDesc.getSelectExpressions();
if (usesVectorUDFAdaptor(vectorSelectExprs)) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
}
}
break;
case GROUPBY:
{
vectorOp = vectorizeGroupByOperator(op, vContext);
isNative = false;
if (vectorTaskColumnInfo != null) {
VectorGroupByDesc vectorGroupByDesc =
(VectorGroupByDesc) ((AbstractOperatorDesc) vectorOp.getConf()).getVectorDesc();
if (!vectorGroupByDesc.isVectorOutput()) {
vectorTaskColumnInfo.setGroupByVectorOutput(false);
}
VectorExpression[] vecKeyExpressions = vectorGroupByDesc.getKeyExpressions();
if (usesVectorUDFAdaptor(vecKeyExpressions)) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
VectorAggregateExpression[] vecAggregators = vectorGroupByDesc.getAggregators();
for (VectorAggregateExpression vecAggr : vecAggregators) {
if (usesVectorUDFAdaptor(vecAggr.inputExpression())) {
vectorTaskColumnInfo.setUsesVectorUDFAdaptor(true);
}
}
}
}
break;
case FILESINK:
{
FileSinkDesc fileSinkDesc = (FileSinkDesc) op.getConf();
VectorFileSinkDesc vectorFileSinkDesc = new VectorFileSinkDesc();
fileSinkDesc.setVectorDesc(vectorFileSinkDesc);
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), fileSinkDesc, vContext);
isNative = false;
}
break;
case LIMIT:
{
LimitDesc limitDesc = (LimitDesc) op.getConf();
VectorLimitDesc vectorLimitDesc = new VectorLimitDesc();
limitDesc.setVectorDesc(vectorLimitDesc);
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), limitDesc, vContext);
isNative = true;
}
break;
case EVENT:
{
AppMasterEventDesc eventDesc = (AppMasterEventDesc) op.getConf();
VectorAppMasterEventDesc vectorEventDesc = new VectorAppMasterEventDesc();
eventDesc.setVectorDesc(vectorEventDesc);
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), eventDesc, vContext);
isNative = true;
}
break;
case HASHTABLESINK:
{
SparkHashTableSinkDesc sparkHashTableSinkDesc = (SparkHashTableSinkDesc) op.getConf();
VectorSparkHashTableSinkDesc vectorSparkHashTableSinkDesc = new VectorSparkHashTableSinkDesc();
sparkHashTableSinkDesc.setVectorDesc(vectorSparkHashTableSinkDesc);
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), sparkHashTableSinkDesc, vContext);
isNative = true;
}
break;
case SPARKPRUNINGSINK:
{
SparkPartitionPruningSinkDesc sparkPartitionPruningSinkDesc = (SparkPartitionPruningSinkDesc) op.getConf();
VectorSparkPartitionPruningSinkDesc vectorSparkPartitionPruningSinkDesc = new VectorSparkPartitionPruningSinkDesc();
sparkPartitionPruningSinkDesc.setVectorDesc(vectorSparkPartitionPruningSinkDesc);
vectorOp = OperatorFactory.getVectorOperator(
op.getCompilationOpContext(), sparkPartitionPruningSinkDesc, vContext);
isNative = true;
}
break;
default:
// These are children of GROUP BY operators with non-vector outputs.
isNative = false;
vectorOp = op;
break;
}
Preconditions.checkState(vectorOp != null);
if (vectorTaskColumnInfo != null && !isNative) {
vectorTaskColumnInfo.setAllNative(false);
}
LOG.debug("vectorizeOperator " + vectorOp.getClass().getName());
LOG.debug("vectorizeOperator " + vectorOp.getConf().getClass().getName());
if (vectorOp != op) {
fixupParentChildOperators(op, vectorOp);
((AbstractOperatorDesc) vectorOp.getConf()).setVectorMode(true);
}
return vectorOp;
}
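  /**
   * Returns true when the column's internal name is one of the names listed in
   * VirtualColumn.VIRTUAL_COLUMN_NAMES; see the comment below for why
   * ColumnInfo.getIsVirtualCol() is deliberately not used.
   */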
private boolean isVirtualColumn(ColumnInfo column) {
// Not using method column.getIsVirtualCol() because partitioning columns are also
// treated as virtual columns in ColumnInfo.
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(column.getInternalName())) {
return true;
}
return false;
}
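  /**
   * Logs the column layout of the work's VectorizedRowBatchCtx (column names, type infos,
   * partition column count, and scratch column types) at DEBUG level for troubleshooting.
   */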
public void debugDisplayAllMaps(BaseWork work) {
VectorizedRowBatchCtx vectorizedRowBatchCtx = work.getVectorizedRowBatchCtx();
String[] allColumnNames = vectorizedRowBatchCtx.getRowColumnNames();
    TypeInfo[] columnTypeInfos = vectorizedRowBatchCtx.getRowColumnTypeInfos();
    int partitionColumnCount = vectorizedRowBatchCtx.getPartitionColumnCount();
    String[] scratchColumnTypeNames = vectorizedRowBatchCtx.getScratchColumnTypeNames();
    LOG.debug("debugDisplayAllMaps allColumnNames " + Arrays.toString(allColumnNames));
    LOG.debug("debugDisplayAllMaps columnTypeInfos " + Arrays.toString(columnTypeInfos));
    LOG.debug("debugDisplayAllMaps partitionColumnCount " + partitionColumnCount);
    LOG.debug("debugDisplayAllMaps scratchColumnTypeNames " + Arrays.toString(scratchColumnTypeNames));
}
}