/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.plan;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc.ReduceSinkKeyType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * ReduceSinkDesc.
 *
 */
@Explain(displayName = "Reduce Output Operator",
    explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public class ReduceSinkDesc extends AbstractOperatorDesc {
  private static final long serialVersionUID = 1L;

  /**
   * Key columns are passed to reducer in the "key".
   */
  private java.util.ArrayList<ExprNodeDesc> keyCols;
  private java.util.ArrayList<java.lang.String> outputKeyColumnNames;
  private List<List<Integer>> distinctColumnIndices;

  /**
   * Value columns are passed to reducer in the "value".
   */
  private java.util.ArrayList<ExprNodeDesc> valueCols;
  private java.util.ArrayList<java.lang.String> outputValueColumnNames;

  /**
   * Describe how to serialize the key.
   */
  private TableDesc keySerializeInfo;

  /**
   * Describe how to serialize the value.
   */
  private TableDesc valueSerializeInfo;

  /**
   * The tag for this reducesink descriptor.
   */
  private int tag;

  /**
   * Number of distribution keys.
   */
  private int numDistributionKeys;

  /**
   * Used in tez. Holds the name of the output
   * that this reduce sink is writing to.
   */
  private String outputName;

  /**
   * The partition columns (CLUSTER BY or DISTRIBUTE BY in Hive language).
   * Partition columns decide the reducer that the current row goes to.
   * Partition columns are not passed to reducer.
   */
  private java.util.ArrayList<ExprNodeDesc> partitionCols;

  private int numReducers;

  /**
   * Bucket information
   */
  private int numBuckets;
  private List<ExprNodeDesc> bucketCols;

  private int topN = -1;
  private float topNMemoryUsage = -1;
  // For group by: values with the same key as one of the top-N keys should
  // still be forwarded.
  private boolean mapGroupBy;
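  // A minimal sketch of how the Top-N fields above interact. Illustrative
  // only: the actual values are filled in by the limit push-down optimizer,
  // not by user code. For a query like
  //   SELECT key FROM src ORDER BY key LIMIT 10
  // the optimizer may set topN = 10 so this ReduceSink forwards only the top
  // ten keys it sees, and topNMemoryUsage bounds the fraction of heap the
  // in-memory Top-N hash may use:
  //
  //   rsDesc.setTopN(10);
  //   rsDesc.setTopNMemoryUsage(0.1f); // hypothetical value; normally taken
  //                                    // from hive.limit.pushdown.memory.usage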
  // Flag used to control how Top-N is handled for PTF/windowing partitions.
  private boolean isPTFReduceSink = false;
  private boolean skipTag; // Skip writing tags when feeding into mapjoin hashtable

  public static enum ReducerTraits {
    UNSET(0), // unset
    FIXED(1), // distribution of keys is fixed
    AUTOPARALLEL(2), // can change reducer count (ORDER BY can concat adjacent buckets)
    UNIFORM(3), // can redistribute into buckets uniformly (GROUP BY can)
    QUICKSTART(4); // do not wait for downstream tasks

    private final int trait;

    private ReducerTraits(int trait) {
      this.trait = trait;
    }
  }

  // The reducer traits; starts as UNSET until FIXED, UNIFORM, AUTOPARALLEL
  // and/or QUICKSTART are merged in via setReducerTraits().
  private EnumSet<ReducerTraits> reduceTraits = EnumSet.of(ReducerTraits.UNSET);

  // whether this RS is deduplicated
  private transient boolean isDeduplicated = false;

  // used by spark mode to decide whether global order is needed
  private transient boolean hasOrderBy = false;

  private static transient Logger LOG = LoggerFactory.getLogger(ReduceSinkDesc.class);

  public ReduceSinkDesc() {
  }

  public ReduceSinkDesc(ArrayList<ExprNodeDesc> keyCols, int numDistributionKeys,
      ArrayList<ExprNodeDesc> valueCols,
      ArrayList<String> outputKeyColumnNames,
      List<List<Integer>> distinctColumnIndices,
      ArrayList<String> outputValueColumnNames, int tag,
      ArrayList<ExprNodeDesc> partitionCols, int numReducers,
      final TableDesc keySerializeInfo, final TableDesc valueSerializeInfo) {
    this.keyCols = keyCols;
    this.numDistributionKeys = numDistributionKeys;
    this.valueCols = valueCols;
    this.outputKeyColumnNames = outputKeyColumnNames;
    this.outputValueColumnNames = outputValueColumnNames;
    this.tag = tag;
    this.numReducers = numReducers;
    this.partitionCols = partitionCols;
    this.keySerializeInfo = keySerializeInfo;
    this.valueSerializeInfo = valueSerializeInfo;
    this.distinctColumnIndices = distinctColumnIndices;
    this.setNumBuckets(-1);
    this.setBucketCols(null);
    this.vectorDesc = null;
  }

  @Override
  public Object clone() {
    ReduceSinkDesc desc = new ReduceSinkDesc();
    desc.setKeyCols((ArrayList<ExprNodeDesc>) getKeyCols().clone());
    desc.setValueCols((ArrayList<ExprNodeDesc>) getValueCols().clone());
    desc.setOutputKeyColumnNames((ArrayList<String>) getOutputKeyColumnNames().clone());
    List<List<Integer>> distinctColumnIndicesClone = new ArrayList<List<Integer>>();
    for (List<Integer> distinctColumnIndex : getDistinctColumnIndices()) {
      List<Integer> tmp = new ArrayList<Integer>();
      tmp.addAll(distinctColumnIndex);
      distinctColumnIndicesClone.add(tmp);
    }
    desc.setDistinctColumnIndices(distinctColumnIndicesClone);
    desc.setOutputValueColumnNames((ArrayList<String>) getOutputValueColumnNames().clone());
    desc.setNumDistributionKeys(getNumDistributionKeys());
    desc.setTag(getTag());
    desc.setNumReducers(getNumReducers());
    desc.setPartitionCols((ArrayList<ExprNodeDesc>) getPartitionCols().clone());
    desc.setKeySerializeInfo((TableDesc) getKeySerializeInfo().clone());
    desc.setValueSerializeInfo((TableDesc) getValueSerializeInfo().clone());
    desc.setNumBuckets(numBuckets);
    desc.setBucketCols(bucketCols);
    desc.setStatistics(this.getStatistics());
    desc.setSkipTag(skipTag);
    desc.reduceTraits = reduceTraits.clone();
    desc.setDeduplicated(isDeduplicated);
    desc.setHasOrderBy(hasOrderBy);
    if (vectorDesc != null) {
      throw new RuntimeException("Clone with vectorization desc not supported");
    }
    desc.vectorDesc = null;
    desc.outputName = outputName;
    return desc;
  }
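  // Illustrative example of the shapes the constructor above receives
  // (assumed, not produced by the planner): for
  //   SELECT key, count(value) FROM src GROUP BY key
  // the map-side ReduceSink would typically be built with
  //   keyCols       = [ key ]           -- shipped (sorted) in the record key
  //   partitionCols = [ key ]           -- rows with equal keys go to one reducer
  //   valueCols     = [ partial count ] -- shipped in the record value
  // so numDistributionKeys = 1 and no distinct column indices are needed.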
  public java.util.ArrayList<java.lang.String> getOutputKeyColumnNames() {
    return outputKeyColumnNames;
  }

  public void setOutputKeyColumnNames(
      java.util.ArrayList<java.lang.String> outputKeyColumnNames) {
    this.outputKeyColumnNames = outputKeyColumnNames;
  }

  public java.util.ArrayList<java.lang.String> getOutputValueColumnNames() {
    return outputValueColumnNames;
  }

  public void setOutputValueColumnNames(
      java.util.ArrayList<java.lang.String> outputValueColumnNames) {
    this.outputValueColumnNames = outputValueColumnNames;
  }

  @Explain(displayName = "key expressions")
  public String getKeyColString() {
    return PlanUtils.getExprListString(keyCols);
  }

  public java.util.ArrayList<ExprNodeDesc> getKeyCols() {
    return keyCols;
  }

  public void setKeyCols(final java.util.ArrayList<ExprNodeDesc> keyCols) {
    this.keyCols = keyCols;
  }

  public int getNumDistributionKeys() {
    return this.numDistributionKeys;
  }

  public void setNumDistributionKeys(int numKeys) {
    this.numDistributionKeys = numKeys;
  }

  @Explain(displayName = "value expressions")
  public String getValueColsString() {
    return PlanUtils.getExprListString(valueCols);
  }

  public java.util.ArrayList<ExprNodeDesc> getValueCols() {
    return valueCols;
  }

  public void setValueCols(final java.util.ArrayList<ExprNodeDesc> valueCols) {
    this.valueCols = valueCols;
  }

  @Explain(displayName = "Map-reduce partition columns")
  public String getParitionColsString() {
    return PlanUtils.getExprListString(partitionCols);
  }

  @Explain(displayName = "PartitionCols", explainLevels = { Level.USER })
  public String getUserLevelExplainParitionColsString() {
    return PlanUtils.getExprListString(partitionCols, true);
  }

  public java.util.ArrayList<ExprNodeDesc> getPartitionCols() {
    return partitionCols;
  }

  public void setPartitionCols(
      final java.util.ArrayList<ExprNodeDesc> partitionCols) {
    this.partitionCols = partitionCols;
  }

  public boolean isPartitioning() {
    if (partitionCols != null && !partitionCols.isEmpty()) {
      return true;
    }
    return false;
  }

  @Explain(displayName = "tag", explainLevels = { Level.EXTENDED })
  public int getTag() {
    return tag;
  }

  public void setTag(int tag) {
    this.tag = tag;
  }

  public int getTopN() {
    return topN;
  }

  public void setTopN(int topN) {
    this.topN = topN;
  }

  @Explain(displayName = "TopN", explainLevels = { Level.EXTENDED })
  public Integer getTopNExplain() {
    return topN > 0 ? topN : null;
  }

  public float getTopNMemoryUsage() {
    return topNMemoryUsage;
  }

  public void setTopNMemoryUsage(float topNMemoryUsage) {
    this.topNMemoryUsage = topNMemoryUsage;
  }

  @Explain(displayName = "TopN Hash Memory Usage")
  public Float getTopNMemoryUsageExplain() {
    return topN > 0 && topNMemoryUsage > 0 ? topNMemoryUsage : null;
  }

  public boolean isMapGroupBy() {
    return mapGroupBy;
  }

  public void setMapGroupBy(boolean mapGroupBy) {
    this.mapGroupBy = mapGroupBy;
  }

  public boolean isPTFReduceSink() {
    return isPTFReduceSink;
  }

  public void setPTFReduceSink(boolean isPTFReduceSink) {
    this.isPTFReduceSink = isPTFReduceSink;
  }

  /**
   * Returns the number of reducers for the map-reduce job. -1 means to decide
   * the number of reducers at runtime. This enables Hive to estimate the number
   * of reducers based on the map-reduce input data size, which is only
   * available right before we start the map-reduce job.
   */
  public int getNumReducers() {
    return numReducers;
  }

  public void setNumReducers(int numReducers) {
    this.numReducers = numReducers;
  }

  public TableDesc getKeySerializeInfo() {
    return keySerializeInfo;
  }

  public void setKeySerializeInfo(TableDesc keySerializeInfo) {
    this.keySerializeInfo = keySerializeInfo;
  }

  public TableDesc getValueSerializeInfo() {
    return valueSerializeInfo;
  }

  public void setValueSerializeInfo(TableDesc valueSerializeInfo) {
    this.valueSerializeInfo = valueSerializeInfo;
  }
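  // A minimal sketch of the order strings handled by the accessors below
  // (assumed example; one character per key column): for a three-column key
  // sorted ASC, ASC, DESC with NULLS LAST on the third column, one would set
  //
  //   rsDesc.setOrder("++-");
  //   rsDesc.setNullOrder("aaz");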
  /**
   * Returns the sort order of the key columns.
   *
   * @return null, which means ascending order for all key columns, or a String
   *         of the same length as key columns, that consists of only "+"
   *         (ascending order) and "-" (descending order).
   */
  @Explain(displayName = "sort order")
  public String getOrder() {
    return keySerializeInfo.getProperties().getProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER);
  }

  public void setOrder(String orderStr) {
    keySerializeInfo.getProperties().setProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER,
        orderStr);
  }

  public boolean isOrdering() {
    if (this.getOrder() != null && !this.getOrder().isEmpty()) {
      return true;
    }
    return false;
  }

  /**
   * Returns the null order in the key columns.
   *
   * @return null, which means default for all key columns, or a String
   *         of the same length as key columns, that consists of only "a"
   *         (null first) and "z" (null last).
   */
  @Explain(displayName = "null sort order", explainLevels = { Level.EXTENDED })
  public String getNullOrder() {
    return keySerializeInfo.getProperties().getProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_SORT_ORDER);
  }

  public void setNullOrder(String nullOrderStr) {
    keySerializeInfo.getProperties().setProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_SORT_ORDER,
        nullOrderStr);
  }

  public List<List<Integer>> getDistinctColumnIndices() {
    return distinctColumnIndices;
  }

  public void setDistinctColumnIndices(
      List<List<Integer>> distinctColumnIndices) {
    this.distinctColumnIndices = distinctColumnIndices;
  }

  @Explain(displayName = "outputname", explainLevels = { Level.USER })
  public String getOutputName() {
    return outputName;
  }

  public void setOutputName(String outputName) {
    this.outputName = outputName;
  }

  public int getNumBuckets() {
    return numBuckets;
  }

  public void setNumBuckets(int numBuckets) {
    this.numBuckets = numBuckets;
  }

  public List<ExprNodeDesc> getBucketCols() {
    return bucketCols;
  }

  public void setBucketCols(List<ExprNodeDesc> bucketCols) {
    this.bucketCols = bucketCols;
  }

  public void setSkipTag(boolean value) {
    this.skipTag = value;
  }

  public boolean getSkipTag() {
    return skipTag;
  }

  @Explain(displayName = "auto parallelism", explainLevels = { Level.EXTENDED })
  public final boolean isAutoParallel() {
    return (this.reduceTraits.contains(ReducerTraits.AUTOPARALLEL));
  }

  public final boolean isSlowStart() {
    return !(this.reduceTraits.contains(ReducerTraits.QUICKSTART));
  }

  @Explain(displayName = "quick start", displayOnlyOnTrue = true,
      explainLevels = { Explain.Level.EXTENDED })
  public final boolean isQuickStart() {
    return !isSlowStart();
  }

  public final EnumSet<ReducerTraits> getReducerTraits() {
    return this.reduceTraits;
  }
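  // A short sketch of the trait merging implemented below (the behavior
  // follows directly from the code; the call sequence itself is hypothetical):
  //
  //   desc.setReducerTraits(EnumSet.of(ReducerTraits.AUTOPARALLEL));
  //   desc.setReducerTraits(EnumSet.of(ReducerTraits.FIXED));
  //   // traits are now { FIXED }: AUTOPARALLEL and UNIFORM were removed
  //   desc.setReducerTraits(EnumSet.of(ReducerTraits.AUTOPARALLEL));
  //   // no-op: once FIXED is set, auto parallelism cannot be re-enabled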
  public final void setReducerTraits(EnumSet<ReducerTraits> traits) {
    // We don't allow turning auto parallelism back on once it has been
    // explicitly turned off. That is to avoid scenarios where auto
    // parallelism could break assumptions about the number of reducers
    // or the hash function.
    boolean wasUnset = this.reduceTraits.remove(ReducerTraits.UNSET);

    if (this.reduceTraits.contains(ReducerTraits.FIXED)) {
      return;
    } else if (traits.contains(ReducerTraits.FIXED)) {
      this.reduceTraits.removeAll(EnumSet.of(
          ReducerTraits.AUTOPARALLEL,
          ReducerTraits.UNIFORM));
      this.reduceTraits.addAll(traits);
    } else {
      this.reduceTraits.addAll(traits);
    }
  }

  public boolean isDeduplicated() {
    return isDeduplicated;
  }

  public void setDeduplicated(boolean isDeduplicated) {
    this.isDeduplicated = isDeduplicated;
  }

  public boolean hasOrderBy() {
    return hasOrderBy;
  }

  public void setHasOrderBy(boolean hasOrderBy) {
    this.hasOrderBy = hasOrderBy;
  }

  // Use LinkedHashSet to give predictable display order.
  private static final Set<String> vectorizableReduceSinkNativeEngines =
      new LinkedHashSet<String>(Arrays.asList("tez", "spark"));

  public class ReduceSinkOperatorExplainVectorization extends OperatorExplainVectorization {

    private final ReduceSinkDesc reduceSinkDesc;
    private final VectorReduceSinkDesc vectorReduceSinkDesc;
    private final VectorReduceSinkInfo vectorReduceSinkInfo;

    private VectorizationCondition[] nativeConditions;

    public ReduceSinkOperatorExplainVectorization(ReduceSinkDesc reduceSinkDesc,
        VectorDesc vectorDesc) {
      // The plain VectorReduceSinkOperator (key type NONE) is not native
      // vectorized; only the specialized key types are.
      super(vectorDesc,
          ((VectorReduceSinkDesc) vectorDesc).reduceSinkKeyType() != ReduceSinkKeyType.NONE);
      this.reduceSinkDesc = reduceSinkDesc;
      vectorReduceSinkDesc = (VectorReduceSinkDesc) vectorDesc;
      vectorReduceSinkInfo = vectorReduceSinkDesc.getVectorReduceSinkInfo();
    }

    @Explain(vectorization = Vectorization.EXPRESSION, displayName = "keyExpressions",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getKeyExpression() {
      if (!isNative) {
        return null;
      }
      return vectorExpressionsToStringList(vectorReduceSinkInfo.getReduceSinkKeyExpressions());
    }

    @Explain(vectorization = Vectorization.EXPRESSION, displayName = "valueExpressions",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getValueExpression() {
      if (!isNative) {
        return null;
      }
      return vectorExpressionsToStringList(vectorReduceSinkInfo.getReduceSinkValueExpressions());
    }

    @Explain(vectorization = Vectorization.DETAIL, displayName = "keyColumns",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getKeyColumns() {
      if (!isNative) {
        return null;
      }
      int[] keyColumnMap = vectorReduceSinkInfo.getReduceSinkKeyColumnMap();
      if (keyColumnMap == null) {
        // Always show an array.
        keyColumnMap = new int[0];
      }
      return Arrays.toString(keyColumnMap);
    }

    @Explain(vectorization = Vectorization.DETAIL, displayName = "valueColumns",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getValueColumns() {
      if (!isNative) {
        return null;
      }
      int[] valueColumnMap = vectorReduceSinkInfo.getReduceSinkValueColumnMap();
      if (valueColumnMap == null) {
        // Always show an array.
        valueColumnMap = new int[0];
      }
      return Arrays.toString(valueColumnMap);
    }
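    // Note on rendering: the column maps above and below are printed with
    // Arrays.toString(), so a map covering batch columns 0 and 2 displays as
    // "[0, 2]". Key/value maps always render (an empty map shows as "[]"),
    // while the bucket and partition maps below are suppressed when empty.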
    @Explain(vectorization = Vectorization.DETAIL, displayName = "bucketColumns",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getBucketColumns() {
      if (!isNative) {
        return null;
      }
      int[] bucketColumnMap = vectorReduceSinkInfo.getReduceSinkBucketColumnMap();
      if (bucketColumnMap == null || bucketColumnMap.length == 0) {
        // Suppress empty column map.
        return null;
      }
      return Arrays.toString(bucketColumnMap);
    }

    @Explain(vectorization = Vectorization.DETAIL, displayName = "partitionColumns",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getPartitionColumns() {
      if (!isNative) {
        return null;
      }
      int[] partitionColumnMap = vectorReduceSinkInfo.getReduceSinkPartitionColumnMap();
      if (partitionColumnMap == null || partitionColumnMap.length == 0) {
        // Suppress empty column map.
        return null;
      }
      return Arrays.toString(partitionColumnMap);
    }

    private VectorizationCondition[] createNativeConditions() {

      boolean enabled = vectorReduceSinkDesc.getIsVectorizationReduceSinkNativeEnabled();

      String engine = vectorReduceSinkDesc.getEngine();
      String engineInSupportedCondName =
          HiveConf.ConfVars.HIVE_EXECUTION_ENGINE.varname + " " + engine + " IN " +
          vectorizableReduceSinkNativeEngines;
      boolean engineInSupported = vectorizableReduceSinkNativeEngines.contains(engine);

      VectorizationCondition[] conditions = new VectorizationCondition[] {
          new VectorizationCondition(
              enabled,
              HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED.varname),
          new VectorizationCondition(
              engineInSupported,
              engineInSupportedCondName),
          new VectorizationCondition(
              !vectorReduceSinkDesc.getHasPTFTopN(),
              "No PTF TopN"),
          new VectorizationCondition(
              !vectorReduceSinkDesc.getHasDistinctColumns(),
              "No DISTINCT columns"),
          new VectorizationCondition(
              vectorReduceSinkDesc.getIsKeyBinarySortable(),
              "BinarySortableSerDe for keys"),
          new VectorizationCondition(
              vectorReduceSinkDesc.getIsValueLazyBinary(),
              "LazyBinarySerDe for values")
      };
      if (vectorReduceSinkDesc.getIsUnexpectedCondition()) {
        VectorizationCondition[] newConditions =
            new VectorizationCondition[conditions.length + 1];
        System.arraycopy(conditions, 0, newConditions, 0, conditions.length);
        newConditions[conditions.length] =
            new VectorizationCondition(
                false,
                "NOT UnexpectedCondition");
        conditions = newConditions;
      }

      return conditions;
    }

    @Explain(vectorization = Vectorization.OPERATOR, displayName = "nativeConditionsMet",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getNativeConditionsMet() {
      if (nativeConditions == null) {
        nativeConditions = createNativeConditions();
      }
      return VectorizationCondition.getConditionsMet(nativeConditions);
    }

    @Explain(vectorization = Vectorization.OPERATOR, displayName = "nativeConditionsNotMet",
        explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getNativeConditionsNotMet() {
      if (nativeConditions == null) {
        nativeConditions = createNativeConditions();
      }
      return VectorizationCondition.getConditionsNotMet(nativeConditions);
    }
  }

  @Explain(vectorization = Vectorization.OPERATOR, displayName = "Reduce Sink Vectorization",
      explainLevels = { Level.DEFAULT, Level.EXTENDED })
  public ReduceSinkOperatorExplainVectorization getReduceSinkVectorization() {
    if (vectorDesc == null) {
      return null;
    }
    return new ReduceSinkOperatorExplainVectorization(this, vectorDesc);
  }
}