/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;
import org.apache.hadoop.hive.ql.plan.VectorReduceSinkDesc.ReduceSinkKeyType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * ReduceSinkDesc.
 *
 * Plan descriptor for a reduce sink. It carries the key, value and partition
 * expressions, the descriptors used to serialize keys and values, the tag,
 * the requested number of reducers, bucketing information, TopN settings and
 * the reducer traits.
 */
@Explain(displayName = "Reduce Output Operator", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public class ReduceSinkDesc extends AbstractOperatorDesc {
  private static final long serialVersionUID = 1L;

  /**
   * Key columns are passed to reducer in the "key".
   */
  private ArrayList<ExprNodeDesc> keyCols;
  private ArrayList<String> outputKeyColumnNames;
  private List<List<Integer>> distinctColumnIndices;

  /**
   * Value columns are passed to reducer in the "value".
   */
  private ArrayList<ExprNodeDesc> valueCols;
  private ArrayList<String> outputValueColumnNames;

  /**
   * Describe how to serialize the key.
   */
  private TableDesc keySerializeInfo;

  /**
   * Describe how to serialize the value.
   */
  private TableDesc valueSerializeInfo;

  /**
   * The tag for this reducesink descriptor.
   */
  private int tag;

  /**
   * Number of distribution keys.
   */
  private int numDistributionKeys;

  /**
   * Used in tez. Holds the name of the output
   * that this reduce sink is writing to.
   */
  private String outputName;

  /**
   * The partition columns (CLUSTER BY or DISTRIBUTE BY in Hive language).
   * Partition columns decide the reducer that the current row goes to.
   * Partition columns are not passed to reducer.
   */
  private ArrayList<ExprNodeDesc> partitionCols;

  private int numReducers;

  /**
   * Bucket information.
   */
  private int numBuckets;
  private List<ExprNodeDesc> bucketCols;

  // TopN settings; -1 means "not set" (the explain getters only report values > 0).
  private int topN = -1;
  private float topNMemoryUsage = -1;
  private boolean mapGroupBy; // for group-by, values with same key on top-K should be forwarded

  // Flag used to control how TopN is handled for PTF/Windowing partitions.
  private boolean isPTFReduceSink = false;

  private boolean skipTag; // Skip writing tags when feeding into mapjoin hashtable

  /**
   * Traits describing the reducer that consumes the output of a reduce sink.
   */
  public static enum ReducerTraits {
    UNSET(0), // unset
    FIXED(1), // distribution of keys is fixed
    AUTOPARALLEL(2), // can change reducer count (ORDER BY can concat adjacent buckets)
    UNIFORM(3), // can redistribute into buckets uniformly (GROUP BY can)
    QUICKSTART(4); // do not wait for downstream tasks

    private final int trait;

    private ReducerTraits(int trait) {
      this.trait = trait;
    }
  }

  // Reducer traits of this sink; holds only UNSET until setReducerTraits() is called.
  private EnumSet<ReducerTraits> reduceTraits = EnumSet.of(ReducerTraits.UNSET);

  // whether this RS is deduplicated
  private transient boolean isDeduplicated = false;

  // used by spark mode to decide whether global order is needed
  private transient boolean hasOrderBy = false;

  private static final Logger LOG = LoggerFactory.getLogger(ReduceSinkDesc.class);

  /**
   * Creates an empty descriptor; all reference fields start out as null.
   */
  public ReduceSinkDesc() {
  }

  /**
   * Creates a descriptor from its main components. The arguments are stored
   * as given (no defensive copies). Bucketing is initialized to "none"
   * (numBuckets = -1, bucketCols = null) and the vectorization descriptor is
   * cleared.
   */
  public ReduceSinkDesc(ArrayList<ExprNodeDesc> keyCols,
      int numDistributionKeys,
      ArrayList<ExprNodeDesc> valueCols,
      ArrayList<String> outputKeyColumnNames,
      List<List<Integer>> distinctColumnIndices,
      ArrayList<String> outputValueColumnNames, int tag,
      ArrayList<ExprNodeDesc> partitionCols, int numReducers,
      final TableDesc keySerializeInfo, final TableDesc valueSerializeInfo) {
    this.keyCols = keyCols;
    this.numDistributionKeys = numDistributionKeys;
    this.valueCols = valueCols;
    this.outputKeyColumnNames = outputKeyColumnNames;
    this.outputValueColumnNames = outputValueColumnNames;
    this.tag = tag;
    this.numReducers = numReducers;
    this.partitionCols = partitionCols;
    this.keySerializeInfo = keySerializeInfo;
    this.valueSerializeInfo = valueSerializeInfo;
    this.distinctColumnIndices = distinctColumnIndices;
    // Assign the fields directly rather than calling overridable setters
    // from a constructor.
    this.numBuckets = -1;
    this.bucketCols = null;
    this.vectorDesc = null;
  }

  /**
   * Returns a copy of this descriptor.
   *
   * The column lists and the distinct-column index lists are copied (the
   * expression elements themselves are shared), the key/value serialization
   * descriptors are cloned, and the reducer traits are cloned. The bucket
   * column list and the statistics object are shared with the original.
   * Fields that are null in this descriptor stay null in the copy.
   *
   * NOTE(review): topN, topNMemoryUsage, mapGroupBy and isPTFReduceSink are
   * not carried over to the copy; this matches the previous behavior --
   * confirm it is intentional.
   *
   * @return the new ReduceSinkDesc
   * @throws RuntimeException if a vectorization descriptor is attached
   */
  @Override
  public Object clone() {
    // Refuse up front instead of building a copy that is then thrown away.
    if (vectorDesc != null) {
      throw new RuntimeException("Clone with vectorization desc not supported");
    }

    ReduceSinkDesc desc = new ReduceSinkDesc();
    desc.setKeyCols(copyList(getKeyCols()));
    desc.setValueCols(copyList(getValueCols()));
    desc.setOutputKeyColumnNames(copyList(getOutputKeyColumnNames()));
    desc.setOutputValueColumnNames(copyList(getOutputValueColumnNames()));
    desc.setPartitionCols(copyList(getPartitionCols()));

    List<List<Integer>> sourceIndices = getDistinctColumnIndices();
    if (sourceIndices != null) {
      List<List<Integer>> indicesCopy = new ArrayList<List<Integer>>(sourceIndices.size());
      for (List<Integer> indices : sourceIndices) {
        indicesCopy.add(new ArrayList<Integer>(indices));
      }
      desc.setDistinctColumnIndices(indicesCopy);
    }

    desc.setNumDistributionKeys(getNumDistributionKeys());
    desc.setTag(getTag());
    desc.setNumReducers(getNumReducers());
    desc.setKeySerializeInfo(copyTableDesc(getKeySerializeInfo()));
    desc.setValueSerializeInfo(copyTableDesc(getValueSerializeInfo()));
    desc.setNumBuckets(numBuckets);
    desc.setBucketCols(bucketCols);
    desc.setStatistics(this.getStatistics());
    desc.setSkipTag(skipTag);
    desc.reduceTraits = reduceTraits.clone();
    desc.setDeduplicated(isDeduplicated);
    desc.setHasOrderBy(hasOrderBy);
    desc.vectorDesc = null;
    desc.outputName = outputName;
    return desc;
  }

  /** Returns a shallow copy of the given list, or null when the list is null. */
  private static <T> ArrayList<T> copyList(ArrayList<T> source) {
    return source == null ? null : new ArrayList<T>(source);
  }

  /** Returns a clone of the given table descriptor, or null when it is null. */
  private static TableDesc copyTableDesc(TableDesc source) {
    return source == null ? null : (TableDesc) source.clone();
  }

  public ArrayList<String> getOutputKeyColumnNames() {
    return outputKeyColumnNames;
  }

  public void setOutputKeyColumnNames(
      ArrayList<String> outputKeyColumnNames) {
    this.outputKeyColumnNames = outputKeyColumnNames;
  }

  public ArrayList<String> getOutputValueColumnNames() {
    return outputValueColumnNames;
  }

  public void setOutputValueColumnNames(
      ArrayList<String> outputValueColumnNames) {
    this.outputValueColumnNames = outputValueColumnNames;
  }

  /**
   * @return the key expressions rendered as a string for EXPLAIN output
   */
  @Explain(displayName = "key expressions")
  public String getKeyColString() {
    return PlanUtils.getExprListString(keyCols);
  }

  public ArrayList<ExprNodeDesc> getKeyCols() {
    return keyCols;
  }

  public void setKeyCols(final ArrayList<ExprNodeDesc> keyCols) {
    this.keyCols = keyCols;
  }

  public int getNumDistributionKeys() {
    return this.numDistributionKeys;
  }

  public void setNumDistributionKeys(int numKeys) {
    this.numDistributionKeys = numKeys;
  }

  /**
   * @return the value expressions rendered as a string for EXPLAIN output
   */
  @Explain(displayName = "value expressions")
  public String getValueColsString() {
    return PlanUtils.getExprListString(valueCols);
  }

  public ArrayList<ExprNodeDesc> getValueCols() {
    return valueCols;
  }

  public void setValueCols(final ArrayList<ExprNodeDesc> valueCols) {
    this.valueCols = valueCols;
  }

  /**
   * @return the partition expressions rendered as a string for EXPLAIN output
   */
  @Explain(displayName = "Map-reduce partition columns")
  public String getParitionColsString() {
    return PlanUtils.getExprListString(partitionCols);
  }

  /**
   * @return the partition expressions rendered for the USER explain level
   */
  @Explain(displayName = "PartitionCols", explainLevels = { Level.USER })
  public String getUserLevelExplainParitionColsString() {
    return PlanUtils.getExprListString(partitionCols, true);
  }

  public ArrayList<ExprNodeDesc> getPartitionCols() {
    return partitionCols;
  }

  public void setPartitionCols(
      final ArrayList<ExprNodeDesc> partitionCols) {
    this.partitionCols = partitionCols;
  }

  /**
   * @return true when at least one partition column is set
   */
  public boolean isPartitioning() {
    return partitionCols != null && !partitionCols.isEmpty();
  }

  @Explain(displayName = "tag", explainLevels = { Level.EXTENDED })
  public int getTag() {
    return tag;
  }

  public void setTag(int tag) {
    this.tag = tag;
  }

  public int getTopN() {
    return topN;
  }

  public void setTopN(int topN) {
    this.topN = topN;
  }

  /**
   * @return topN when it is positive, otherwise null so EXPLAIN has nothing to show
   */
  @Explain(displayName = "TopN", explainLevels = { Level.EXTENDED })
  public Integer getTopNExplain() {
    return topN > 0 ? topN : null;
  }

  public float getTopNMemoryUsage() {
    return topNMemoryUsage;
  }

  public void setTopNMemoryUsage(float topNMemoryUsage) {
    this.topNMemoryUsage = topNMemoryUsage;
  }

  /**
   * @return the TopN memory usage when both topN and the memory usage are
   *         positive, otherwise null so EXPLAIN has nothing to show
   */
  @Explain(displayName = "TopN Hash Memory Usage")
  public Float getTopNMemoryUsageExplain() {
    return topN > 0 && topNMemoryUsage > 0 ? topNMemoryUsage : null;
  }

  public boolean isMapGroupBy() {
    return mapGroupBy;
  }

  public void setMapGroupBy(boolean mapGroupBy) {
    this.mapGroupBy = mapGroupBy;
  }

  public boolean isPTFReduceSink() {
    return isPTFReduceSink;
  }

  public void setPTFReduceSink(boolean isPTFReduceSink) {
    this.isPTFReduceSink = isPTFReduceSink;
  }

  /**
   * Returns the number of reducers for the map-reduce job. -1 means to decide
   * the number of reducers at runtime. This enables Hive to estimate the number
   * of reducers based on the map-reduce input data size, which is only
   * available right before we start the map-reduce job.
   */
  public int getNumReducers() {
    return numReducers;
  }

  public void setNumReducers(int numReducers) {
    this.numReducers = numReducers;
  }

  public TableDesc getKeySerializeInfo() {
    return keySerializeInfo;
  }

  public void setKeySerializeInfo(TableDesc keySerializeInfo) {
    this.keySerializeInfo = keySerializeInfo;
  }

  public TableDesc getValueSerializeInfo() {
    return valueSerializeInfo;
  }

  public void setValueSerializeInfo(TableDesc valueSerializeInfo) {
    this.valueSerializeInfo = valueSerializeInfo;
  }

  /**
   * Returns the sort order of the key columns, read from the properties of
   * the key serialization descriptor.
   *
   * @return null, which means ascending order for all key columns, or a String
   *         of the same length as key columns, that consists of only "+"
   *         (ascending order) and "-" (descending order).
   */
  @Explain(displayName = "sort order")
  public String getOrder() {
    return keySerializeInfo.getProperties().getProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER);
  }

  /**
   * Stores the sort order string in the properties of the key serialization
   * descriptor.
   */
  public void setOrder(String orderStr) {
    keySerializeInfo.getProperties().setProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_SORT_ORDER,
        orderStr);
  }

  /**
   * @return true when a non-empty sort order string is set
   */
  public boolean isOrdering() {
    // Read the property once instead of once per check.
    String order = this.getOrder();
    return order != null && !order.isEmpty();
  }

  /**
   * Returns the null order in the key columns, read from the properties of
   * the key serialization descriptor.
   *
   * @return null, which means default for all key columns, or a String
   *         of the same length as key columns, that consists of only "a"
   *         (null first) and "z" (null last).
   */
  @Explain(displayName = "null sort order", explainLevels = { Level.EXTENDED })
  public String getNullOrder() {
    return keySerializeInfo.getProperties().getProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_SORT_ORDER);
  }

  /**
   * Stores the null order string in the properties of the key serialization
   * descriptor.
   */
  public void setNullOrder(String nullOrderStr) {
    keySerializeInfo.getProperties().setProperty(
        org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_NULL_SORT_ORDER,
        nullOrderStr);
  }

  public List<List<Integer>> getDistinctColumnIndices() {
    return distinctColumnIndices;
  }

  public void setDistinctColumnIndices(
      List<List<Integer>> distinctColumnIndices) {
    this.distinctColumnIndices = distinctColumnIndices;
  }

  @Explain(displayName = "outputname", explainLevels = { Level.USER })
  public String getOutputName() {
    return outputName;
  }

  public void setOutputName(String outputName) {
    this.outputName = outputName;
  }

  public int getNumBuckets() {
    return numBuckets;
  }

  public void setNumBuckets(int numBuckets) {
    this.numBuckets = numBuckets;
  }

  public List<ExprNodeDesc> getBucketCols() {
    return bucketCols;
  }

  public void setBucketCols(List<ExprNodeDesc> bucketCols) {
    this.bucketCols = bucketCols;
  }

  public void setSkipTag(boolean value) {
    this.skipTag = value;
  }

  public boolean getSkipTag() {
    return skipTag;
  }

  /**
   * @return true when the AUTOPARALLEL trait is set
   */
  @Explain(displayName = "auto parallelism", explainLevels = { Level.EXTENDED })
  public final boolean isAutoParallel() {
    return (this.reduceTraits.contains(ReducerTraits.AUTOPARALLEL));
  }

  /**
   * @return true unless the QUICKSTART trait is set
   */
  public final boolean isSlowStart() {
    return !(this.reduceTraits.contains(ReducerTraits.QUICKSTART));
  }

  /**
   * @return true when the QUICKSTART trait is set
   */
  @Explain(displayName = "quick start", displayOnlyOnTrue = true, explainLevels = {Explain.Level.EXTENDED })
  public final boolean isQuickStart() {
    return !isSlowStart();
  }

  /**
   * @return the live trait set of this descriptor (not a copy)
   */
  public final EnumSet<ReducerTraits> getReducerTraits() {
    return this.reduceTraits;
  }

  /**
   * Merges the given traits into this descriptor's trait set.
   *
   * UNSET is always dropped from the current set first. If FIXED is already
   * present the call changes nothing else. If the new traits contain FIXED,
   * AUTOPARALLEL and UNIFORM are removed before the new traits are added.
   * Otherwise the new traits are simply added.
   *
   * @param traits the traits to merge in
   */
  public final void setReducerTraits(EnumSet<ReducerTraits> traits) {
    // we don't allow turning on auto parallel once it has been
    // explicitly turned off. That is to avoid scenarios where
    // auto parallelism could break assumptions about number of
    // reducers or hash function.
    this.reduceTraits.remove(ReducerTraits.UNSET);

    if (this.reduceTraits.contains(ReducerTraits.FIXED)) {
      return;
    }
    if (traits.contains(ReducerTraits.FIXED)) {
      this.reduceTraits.removeAll(EnumSet.of(
          ReducerTraits.AUTOPARALLEL,
          ReducerTraits.UNIFORM));
    }
    this.reduceTraits.addAll(traits);
  }

  public boolean isDeduplicated() {
    return isDeduplicated;
  }

  public void setDeduplicated(boolean isDeduplicated) {
    this.isDeduplicated = isDeduplicated;
  }

  public boolean hasOrderBy() {
    return hasOrderBy;
  }

  public void setHasOrderBy(boolean hasOrderBy) {
    this.hasOrderBy = hasOrderBy;
  }

  // Use LinkedHashSet to give predictable display order.
  private static final Set<String> vectorizableReduceSinkNativeEngines =
      new LinkedHashSet<String>(Arrays.asList("tez", "spark"));

  /**
   * EXPLAIN VECTORIZATION view of a reduce sink. Column and expression
   * details are only reported when the operator is native vectorized; the
   * native conditions report which prerequisites were met or not met.
   */
  public class ReduceSinkOperatorExplainVectorization extends OperatorExplainVectorization {

    private final ReduceSinkDesc reduceSinkDesc;
    private final VectorReduceSinkDesc vectorReduceSinkDesc;
    private final VectorReduceSinkInfo vectorReduceSinkInfo;

    // Built lazily on first use by getOrCreateNativeConditions().
    private VectorizationCondition[] nativeConditions;

    /**
     * @param reduceSinkDesc the reduce sink descriptor being explained
     * @param vectorDesc the vectorization descriptor; must be a VectorReduceSinkDesc
     */
    public ReduceSinkOperatorExplainVectorization(ReduceSinkDesc reduceSinkDesc, VectorDesc vectorDesc) {
      // VectorReduceSinkOperator is not native vectorized.
      super(vectorDesc, ((VectorReduceSinkDesc) vectorDesc).reduceSinkKeyType()!= ReduceSinkKeyType.NONE);
      this.reduceSinkDesc = reduceSinkDesc;
      vectorReduceSinkDesc = (VectorReduceSinkDesc) vectorDesc;
      vectorReduceSinkInfo = vectorReduceSinkDesc.getVectorReduceSinkInfo();
    }

    /**
     * @return the key expressions as strings, or null when not native
     */
    @Explain(vectorization = Vectorization.EXPRESSION, displayName = "keyExpressions", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getKeyExpression() {
      if (!isNative) {
        return null;
      }
      return vectorExpressionsToStringList(vectorReduceSinkInfo.getReduceSinkKeyExpressions());
    }

    /**
     * @return the value expressions as strings, or null when not native
     */
    @Explain(vectorization = Vectorization.EXPRESSION, displayName = "valueExpressions", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getValueExpression() {
      if (!isNative) {
        return null;
      }
      return vectorExpressionsToStringList(vectorReduceSinkInfo.getReduceSinkValueExpressions());
    }

    /**
     * @return the key column map as text ("[]" when there is none), or null
     *         when not native
     */
    @Explain(vectorization = Vectorization.DETAIL, displayName = "keyColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getKeyColumns() {
      if (!isNative) {
        return null;
      }
      int[] keyColumnMap = vectorReduceSinkInfo.getReduceSinkKeyColumnMap();
      // Always show an array.
      return Arrays.toString(keyColumnMap == null ? new int[0] : keyColumnMap);
    }

    /**
     * @return the value column map as text ("[]" when there is none), or null
     *         when not native
     */
    @Explain(vectorization = Vectorization.DETAIL, displayName = "valueColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getValueColumns() {
      if (!isNative) {
        return null;
      }
      int[] valueColumnMap = vectorReduceSinkInfo.getReduceSinkValueColumnMap();
      // Always show an array.
      return Arrays.toString(valueColumnMap == null ? new int[0] : valueColumnMap);
    }

    /**
     * @return the bucket column map as text, or null when not native or when
     *         the map is missing or empty
     */
    @Explain(vectorization = Vectorization.DETAIL, displayName = "bucketColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getBucketColumns() {
      if (!isNative) {
        return null;
      }
      int[] bucketColumnMap = vectorReduceSinkInfo.getReduceSinkBucketColumnMap();
      if (bucketColumnMap == null || bucketColumnMap.length == 0) {
        // Suppress empty column map.
        return null;
      }
      return Arrays.toString(bucketColumnMap);
    }

    /**
     * @return the partition column map as text, or null when not native or
     *         when the map is missing or empty
     */
    @Explain(vectorization = Vectorization.DETAIL, displayName = "partitionColumns", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public String getPartitionColumns() {
      if (!isNative) {
        return null;
      }
      int[] partitionColumnMap = vectorReduceSinkInfo.getReduceSinkPartitionColumnMap();
      if (partitionColumnMap == null || partitionColumnMap.length == 0) {
        // Suppress empty column map.
        return null;
      }
      return Arrays.toString(partitionColumnMap);
    }

    /**
     * Builds the list of conditions for native vectorization, in display
     * order. An extra, always-false condition is appended when the
     * vectorization descriptor flags an unexpected condition.
     */
    private VectorizationCondition[] createNativeConditions() {

      boolean enabled = vectorReduceSinkDesc.getIsVectorizationReduceSinkNativeEnabled();

      String engine = vectorReduceSinkDesc.getEngine();
      String engineInSupportedCondName =
          HiveConf.ConfVars.HIVE_EXECUTION_ENGINE.varname + " " + engine + " IN " + vectorizableReduceSinkNativeEngines;
      boolean engineInSupported = vectorizableReduceSinkNativeEngines.contains(engine);

      VectorizationCondition[] conditions = new VectorizationCondition[] {
          new VectorizationCondition(
              enabled,
              HiveConf.ConfVars.HIVE_VECTORIZATION_REDUCESINK_NEW_ENABLED.varname),
          new VectorizationCondition(
              engineInSupported,
              engineInSupportedCondName),
          new VectorizationCondition(
              !vectorReduceSinkDesc.getHasPTFTopN(),
              "No PTF TopN"),
          new VectorizationCondition(
              !vectorReduceSinkDesc.getHasDistinctColumns(),
              "No DISTINCT columns"),
          new VectorizationCondition(
              vectorReduceSinkDesc.getIsKeyBinarySortable(),
              "BinarySortableSerDe for keys"),
          new VectorizationCondition(
              vectorReduceSinkDesc.getIsValueLazyBinary(),
              "LazyBinarySerDe for values")
      };

      if (vectorReduceSinkDesc.getIsUnexpectedCondition()) {
        // Grow by one slot and put the failing condition last.
        int lastIndex = conditions.length;
        conditions = Arrays.copyOf(conditions, lastIndex + 1);
        conditions[lastIndex] =
            new VectorizationCondition(
                false,
                "NOT UnexpectedCondition");
      }
      return conditions;
    }

    /** Returns the native conditions, creating them on first use. */
    private VectorizationCondition[] getOrCreateNativeConditions() {
      if (nativeConditions == null) {
        nativeConditions = createNativeConditions();
      }
      return nativeConditions;
    }

    /**
     * @return the native vectorization conditions that are met
     */
    @Explain(vectorization = Vectorization.OPERATOR, displayName = "nativeConditionsMet", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getNativeConditionsMet() {
      return VectorizationCondition.getConditionsMet(getOrCreateNativeConditions());
    }

    /**
     * @return the native vectorization conditions that are not met
     */
    @Explain(vectorization = Vectorization.OPERATOR, displayName = "nativeConditionsNotMet", explainLevels = { Level.DEFAULT, Level.EXTENDED })
    public List<String> getNativeConditionsNotMet() {
      return VectorizationCondition.getConditionsNotMet(getOrCreateNativeConditions());
    }
  }

  /**
   * @return the vectorization explain view of this reduce sink, or null when
   *         no vectorization descriptor is attached
   */
  @Explain(vectorization = Vectorization.OPERATOR, displayName = "Reduce Sink Vectorization", explainLevels = { Level.DEFAULT, Level.EXTENDED })
  public ReduceSinkOperatorExplainVectorization getReduceSinkVectorization() {
    if (vectorDesc == null) {
      return null;
    }
    return new ReduceSinkOperatorExplainVectorization(this, vectorDesc);
  }
}