/**
* Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
/**
 * This transformation does bucket group by optimization: a map-side HASH
 * group by whose keys match the bucket or sort columns of the underlying
 * table is marked as a bucket group by.
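 *
 * For example, for a table clustered and sorted by "key", a query such as
 * "select key, count(1) from clustergroupby group by key" sees all rows for
 * a given key arrive together, so the aggregation can be done per bucket.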
*/
public class GroupByOptimizer implements Transform {
private static final Log LOG = LogFactory.getLog(GroupByOptimizer.class
.getName());
public GroupByOptimizer() {
}
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
GroupByOptProcCtx groupByOptimizeCtx = new GroupByOptProcCtx();
// process group-by pattern
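    // the rule matches the operator name pattern GroupBy-ReduceSink-GroupBy,
    // i.e. a map-side hash group by followed by its reduce-side counterpart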
opRules.put(new RuleRegExp("R1", "GBY%RS%GBY%"),
getMapAggreSortedGroupbyProc(pctx));
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
groupByOptimizeCtx);
GraphWalker ogw = new DefaultGraphWalker(disp);
    // create a list of top operator nodes
ArrayList<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(pctx.getTopOps().values());
ogw.startWalking(topNodes, null);
return pctx;
}
private NodeProcessor getDefaultProc() {
return new NodeProcessor() {
@Override
public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
return null;
}
};
}
private NodeProcessor getMapAggreSortedGroupbyProc(ParseContext pctx) {
return new BucketGroupByProcessor(pctx);
}
/**
* BucketGroupByProcessor.
*
*/
public class BucketGroupByProcessor implements NodeProcessor {
protected ParseContext pGraphContext;
public BucketGroupByProcessor(ParseContext pGraphContext) {
this.pGraphContext = pGraphContext;
}
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
// GBY,RS,GBY... (top to bottom)
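      // the map-side group by sits three frames down from the top of the
      // operator stack; that is the one that may become a bucket group by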
GroupByOperator op = (GroupByOperator) stack.get(stack.size() - 3);
checkBucketGroupBy(op);
return null;
}
private void checkBucketGroupBy(GroupByOperator curr)
throws SemanticException {
// if this is not a HASH groupby, return
if (curr.getConf().getMode() != GroupByDesc.Mode.HASH) {
return;
}
Set<String> tblNames = pGraphContext.getGroupOpToInputTables().get(curr);
if (tblNames == null || tblNames.size() == 0) {
return;
}
boolean bucketGroupBy = true;
GroupByDesc desc = curr.getConf();
List<ExprNodeDesc> groupByKeys = new LinkedList<ExprNodeDesc>();
groupByKeys.addAll(desc.getKeys());
// compute groupby columns from groupby keys
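      // walk the key expressions with a work list: plain columns are
      // collected, constants and nulls are ignored, field and deterministic
      // function expressions are expanded into their children, and any
      // other expression disqualifies the optimization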
List<String> groupByCols = new ArrayList<String>();
while (groupByKeys.size() > 0) {
ExprNodeDesc node = groupByKeys.remove(0);
if (node instanceof ExprNodeColumnDesc) {
groupByCols.addAll(node.getCols());
} else if ((node instanceof ExprNodeConstantDesc)
|| (node instanceof ExprNodeNullDesc)) {
          // constants and nulls contribute no columns; ignore them
} else if (node instanceof ExprNodeFieldDesc) {
groupByKeys.add(0, ((ExprNodeFieldDesc) node).getDesc());
continue;
} else if (node instanceof ExprNodeGenericFuncDesc) {
ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
GenericUDF udf = udfNode.getGenericUDF();
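          // a non-deterministic function (e.g. rand()) breaks the mapping
          // from group by keys to bucket columns, so give up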
if (!FunctionRegistry.isDeterministic(udf)) {
return;
}
groupByKeys.addAll(0, udfNode.getChildExprs());
} else {
return;
}
}
if (groupByCols.size() == 0) {
return;
}
for (String table : tblNames) {
Operator<? extends Serializable> topOp = pGraphContext.getTopOps().get(
table);
if (topOp == null || (!(topOp instanceof TableScanOperator))) {
          // this is in a sub-query.
          // In the future, we need to infer the subquery's columns properly. For example, in
          // "select key, count(1)
          // from (from clustergroupby select key, value where ds='210') group by key, 3;",
          // even though the group by op is in a subquery, it can be changed to
          // bucket groupby.
return;
}
TableScanOperator ts = (TableScanOperator) topOp;
Table destTable = pGraphContext.getTopToTable().get(ts);
if (destTable == null) {
return;
}
if (!destTable.isPartitioned()) {
List<String> bucketCols = destTable.getBucketCols();
List<String> sortCols = Utilities
.getColumnNamesFromSortCols(destTable.getSortCols());
bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
sortCols);
if (!bucketGroupBy) {
return;
}
} else {
PrunedPartitionList partsList = null;
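          // reuse the pruned partition list if the partition pruner already
          // ran for this table scan; otherwise compute and cache it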
try {
partsList = pGraphContext.getOpToPartList().get(ts);
if (partsList == null) {
partsList = PartitionPruner.prune(destTable, pGraphContext
.getOpToPartPruner().get(ts), pGraphContext.getConf(), table,
pGraphContext.getPrunedPartitions());
pGraphContext.getOpToPartList().put(ts, partsList);
}
} catch (HiveException e) {
// Has to use full name to make sure it does not conflict with
// org.apache.commons.lang.StringUtils
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
throw new SemanticException(e.getMessage(), e);
}
List<Partition> parts = new ArrayList<Partition>();
parts.addAll(partsList.getConfirmedPartns());
parts.addAll(partsList.getUnknownPartns());
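        // every surviving partition (confirmed or unknown) must be
        // bucketed/sorted compatibly; a single mismatch disables the
        // optimization for the whole query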
for (Partition part : parts) {
List<String> bucketCols = part.getBucketCols();
List<String> sortCols = part.getSortColNames();
bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
sortCols);
if (!bucketGroupBy) {
return;
}
}
}
}
curr.getConf().setBucketGroup(bucketGroupBy);
}
    /**
     * Given the group by keys, bucket columns and sort columns, this method
     * determines whether we can use sorted group by or not.
     *
     * We use bucket columns only when the sort column set is empty and all
     * group by columns are contained in the bucket columns.
     *
     * If we cannot decide from the bucket columns and the table has sort
     * columns, we fall back to the sort columns: we can use sorted group by
     * if the group by columns form a (possibly reordered) prefix of the
     * sort columns.
     *
     * @param groupByCols columns referenced by the group by keys
     * @param bucketCols bucket columns of the table or partition
     * @param sortCols sort columns of the table or partition
     * @return true if the group by can exploit the bucketing or sort order
     * @throws SemanticException
     */
private boolean matchBucketOrSortedColumns(List<String> groupByCols,
List<String> bucketCols, List<String> sortCols) throws SemanticException {
boolean ret = false;
if (sortCols == null || sortCols.size() == 0) {
ret = matchBucketColumns(groupByCols, bucketCols);
}
if (!ret && sortCols != null && sortCols.size() >= groupByCols.size()) {
// check sort columns, if groupByCols is a prefix subset of sort
// columns, we will use sorted group by. For example, if data is sorted
// by column a, b, c, and a query wants to group by b,a, we will use
// sorted group by. But if the query wants to groupby b,c, then sorted
// group by can not be used.
        int num = groupByCols.size();
        for (int i = 0; i < num; i++) {
          int index = sortCols.indexOf(groupByCols.get(i));
          // the column must appear among the first num sort columns;
          // indexOf returns -1 when it is not a sort column at all
          if (index < 0 || index > (num - 1)) {
            return false;
          }
        }
return true;
}
return ret;
}
    /*
     * All group by columns should be contained in the bucket column set,
     * and the number of group by columns should equal the number of bucket
     * columns.
     */
private boolean matchBucketColumns(List<String> grpCols,
List<String> tblBucketCols) throws SemanticException {
if (tblBucketCols == null || tblBucketCols.size() == 0
|| grpCols.size() == 0 || grpCols.size() != tblBucketCols.size()) {
return false;
}
      for (int i = 0; i < grpCols.size(); i++) {
        String grpCol = grpCols.get(i);
        if (!tblBucketCols.contains(grpCol)) {
          return false;
        }
      }
return true;
}
}
/**
* GroupByOptProcCtx.
*
*/
public class GroupByOptProcCtx implements NodeProcessorCtx {
}
}