/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeNullDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

/**
 * This transformation does the bucket group by optimization: when the keys of
 * a hash-mode group by match the bucketing or sorting columns of the
 * underlying table (or of every selected partition), the GroupByOperator is
 * flagged as a bucket group by so the aggregation can exploit the physical
 * layout of the data.
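 *
 * <p>Illustrative example (the table and query are hypothetical, not taken
 * from this file): given
 *
 * <pre>
 * CREATE TABLE t (key STRING, value STRING)
 * CLUSTERED BY (key) SORTED BY (key) INTO 32 BUCKETS;
 * </pre>
 *
 * the query {@code SELECT key, count(1) FROM t GROUP BY key} groups on an
 * exact prefix of the sort columns, so its hash-mode group by qualifies and
 * is marked via {@code setBucketGroup(true)}.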
 */
public class GroupByOptimizer implements Transform {

  private static final Log LOG = LogFactory.getLog(GroupByOptimizer.class
      .getName());

  public GroupByOptimizer() {
  }

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    GroupByOptProcCtx groupByOptimizeCtx = new GroupByOptProcCtx();

    // process group-by pattern
    opRules.put(new RuleRegExp("R1", "GBY%RS%GBY%"),
        getMapAggreSortedGroupbyProc(pctx));

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules,
        groupByOptimizeCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top op nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pctx;
  }

  private NodeProcessor getDefaultProc() {
    return new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs)
          throws SemanticException {
        return null;
      }
    };
  }

  private NodeProcessor getMapAggreSortedGroupbyProc(ParseContext pctx) {
    return new BucketGroupByProcessor(pctx);
  }

  /**
   * BucketGroupByProcessor.
   *
   */
  public class BucketGroupByProcessor implements NodeProcessor {

    protected ParseContext pGraphContext;

    public BucketGroupByProcessor(ParseContext pGraphContext) {
      this.pGraphContext = pGraphContext;
    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      // GBY,RS,GBY... (top to bottom)
      GroupByOperator op = (GroupByOperator) stack.get(stack.size() - 3);
      checkBucketGroupBy(op);
      return null;
    }

    private void checkBucketGroupBy(GroupByOperator curr)
        throws SemanticException {

      // if this is not a HASH groupby, return
      if (curr.getConf().getMode() != GroupByDesc.Mode.HASH) {
        return;
      }

      Set<String> tblNames = pGraphContext.getGroupOpToInputTables().get(curr);
      if (tblNames == null || tblNames.size() == 0) {
        return;
      }

      boolean bucketGroupBy = true;
      GroupByDesc desc = curr.getConf();
      List<ExprNodeDesc> groupByKeys = new LinkedList<ExprNodeDesc>();
      groupByKeys.addAll(desc.getKeys());
      // compute groupby columns from groupby keys
      List<String> groupByCols = new ArrayList<String>();
      while (groupByKeys.size() > 0) {
        ExprNodeDesc node = groupByKeys.remove(0);
        if (node instanceof ExprNodeColumnDesc) {
          groupByCols.addAll(node.getCols());
        } else if ((node instanceof ExprNodeConstantDesc)
            || (node instanceof ExprNodeNullDesc)) {
          // nothing
        } else if (node instanceof ExprNodeFieldDesc) {
          groupByKeys.add(0, ((ExprNodeFieldDesc) node).getDesc());
          continue;
        } else if (node instanceof ExprNodeGenericFuncDesc) {
          ExprNodeGenericFuncDesc udfNode = ((ExprNodeGenericFuncDesc) node);
          GenericUDF udf = udfNode.getGenericUDF();
          if (!FunctionRegistry.isDeterministic(udf)) {
            return;
          }
          groupByKeys.addAll(0, udfNode.getChildExprs());
        } else {
          return;
        }
      }

      if (groupByCols.size() == 0) {
        return;
      }

      for (String table : tblNames) {
        Operator<? extends Serializable> topOp = pGraphContext.getTopOps().get(
            table);
        if (topOp == null || (!(topOp instanceof TableScanOperator))) {
          // this is in a sub-query.
          // In the future, we need to infer the sub-query's columns properly.
          // For example, in
          // "select key, count(1)
          // from (from clustergroupby select key, value where ds='210') group by key, 3;",
          // even though the group by op is in a sub-query, it can be changed
          // to a bucket group by.
          return;
        }
        TableScanOperator ts = (TableScanOperator) topOp;
        Table destTable = pGraphContext.getTopToTable().get(ts);
        if (destTable == null) {
          return;
        }
        if (!destTable.isPartitioned()) {
          List<String> bucketCols = destTable.getBucketCols();
          List<String> sortCols = Utilities
              .getColumnNamesFromSortCols(destTable.getSortCols());
          bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
              sortCols);
          if (!bucketGroupBy) {
            return;
          }
        } else {
          PrunedPartitionList partsList = null;
          try {
            partsList = pGraphContext.getOpToPartList().get(ts);
            if (partsList == null) {
              partsList = PartitionPruner.prune(destTable, pGraphContext
                  .getOpToPartPruner().get(ts), pGraphContext.getConf(),
                  table, pGraphContext.getPrunedPartitions());
              pGraphContext.getOpToPartList().put(ts, partsList);
            }
          } catch (HiveException e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
            throw new SemanticException(e.getMessage(), e);
          }
          List<Partition> parts = new ArrayList<Partition>();
          parts.addAll(partsList.getConfirmedPartns());
          parts.addAll(partsList.getUnknownPartns());
          for (Partition part : parts) {
            List<String> bucketCols = part.getBucketCols();
            List<String> sortCols = part.getSortColNames();
            bucketGroupBy = matchBucketOrSortedColumns(groupByCols, bucketCols,
                sortCols);
            if (!bucketGroupBy) {
              return;
            }
          }
        }
      }

      curr.getConf().setBucketGroup(bucketGroupBy);
    }

    /**
     * Given the group by keys, bucket columns and sort columns, this method
     * determines whether we can use a sorted group by or not.
     *
     * We use bucket columns only when the sorted column set is empty and all
     * group by columns are contained in the bucket columns.
     *
     * If we cannot decide from the bucket columns alone and the table has
     * sort columns, we fall back to the sort columns: a bucket group by can
     * be used if the group by column set is an exact prefix match of the
     * sort columns.
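     *
     * For example (hypothetical column names): with bucket columns (a, b) and
     * no sort columns, grouping by (a, b) or (b, a) qualifies, while grouping
     * by a alone does not, because the number of group by columns must equal
     * the number of bucket columns.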
     *
     * @param groupByCols
     *          the group by columns
     * @param bucketCols
     *          the table's (or partition's) bucket columns
     * @param sortCols
     *          the table's (or partition's) sort columns
     * @return true if a sorted or bucketed group by can be used
     * @throws SemanticException
     */
    private boolean matchBucketOrSortedColumns(List<String> groupByCols,
        List<String> bucketCols, List<String> sortCols)
        throws SemanticException {
      boolean ret = false;

      if (sortCols == null || sortCols.size() == 0) {
        ret = matchBucketColumns(groupByCols, bucketCols);
      }

      if (!ret && sortCols != null && sortCols.size() >= groupByCols.size()) {
        // check sort columns: if groupByCols is a prefix subset of the sort
        // columns, we will use a sorted group by. For example, if data is
        // sorted by columns a, b, c and a query wants to group by b, a, we
        // will use a sorted group by. But if the query wants to group by
        // b, c, then a sorted group by cannot be used.
        int num = groupByCols.size();
        for (int i = 0; i < num; i++) {
          int index = sortCols.indexOf(groupByCols.get(i));
          // indexOf returns -1 when the column is not a sort column at all;
          // treat that as a failed prefix match as well.
          if (index < 0 || index > (num - 1)) {
            return false;
          }
        }
        return true;
      }

      return ret;
    }

    /*
     * All group by columns should be contained in the bucket column set, and
     * the number of group by columns should equal the number of bucket
     * columns.
     */
    private boolean matchBucketColumns(List<String> grpCols,
        List<String> tblBucketCols) throws SemanticException {

      if (tblBucketCols == null || tblBucketCols.size() == 0
          || grpCols.size() == 0 || grpCols.size() != tblBucketCols.size()) {
        return false;
      }

      for (int i = 0; i < grpCols.size(); i++) {
        String tblCol = grpCols.get(i);
        if (!tblBucketCols.contains(tblCol)) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * GroupByOptProcCtx.
   *
   */
  public class GroupByOptProcCtx implements NodeProcessorCtx {
  }
}
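// Usage sketch (illustrative only; how the surrounding optimizer pipeline is
// wired up is an assumption and not shown in this file):
//
//   ParseContext pctx = ...; // produced by semantic analysis
//   pctx = new GroupByOptimizer().transform(pctx);
//
// After the walk, each qualifying hash-mode GroupByOperator has
// setBucketGroup(true) recorded on its GroupByDesc.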