/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.calcite.translator;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.AggregateCall;
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverter.OpAttr;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

import com.google.common.collect.ImmutableList;

/**
 * TODO:<br>
 * 1. Change the output col/ExprNodeColumn names to external names.<br>
 * 2. Verify if we need to use the "KEY."/"VALUE." in RS cols; switch to
 * external names if possible.<br>
 * 3. In ExprNode & in ColumnInfo the tableAlias/VirtualColumn is specified
 * differently for different GB/RS in pipeline. Remove the different
 * treatments.<br>
 * 4. VirtualColMap needs to be maintained
 */
public class HiveGBOpConvUtil {

  private static enum HIVEGBPHYSICALMODE {
    MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB,
    MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB,
    MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT,
    MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT,
    NO_MAP_SIDE_GB_NO_SKEW,
    NO_MAP_SIDE_GB_SKEW
  };

  private static class UDAFAttrs {
    private boolean isDistinctUDAF;
    private String udafName;
    private GenericUDAFEvaluator udafEvaluator;
    private final ArrayList<ExprNodeDesc> udafParams = new ArrayList<ExprNodeDesc>();
    private List<Integer> udafParamsIndxInGBInfoDistExprs = new ArrayList<Integer>();
    // We store the position of the argument for the function in the input.
    private List<Integer> argList;
  };

  private static class GBInfo {
    private final List<String> outputColNames = new ArrayList<String>();

    private final List<String> gbKeyColNamesInInput = new ArrayList<String>();
    private final List<TypeInfo> gbKeyTypes = new ArrayList<TypeInfo>();
    private final List<ExprNodeDesc> gbKeys = new ArrayList<ExprNodeDesc>();

    private final List<Integer> grpSets = new ArrayList<Integer>();
    private boolean grpSetRqrAdditionalMRJob;
    private boolean grpIdFunctionNeeded;

    private final List<String> distExprNames = new ArrayList<String>();
    private final List<TypeInfo> distExprTypes = new ArrayList<TypeInfo>();
    private final List<ExprNodeDesc> distExprNodes = new ArrayList<ExprNodeDesc>();
    private final List<List<Integer>> distColIndices = new ArrayList<List<Integer>>();

    private final List<ExprNodeDesc> deDupedNonDistIrefs = new ArrayList<ExprNodeDesc>();

    private final List<UDAFAttrs> udafAttrs = new ArrayList<UDAFAttrs>();
    private boolean containsDistinctAggr = false;

    float groupByMemoryUsage;
    float memoryThreshold;

    private HIVEGBPHYSICALMODE gbPhysicalPipelineMode;
  };

  private static HIVEGBPHYSICALMODE getAggOPMode(HiveConf hc, GBInfo gbInfo) {
    HIVEGBPHYSICALMODE gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB;

    if (hc.getBoolVar(HiveConf.ConfVars.HIVEMAPSIDEAGGREGATE)) {
      if (!hc.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
        if (!gbInfo.grpSetRqrAdditionalMRJob) {
          gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB;
        } else {
          gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB;
        }
      } else {
        if (gbInfo.containsDistinctAggr || !gbInfo.gbKeys.isEmpty()) {
          gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT;
        } else {
          gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT;
        }
      }
    } else {
      if (!hc.getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
        gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW;
      } else {
        gbPhysicalPipelineMode = HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_SKEW;
      }
    }

    return gbPhysicalPipelineMode;
  }
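  // Illustrative summary of getAggOPMode above (a restatement of its branches, assuming
  // hive.map.aggr backs HIVEMAPSIDEAGGREGATE and hive.groupby.skewindata backs HIVEGROUPBYSKEW):
  //   map.aggr=true,  skew=false, grouping sets under threshold -> MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB
  //   map.aggr=true,  skew=false, grouping sets over threshold  -> MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB
  //   map.aggr=true,  skew=true,  GB keys or distinct UDAF      -> MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT
  //   map.aggr=true,  skew=true,  neither present               -> MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT
  //   map.aggr=false, skew=false                                -> NO_MAP_SIDE_GB_NO_SKEW
  //   map.aggr=false, skew=true                                 -> NO_MAP_SIDE_GB_SKEW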
  // For each of the GB ops in the logical GB this should be called separately;
  // otherwise the GB evaluator and expr nodes may get shared among multiple GB ops
  private static GBInfo getGBInfo(HiveAggregate aggRel, OpAttr inputOpAf, HiveConf hc)
      throws SemanticException {
    GBInfo gbInfo = new GBInfo();

    // 0. Collect AggRel output col Names
    gbInfo.outputColNames.addAll(aggRel.getRowType().getFieldNames());

    // 1. Collect GB Keys
    RelNode aggInputRel = aggRel.getInput();
    ExprNodeConverter exprConv = new ExprNodeConverter(inputOpAf.tabAlias,
        aggInputRel.getRowType(), new HashSet<Integer>(), aggRel.getCluster().getTypeFactory(),
        true);

    ExprNodeDesc tmpExprNodeDesc;
    for (int i : aggRel.getGroupSet()) {
      RexInputRef iRef = new RexInputRef(i, aggInputRel.getRowType().getFieldList().get(i)
          .getType());
      tmpExprNodeDesc = iRef.accept(exprConv);
      gbInfo.gbKeys.add(tmpExprNodeDesc);
      gbInfo.gbKeyColNamesInInput.add(aggInputRel.getRowType().getFieldNames().get(i));
      gbInfo.gbKeyTypes.add(tmpExprNodeDesc.getTypeInfo());
    }

    // 2. Collect Grouping Set info
    if (aggRel.indicator) {
      // 2.1 Translate Grouping set col bitset
      ImmutableList<ImmutableBitSet> lstGrpSet = aggRel.getGroupSets();
      int bitmap = 0;
      for (ImmutableBitSet grpSet : lstGrpSet) {
        bitmap = 0;
        for (Integer bitIdx : grpSet.asList()) {
          bitmap = SemanticAnalyzer.setBit(bitmap, bitIdx);
        }
        gbInfo.grpSets.add(bitmap);
      }
      Collections.sort(gbInfo.grpSets);

      // 2.2 Check if GrpSet requires an additional MR Job
      gbInfo.grpSetRqrAdditionalMRJob = gbInfo.grpSets.size() > hc
          .getIntVar(HiveConf.ConfVars.HIVE_NEW_JOB_GROUPING_SET_CARDINALITY);

      // 2.3 Check if GROUPING_ID needs to be projected out
      if (!aggRel.getAggCallList().isEmpty()
          && (aggRel.getAggCallList().get(aggRel.getAggCallList().size() - 1).getAggregation()
              == HiveGroupingID.INSTANCE)) {
        gbInfo.grpIdFunctionNeeded = true;
      }
    }

    // 3. Walk through UDAF & Collect Distinct Info
    Set<Integer> distinctRefs = new HashSet<Integer>();
    Map<Integer, Integer> distParamInRefsToOutputPos = new HashMap<Integer, Integer>();
    for (AggregateCall aggCall : aggRel.getAggCallList()) {
      if ((aggCall.getAggregation() == HiveGroupingID.INSTANCE) || !aggCall.isDistinct()) {
        continue;
      }

      List<Integer> argLst = new ArrayList<Integer>(aggCall.getArgList());
      List<String> argNames = HiveCalciteUtil.getFieldNames(argLst, aggInputRel);
      ExprNodeDesc distinctExpr;
      for (int i = 0; i < argLst.size(); i++) {
        if (!distinctRefs.contains(argLst.get(i))) {
          distinctRefs.add(argLst.get(i));
          distinctExpr = HiveCalciteUtil.getExprNode(argLst.get(i), aggInputRel, exprConv);
          // Only distinct nodes that are NOT part of the key should be added to distExprNodes
          if (ExprNodeDescUtils.indexOf(distinctExpr, gbInfo.gbKeys) < 0) {
            distParamInRefsToOutputPos.put(argLst.get(i), gbInfo.distExprNodes.size());
            gbInfo.distExprNodes.add(distinctExpr);
            gbInfo.distExprNames.add(argNames.get(i));
            gbInfo.distExprTypes.add(distinctExpr.getTypeInfo());
          }
        }
      }
    }

    // 4. Walk through UDAF & Collect UDAF Info
    Set<Integer> deDupedNonDistIrefsSet = new HashSet<Integer>();
    for (AggregateCall aggCall : aggRel.getAggCallList()) {
      if (aggCall.getAggregation() == HiveGroupingID.INSTANCE) {
        continue;
      }

      UDAFAttrs udafAttrs = new UDAFAttrs();
      List<ExprNodeDesc> argExps = HiveCalciteUtil.getExprNodes(aggCall.getArgList(), aggInputRel,
          inputOpAf.tabAlias);
      udafAttrs.udafParams.addAll(argExps);
      udafAttrs.udafName = aggCall.getAggregation().getName();
      udafAttrs.argList = aggCall.getArgList();
      udafAttrs.isDistinctUDAF = aggCall.isDistinct();
      List<Integer> argLst = new ArrayList<Integer>(aggCall.getArgList());
      List<Integer> distColIndicesOfUDAF = new ArrayList<Integer>();
      List<Integer> distUDAFParamsIndxInDistExprs = new ArrayList<Integer>();
      for (int i = 0; i < argLst.size(); i++) {
        // NOTE: distinct expr can be part of the GB key
        if (udafAttrs.isDistinctUDAF) {
          ExprNodeDesc argExpr = argExps.get(i);
          Integer found = ExprNodeDescUtils.indexOf(argExpr, gbInfo.gbKeys);
          distColIndicesOfUDAF.add(found < 0 ? distParamInRefsToOutputPos.get(argLst.get(i))
              + gbInfo.gbKeys.size() + (gbInfo.grpSets.size() > 0 ? 1 : 0) : found);
          distUDAFParamsIndxInDistExprs.add(distParamInRefsToOutputPos.get(argLst.get(i)));
        } else {
          // TODO: this seems wrong (following what Hive Regular does)
          if (!distParamInRefsToOutputPos.containsKey(argLst.get(i))
              && !deDupedNonDistIrefsSet.contains(argLst.get(i))) {
            deDupedNonDistIrefsSet.add(argLst.get(i));
            gbInfo.deDupedNonDistIrefs.add(udafAttrs.udafParams.get(i));
          }
        }
      }

      if (udafAttrs.isDistinctUDAF) {
        gbInfo.containsDistinctAggr = true;

        udafAttrs.udafParamsIndxInGBInfoDistExprs = distUDAFParamsIndxInDistExprs;
        gbInfo.distColIndices.add(distColIndicesOfUDAF);
      }

      // special handling for count, similar to PlanModifierForASTConv::replaceEmptyGroupAggr()
      udafAttrs.udafEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator(udafAttrs.udafName,
          new ArrayList<ExprNodeDesc>(udafAttrs.udafParams), new ASTNode(),
          udafAttrs.isDistinctUDAF,
          udafAttrs.udafParams.isEmpty() && "count".equalsIgnoreCase(udafAttrs.udafName));
      gbInfo.udafAttrs.add(udafAttrs);
    }

    // 5. Gather GB Memory threshold
    gbInfo.groupByMemoryUsage = HiveConf.getFloatVar(hc, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
    gbInfo.memoryThreshold = HiveConf.getFloatVar(hc, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);

    // 6. Gather GB Physical pipeline (based on user config & Grouping Sets size)
    gbInfo.gbPhysicalPipelineMode = getAggOPMode(hc, gbInfo);

    return gbInfo;
  }

  static OpAttr translateGB(OpAttr inputOpAf, HiveAggregate aggRel, HiveConf hc)
      throws SemanticException {
    OpAttr translatedGBOpAttr = null;
    GBInfo gbInfo = getGBInfo(aggRel, inputOpAf, hc);

    switch (gbInfo.gbPhysicalPipelineMode) {
    case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB:
      translatedGBOpAttr = genMapSideGBNoSkewNoAddMRJob(inputOpAf, aggRel, gbInfo);
      break;
    case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
      translatedGBOpAttr = genMapSideGBNoSkewAddMRJob(inputOpAf, aggRel, gbInfo);
      break;
    case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
      translatedGBOpAttr = genMapSideGBSkewGBKeysOrDistUDAFPresent(inputOpAf, aggRel, gbInfo);
      break;
    case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT:
      translatedGBOpAttr = genMapSideGBSkewGBKeysAndDistUDAFNotPresent(inputOpAf, aggRel, gbInfo);
      break;
    case NO_MAP_SIDE_GB_NO_SKEW:
      translatedGBOpAttr = genNoMapSideGBNoSkew(inputOpAf, aggRel, gbInfo);
      break;
    case NO_MAP_SIDE_GB_SKEW:
      translatedGBOpAttr = genNoMapSideGBSkew(inputOpAf, aggRel, gbInfo);
      break;
    }

    return translatedGBOpAttr;
  }

  /**
   * GB-RS-GB1
   *
   * Construct the GB-RS-GB pipeline. The user has enabled Map Side GB, specified no
   * skew, and the Grouping Set count is below the threshold.
   *
   * @param inputOpAf
   * @param aggRel
   * @param gbInfo
   * @return
   * @throws SemanticException
   */
  private static OpAttr genMapSideGBNoSkewNoAddMRJob(OpAttr inputOpAf, HiveAggregate aggRel,
      GBInfo gbInfo) throws SemanticException {
    OpAttr mapSideGB = null;
    OpAttr mapSideRS = null;
    OpAttr reduceSideGB = null;

    // 1. Insert MapSide GB
    mapSideGB = genMapSideGB(inputOpAf, gbInfo);

    // 2. Insert MapSide RS
    mapSideRS = genMapSideGBRS(mapSideGB, gbInfo);

    // 3. Insert ReduceSide GB
    reduceSideGB = genReduceSideGB1(mapSideRS, gbInfo, false, false, GroupByDesc.Mode.MERGEPARTIAL);

    return reduceSideGB;
  }
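  // Illustrative example (hypothetical query, default configs): with hive.map.aggr=true
  // and hive.groupby.skewindata=false, "SELECT key, count(value) FROM t GROUP BY key"
  // takes the pipeline above:
  //   GBY (Mode.HASH, partials per mapper) -> RS (partitioned on key) -> GBY (Mode.MERGEPARTIAL)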
  /**
   * GB-RS-GB1-RS-GB2
   */
  private static OpAttr genGBRSGBRSGBOpPipeLine(OpAttr inputOpAf, HiveAggregate aggRel,
      GBInfo gbInfo) throws SemanticException {
    OpAttr mapSideGB = null;
    OpAttr mapSideRS = null;
    OpAttr reduceSideGB1 = null;
    OpAttr reduceSideRS = null;
    OpAttr reduceSideGB2 = null;

    // 1. Insert MapSide GB
    mapSideGB = genMapSideGB(inputOpAf, gbInfo);

    // 2. Insert MapSide RS
    mapSideRS = genMapSideGBRS(mapSideGB, gbInfo);

    // 3. Insert ReduceSide GB1
    boolean computeGrpSet =
        (gbInfo.gbPhysicalPipelineMode != HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT);
    reduceSideGB1 = genReduceSideGB1(mapSideRS, gbInfo, computeGrpSet, false,
        GroupByDesc.Mode.PARTIALS);

    // 4. Insert RS on reduce side with Reduce side GB as input
    reduceSideRS = genReduceGBRS(reduceSideGB1, gbInfo);

    // 5. Insert ReduceSide GB2
    reduceSideGB2 = genReduceSideGB2(reduceSideRS, gbInfo);

    return reduceSideGB2;
  }

  /**
   * GB-RS-GB1-RS-GB2
   *
   * @param inputOpAf
   * @param aggRel
   * @param gbInfo
   * @return
   * @throws SemanticException
   */
  private static OpAttr genMapSideGBNoSkewAddMRJob(OpAttr inputOpAf, HiveAggregate aggRel,
      GBInfo gbInfo) throws SemanticException {
    // 1. Sanity check
    if (gbInfo.containsDistinctAggr) {
      String errorMsg = "The number of rows per input row due to grouping sets is "
          + gbInfo.grpSets.size();
      throw new SemanticException(
          ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_DISTINCTS.getMsg(errorMsg));
    }

    // 2. Gen GB-RS-GB-RS-GB pipeline
    return genGBRSGBRSGBOpPipeLine(inputOpAf, aggRel, gbInfo);
  }

  /**
   * GB-RS-GB1-RS-GB2
   *
   * @param inputOpAf
   * @param aggRel
   * @param gbInfo
   * @return
   * @throws SemanticException
   */
  private static OpAttr genMapSideGBSkewGBKeysOrDistUDAFPresent(OpAttr inputOpAf,
      HiveAggregate aggRel, GBInfo gbInfo) throws SemanticException {
    // 1. Sanity check
    if (gbInfo.grpSetRqrAdditionalMRJob) {
      String errorMsg = "The number of rows per input row due to grouping sets is "
          + gbInfo.grpSets.size();
      throw new SemanticException(
          ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
    }

    // 2. Gen GB-RS-GB-RS-GB pipeline
    return genGBRSGBRSGBOpPipeLine(inputOpAf, aggRel, gbInfo);
  }

  /**
   * GB-RS-GB2
   *
   * @param inputOpAf
   * @param aggRel
   * @param gbInfo
   * @return
   * @throws SemanticException
   */
  private static OpAttr genMapSideGBSkewGBKeysAndDistUDAFNotPresent(OpAttr inputOpAf,
      HiveAggregate aggRel, GBInfo gbInfo) throws SemanticException {
    OpAttr mapSideGB = null;
    OpAttr mapSideRS = null;
    OpAttr reduceSideGB2 = null;

    // 1. Sanity check
    if (gbInfo.grpSetRqrAdditionalMRJob) {
      String errorMsg = "The number of rows per input row due to grouping sets is "
          + gbInfo.grpSets.size();
      throw new SemanticException(
          ErrorMsg.HIVE_GROUPING_SETS_THRESHOLD_NOT_ALLOWED_WITH_SKEW.getMsg(errorMsg));
    }

    // 2. Insert MapSide GB
    mapSideGB = genMapSideGB(inputOpAf, gbInfo);

    // 3. Insert MapSide RS
    mapSideRS = genMapSideGBRS(mapSideGB, gbInfo);

    // 4. Insert ReduceSide GB2
    reduceSideGB2 = genReduceSideGB2(mapSideRS, gbInfo);

    return reduceSideGB2;
  }
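  // Note on the skew variants above: with hive.groupby.skewindata=true the first shuffle
  // spreads rows out (randomly when no distinct aggregate is present), so a second
  // RS-GB stage is needed to merge partials on the real GB keys. When neither GB keys
  // nor distinct UDAFs exist there is nothing to re-distribute on, and a single-reducer
  // GB2 merges the map-side partials directly.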
  /**
   * RS-GB1
   *
   * @param inputOpAf
   * @param aggRel
   * @param gbInfo
   * @return
   * @throws SemanticException
   */
  private static OpAttr genNoMapSideGBNoSkew(OpAttr inputOpAf, HiveAggregate aggRel, GBInfo gbInfo)
      throws SemanticException {
    OpAttr mapSideRS = null;
    OpAttr reduceSideGB1NoMapGB = null;

    // 1. Insert MapSide RS
    mapSideRS = genMapSideRS(inputOpAf, gbInfo);

    // 2. Insert ReduceSide GB
    reduceSideGB1NoMapGB = genReduceSideGB1NoMapGB(mapSideRS, gbInfo, GroupByDesc.Mode.COMPLETE);

    return reduceSideGB1NoMapGB;
  }

  /**
   * RS-GB1-RS-GB2
   *
   * @param inputOpAf
   * @param aggRel
   * @param gbInfo
   * @return
   * @throws SemanticException
   */
  private static OpAttr genNoMapSideGBSkew(OpAttr inputOpAf, HiveAggregate aggRel, GBInfo gbInfo)
      throws SemanticException {
    OpAttr mapSideRS = null;
    OpAttr reduceSideGB1NoMapGB = null;
    OpAttr reduceSideRS = null;
    OpAttr reduceSideGB2 = null;

    // 1. Insert MapSide RS
    mapSideRS = genMapSideRS(inputOpAf, gbInfo);

    // 2. Insert ReduceSide GB
    reduceSideGB1NoMapGB = genReduceSideGB1NoMapGB(mapSideRS, gbInfo, GroupByDesc.Mode.PARTIAL1);

    // 3. Insert RS on reduce side with Reduce side GB as input
    reduceSideRS = genReduceGBRS(reduceSideGB1NoMapGB, gbInfo);

    // 4. Insert ReduceSide GB2
    reduceSideGB2 = genReduceSideGB2(reduceSideRS, gbInfo);

    return reduceSideGB2;
  }

  private static int getParallelismForReduceSideRS(GBInfo gbInfo) {
    int degreeOfParallelism = 0;

    switch (gbInfo.gbPhysicalPipelineMode) {
    case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
    case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
    case NO_MAP_SIDE_GB_SKEW:
      if (gbInfo.gbKeys.isEmpty()) {
        degreeOfParallelism = 1;
      } else {
        degreeOfParallelism = -1;
      }
      break;
    default:
      throw new RuntimeException(
          "Unable to determine Reducer Parallelism - Invalid Physical Mode: "
              + gbInfo.gbPhysicalPipelineMode);
    }

    return degreeOfParallelism;
  }

  private static int getParallelismForMapSideRS(GBInfo gbInfo) {
    int degreeOfParallelism = 0;

    switch (gbInfo.gbPhysicalPipelineMode) {
    case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB:
    case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
    case NO_MAP_SIDE_GB_NO_SKEW:
      if (gbInfo.gbKeys.isEmpty()) {
        degreeOfParallelism = 1;
      } else {
        degreeOfParallelism = -1;
      }
      break;
    case NO_MAP_SIDE_GB_SKEW:
    case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
      degreeOfParallelism = -1;
      break;
    case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT:
      degreeOfParallelism = 1;
      break;
    default:
      throw new RuntimeException(
          "Unable to determine Reducer Parallelism - Invalid Physical Mode: "
              + gbInfo.gbPhysicalPipelineMode);
    }

    return degreeOfParallelism;
  }

  private static int getNumPartFieldsForReduceSideRS(GBInfo gbInfo) {
    int numPartFields = 0;

    switch (gbInfo.gbPhysicalPipelineMode) {
    case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
      numPartFields = gbInfo.gbKeys.size() + 1;
      break;
    case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
    case NO_MAP_SIDE_GB_SKEW:
      numPartFields = gbInfo.gbKeys.size();
      break;
    default:
      throw new RuntimeException(
          "Unable to determine Number of Partition Fields - Invalid Physical Mode: "
              + gbInfo.gbPhysicalPipelineMode);
    }

    return numPartFields;
  }

  private static int getNumPartFieldsForMapSideRS(GBInfo gbInfo) {
    int numPartFields = 0;

    switch (gbInfo.gbPhysicalPipelineMode) {
    case MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB:
    case MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB:
    case MAP_SIDE_GB_SKEW_GBKEYS_AND_DIST_UDAF_NOT_PRESENT:
    case NO_MAP_SIDE_GB_NO_SKEW:
      numPartFields += gbInfo.gbKeys.size();
      break;
    case NO_MAP_SIDE_GB_SKEW:
    case MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT:
      if (gbInfo.containsDistinctAggr) {
        numPartFields = Integer.MAX_VALUE;
      } else {
        numPartFields = -1;
      }
      break;
    default:
      throw new RuntimeException(
          "Unable to determine Number of Partition Fields - Invalid Physical Mode: "
              + gbInfo.gbPhysicalPipelineMode);
    }

    return numPartFields;
  }
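  // The values computed above are handed to PlanUtils.getReduceSinkDesc. Roughly (per
  // its conventions): a parallelism of -1 leaves the number of reducers to be determined
  // later, while 1 forces a single reducer (global aggregation). For partition fields,
  // the first N key columns are used for partitioning; Integer.MAX_VALUE effectively
  // partitions on all key columns, and -1 requests random distribution (used to break
  // up skew).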
  private static boolean inclGrpSetInReduceSide(GBInfo gbInfo) {
    boolean inclGrpSet = false;

    if (gbInfo.grpSets.size() > 0
        && (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_ADD_MR_JOB
            || gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT)) {
      inclGrpSet = true;
    }

    return inclGrpSet;
  }

  private static boolean inclGrpSetInMapSide(GBInfo gbInfo) {
    boolean inclGrpSet = false;

    if (gbInfo.grpSets.size() > 0
        && ((gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB)
            || gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT)) {
      inclGrpSet = true;
    }

    return inclGrpSet;
  }

  private static OpAttr genReduceGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    ArrayList<String> outputColumnNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();

    GroupByOperator reduceSideGB1 = (GroupByOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> gb1ColInfoLst = reduceSideGB1.getSchema().getSignature();

    ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(reduceSideGB1, 0,
        gbInfo.gbKeys.size() - 1, outputColumnNames, false, colInfoLst, colExprMap, true, true);
    if (inclGrpSetInReduceSide(gbInfo)) {
      addGrpSetCol(false, gb1ColInfoLst.get(reduceKeys.size()).getInternalName(), true, reduceKeys,
          outputColumnNames, colInfoLst, colExprMap);
    }

    ArrayList<ExprNodeDesc> reduceValues = getValueKeysForRS(reduceSideGB1, reduceSideGB1.getConf()
        .getKeys().size(), outputColumnNames, colInfoLst, colExprMap, true, true);

    ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
        PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, true, -1,
            getNumPartFieldsForReduceSideRS(gbInfo), getParallelismForReduceSideRS(gbInfo),
            AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), reduceSideGB1);

    rsOp.setColumnExprMap(colExprMap);

    return new OpAttr("", new HashSet<Integer>(), rsOp);
  }
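  // Layout produced by the map-side RSs below and consumed by the reduce-side GBs:
  // KEY = [gb keys][grouping-set id, if any][distinct expr cols],
  // VALUE = [non-distinct UDAF args / partial aggregation buffers].
  // All distinct columns share a single output key name; individual distinct
  // parameters are addressed through gbInfo.distColIndices.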
  private static OpAttr genMapSideGBRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<String> outputKeyColumnNames = new ArrayList<String>();
    List<String> outputValueColumnNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    GroupByOperator mapGB = (GroupByOperator) inputOpAf.inputs.get(0);

    ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForRS(mapGB, 0, gbInfo.gbKeys.size() - 1,
        outputKeyColumnNames, false, colInfoLst, colExprMap, false, false);
    int keyLength = reduceKeys.size();

    if (inclGrpSetInMapSide(gbInfo)) {
      addGrpSetCol(false, SemanticAnalyzer.getColumnInternalName(reduceKeys.size()), true,
          reduceKeys, outputKeyColumnNames, colInfoLst, colExprMap);
      keyLength++;
    }

    if (mapGB.getConf().getKeys().size() > reduceKeys.size()) {
      // NOTE: All dist cols have a single output col name
      reduceKeys.addAll(getReduceKeysForRS(mapGB, reduceKeys.size(), mapGB.getConf().getKeys()
          .size() - 1, outputKeyColumnNames, true, colInfoLst, colExprMap, false, false));
    } else if (!gbInfo.distColIndices.isEmpty()) {
      // This is the case where distinct cols are part of the GB Keys, in which case
      // we still need to add them to the output col names
      outputKeyColumnNames.add(SemanticAnalyzer.getColumnInternalName(reduceKeys.size()));
    }

    ArrayList<ExprNodeDesc> reduceValues = getValueKeysForRS(mapGB, mapGB.getConf().getKeys()
        .size(), outputValueColumnNames, colInfoLst, colExprMap, false, false);

    ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
        PlanUtils.getReduceSinkDesc(reduceKeys, keyLength, reduceValues, gbInfo.distColIndices,
            outputKeyColumnNames, outputValueColumnNames, true, -1,
            getNumPartFieldsForMapSideRS(gbInfo), getParallelismForMapSideRS(gbInfo),
            AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), mapGB);

    rsOp.setColumnExprMap(colExprMap);

    return new OpAttr("", new HashSet<Integer>(), rsOp);
  }

  private static OpAttr genMapSideRS(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    List<String> outputKeyColumnNames = new ArrayList<String>();
    List<String> outputValueColumnNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    String outputColName;

    // 1. Add GB Keys to reduce keys
    ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
      // gbInfo already has ExprNodes for the gb keys
      reduceKeys.add(gbInfo.gbKeys.get(i));
      String colOutputName = SemanticAnalyzer.getColumnInternalName(i);
      outputKeyColumnNames.add(colOutputName);
      colInfoLst.add(new ColumnInfo(Utilities.ReduceField.KEY.toString() + "." + colOutputName,
          gbInfo.gbKeyTypes.get(i), "", false));
      colExprMap.put(colOutputName, gbInfo.gbKeys.get(i));
    }

    // Note: GROUPING SETS are not allowed when map-side aggregation is disabled,
    // so we don't have to worry about them here
    int keyLength = reduceKeys.size();

    // 2. Add Dist UDAF args to reduce keys
    if (gbInfo.containsDistinctAggr) {
      // TODO: Why is this needed (doesn't represent any cols)
      String udafName = SemanticAnalyzer.getColumnInternalName(reduceKeys.size());
      outputKeyColumnNames.add(udafName);
      for (int i = 0; i < gbInfo.distExprNodes.size(); i++) {
        reduceKeys.add(gbInfo.distExprNodes.get(i));
        // this part of reduceKeys is later used to create column names strictly for
        // non-distinct aggregates with parameters same as distinct keys, which expects
        // _col0 at the end. So we always append _col0 at the end instead of _col<i>
        outputColName = SemanticAnalyzer.getColumnInternalName(0);
        String field = Utilities.ReduceField.KEY.toString() + "." + udafName + ":" + i + "."
            + outputColName;
        ColumnInfo colInfo = new ColumnInfo(field, gbInfo.distExprNodes.get(i).getTypeInfo(),
            null, false);
        colInfoLst.add(colInfo);
        colExprMap.put(field, gbInfo.distExprNodes.get(i));
      }
    }

    // 3. Add deduped UDAF args to reduce values
    ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbInfo.deDupedNonDistIrefs.size(); i++) {
      reduceValues.add(gbInfo.deDupedNonDistIrefs.get(i));
      outputColName = SemanticAnalyzer.getColumnInternalName(reduceValues.size() - 1);
      outputValueColumnNames.add(outputColName);
      String field = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
      colInfoLst.add(new ColumnInfo(field, reduceValues.get(reduceValues.size() - 1).getTypeInfo(),
          null, false));
      colExprMap.put(field, reduceValues.get(reduceValues.size() - 1));
    }
    // 4. Gen RS
    ReduceSinkOperator rsOp = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
        PlanUtils.getReduceSinkDesc(reduceKeys, keyLength, reduceValues, gbInfo.distColIndices,
            outputKeyColumnNames, outputValueColumnNames, true, -1,
            getNumPartFieldsForMapSideRS(gbInfo), getParallelismForMapSideRS(gbInfo),
            AcidUtils.Operation.NOT_ACID), new RowSchema(colInfoLst), inputOpAf.inputs.get(0));

    rsOp.setColumnExprMap(colExprMap);

    return new OpAttr("", new HashSet<Integer>(), rsOp);
  }
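  // genReduceSideGB2 below builds the second (final) reduce-side GB: it merges the
  // partials produced upstream in Mode.FINAL and emits the user-visible output
  // column names collected in gbInfo.outputColNames.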
  private static OpAttr genReduceSideGB2(OpAttr inputOpAf, GBInfo gbInfo) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    String colOutputName = null;
    ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
    ColumnInfo ci;

    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0,
        gbInfo.gbKeys.size() - 1, false, false);
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
      ci = rsColInfoLst.get(i);
      colOutputName = gbInfo.outputColNames.get(i);
      outputColNames.add(colOutputName);
      colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
      colExprMap.put(colOutputName, gbKeys.get(i));
    }

    // 1.2 Add GrpSet Col
    int groupingSetsPosition = -1;
    if (inclGrpSetInReduceSide(gbInfo) && gbInfo.grpIdFunctionNeeded) {
      groupingSetsPosition = gbKeys.size();
      ExprNodeDesc grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
          rsColInfoLst.get(groupingSetsPosition).getInternalName(), null, false);
      gbKeys.add(grpSetColExpr);
      colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
      outputColNames.add(colOutputName);
      colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
      colExprMap.put(colOutputName, grpSetColExpr);
    }

    // 2. Add UDAF
    UDAFAttrs udafAttr;
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    int udafStartPosInGBInfOutputColNames = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size()
        : gbInfo.gbKeys.size() * 2;
    int udafStartPosInInputRS = gbInfo.grpSets.isEmpty() ? gbInfo.gbKeys.size()
        : gbInfo.gbKeys.size() + 1;
    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
      udafAttr = gbInfo.udafAttrs.get(i);
      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();

      aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafStartPosInInputRS + i)));
      colOutputName = gbInfo.outputColNames.get(udafStartPosInGBInfOutputColNames + i);
      outputColNames.add(colOutputName);
      Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.FINAL,
          udafAttr.isDistinctUDAF);
      GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode,
          aggParameters);
      aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
          udaf.genericUDAFEvaluator, udaf.convertedParameters, false, udafMode));
      colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
    }

    Operator rsGBOp2 = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.FINAL,
        outputColNames, gbKeys, aggregations, false, gbInfo.groupByMemoryUsage,
        gbInfo.memoryThreshold, null, false, groupingSetsPosition, gbInfo.containsDistinctAggr),
        new RowSchema(colInfoLst), rs);

    rsGBOp2.setColumnExprMap(colExprMap);

    // TODO: Shouldn't we propagate the vc? Is it the vc col from the table, or all vcs?
    return new OpAttr("", new HashSet<Integer>(), rsGBOp2);
  }

  private static OpAttr genReduceSideGB1(OpAttr inputOpAf, GBInfo gbInfo, boolean computeGrpSet,
      boolean propagateConstInDistinctUDAF, GroupByDesc.Mode gbMode) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    String colOutputName = null;
    ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
    ColumnInfo ci;
    boolean finalGB = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_NO_SKEW_NO_ADD_MR_JOB);

    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0,
        gbInfo.gbKeys.size() - 1, false, false);
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
      ci = rsColInfoLst.get(i);
      if (finalGB) {
        colOutputName = gbInfo.outputColNames.get(i);
      } else {
        colOutputName = SemanticAnalyzer.getColumnInternalName(i);
      }
      outputColNames.add(colOutputName);
      colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), "", false));
      colExprMap.put(colOutputName, gbKeys.get(i));
    }

    // 1.2 Add GrpSet Col
    int groupingSetsColPosition = -1;
    if ((!finalGB && gbInfo.grpSets.size() > 0) || (finalGB && gbInfo.grpIdFunctionNeeded)) {
      groupingSetsColPosition = gbInfo.gbKeys.size();
      if (computeGrpSet) {
        // GrpSet Col needs to be constructed
        gbKeys.add(new ExprNodeConstantDesc("0"));
      } else {
        // GrpSet Col is already part of the input RS
        // TODO: Can't we just copy the ExprNodeDesc from the input? (Do we need to
        // explicitly set the table alias to null & VC to false?)
        gbKeys.addAll(ExprNodeDescUtils.genExprNodeDesc(rs, groupingSetsColPosition,
            groupingSetsColPosition, false, true));
      }

      colOutputName = SemanticAnalyzer.getColumnInternalName(groupingSetsColPosition);
      if (finalGB) {
        colOutputName = gbInfo.outputColNames.get(gbInfo.outputColNames.size() - 1);
      }
      outputColNames.add(colOutputName);
      colInfoLst.add(new ColumnInfo(colOutputName, TypeInfoFactory.stringTypeInfo, null, true));
      colExprMap.put(colOutputName, gbKeys.get(groupingSetsColPosition));
    }

    // 2. Walk through UDAF and add them to GB
    String lastReduceKeyColName = null;
    if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
      lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames()
          .get(rs.getConf().getOutputKeyColumnNames().size() - 1);
    }
    int numDistinctUDFs = 0;
    int distinctStartPosInReduceKeys = gbKeys.size();
    List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    int udafColStartPosInOriginalGB = (gbInfo.grpSets.size() > 0) ? gbInfo.gbKeys.size() * 2
        : gbInfo.gbKeys.size();
    int udafColStartPosInRS = rs.getConf().getKeyCols().size();
    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();

      if (udafAttr.isDistinctUDAF) {
        ColumnInfo rsDistUDAFParamColInfo;
        ExprNodeDesc distinctUDAFParam;
        ExprNodeDesc constantPropDistinctUDAFParam;
        for (int j = 0; j < udafAttr.udafParamsIndxInGBInfoDistExprs.size(); j++) {
          rsDistUDAFParamColInfo = rsColInfoLst.get(distinctStartPosInReduceKeys + j);
          String rsDistUDAFParamName = rsDistUDAFParamColInfo.getInternalName();
          // TODO: verify if this is needed
          if (lastReduceKeyColName != null) {
            rsDistUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName
                + ":" + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
          }

          distinctUDAFParam = new ExprNodeColumnDesc(rsDistUDAFParamColInfo.getType(),
              rsDistUDAFParamName, rsDistUDAFParamColInfo.getTabAlias(),
              rsDistUDAFParamColInfo.getIsVirtualCol());
          if (propagateConstInDistinctUDAF) {
            // TODO: Implement propConstDistUDAFParams
            constantPropDistinctUDAFParam = SemanticAnalyzer
                .isConstantParameterInAggregationParameters(
                    rsDistUDAFParamColInfo.getInternalName(), reduceValues);
            if (constantPropDistinctUDAFParam != null) {
              distinctUDAFParam = constantPropDistinctUDAFParam;
            }
          }
          aggParameters.add(distinctUDAFParam);
        }
        numDistinctUDFs++;
      } else {
        aggParameters.add(new ExprNodeColumnDesc(rsColInfoLst.get(udafColStartPosInRS + i)));
      }

      Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
      GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode,
          aggParameters);
      aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
          udaf.genericUDAFEvaluator, udaf.convertedParameters,
          (gbMode != GroupByDesc.Mode.FINAL && udafAttr.isDistinctUDAF), udafMode));
      if (finalGB) {
        colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + i);
      } else {
        colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size()
            + aggregations.size() - 1);
      }
      colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
      outputColNames.add(colOutputName);
    }

    // Nothing special needs to be done for grouping sets if this is the final
    // group by operator, and multiple rows corresponding to the grouping sets
    // have been generated upstream.
    // However, if an additional MR job has been created to handle grouping sets,
    // additional rows corresponding to grouping sets need to be created here.
    // TODO: Clean up/refactor assumptions
    boolean includeGrpSetInGBDesc = (gbInfo.grpSets.size() > 0)
        && !finalGB
        && !(gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.MAP_SIDE_GB_SKEW_GBKEYS_OR_DIST_UDAF_PRESENT);
    Operator rsGBOp = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames,
        gbKeys, aggregations, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, gbInfo.grpSets,
        includeGrpSetInGBDesc, groupingSetsColPosition, gbInfo.containsDistinctAggr),
        new RowSchema(colInfoLst), rs);

    rsGBOp.setColumnExprMap(colExprMap);

    return new OpAttr("", new HashSet<Integer>(), rsGBOp);
  }
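  // Unlike genReduceSideGB1 above, which merges map-side partial aggregations
  // (it is invoked with MERGEPARTIAL or PARTIALS), genReduceSideGB1NoMapGB below
  // aggregates raw shuffled rows: COMPLETE when it is the only GB stage, PARTIAL1
  // when a second reduce-side GB stage follows.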
  /**
   * RS-GB1
   *
   * @param inputOpAf
   * @param gbInfo
   * @param gbMode
   * @return
   * @throws SemanticException
   */
  private static OpAttr genReduceSideGB1NoMapGB(OpAttr inputOpAf, GBInfo gbInfo,
      GroupByDesc.Mode gbMode) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    String colOutputName = null;
    ReduceSinkOperator rs = (ReduceSinkOperator) inputOpAf.inputs.get(0);
    List<ColumnInfo> rsColInfoLst = rs.getSchema().getSignature();
    ColumnInfo ci;
    boolean useOriginalGBNames = (gbInfo.gbPhysicalPipelineMode == HIVEGBPHYSICALMODE.NO_MAP_SIDE_GB_NO_SKEW);

    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = ExprNodeDescUtils.genExprNodeDesc(rs, 0,
        gbInfo.gbKeys.size() - 1, true, false);
    for (int i = 0; i < gbInfo.gbKeys.size(); i++) {
      ci = rsColInfoLst.get(i);
      if (useOriginalGBNames) {
        colOutputName = gbInfo.outputColNames.get(i);
      } else {
        colOutputName = SemanticAnalyzer.getColumnInternalName(i);
      }
      outputColNames.add(colOutputName);
      colInfoLst.add(new ColumnInfo(colOutputName, ci.getType(), null, false));
      colExprMap.put(colOutputName, gbKeys.get(i));
    }

    // 2. Walk through UDAF and add them to GB
    String lastReduceKeyColName = null;
    if (!rs.getConf().getOutputKeyColumnNames().isEmpty()) {
      lastReduceKeyColName = rs.getConf().getOutputKeyColumnNames()
          .get(rs.getConf().getOutputKeyColumnNames().size() - 1);
    }
    int numDistinctUDFs = 0;
    List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    int udafColStartPosInOriginalGB = gbInfo.gbKeys.size();
    // the positions in rsColInfoLst are as follows
    //   --grpkey--,--distkey--,--values--
    // but a distinct UDAF may come before/after some non-distinct UDAFs,
    // i.e., their positions can be mixed.
    // so for each UDAF we first check whether its arg is a group-by key; if not,
    // whether it is a distinct key; if not, it should be a value
    List<Integer> distinctPositions = new ArrayList<>();
    Map<Integer, ArrayList<ExprNodeDesc>> indexToParameter = new TreeMap<>();
    for (int i = 0; i < gbInfo.udafAttrs.size(); i++) {
      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(i);
      ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();

      ColumnInfo rsUDAFParamColInfo;
      ExprNodeDesc udafParam;
      ExprNodeDesc constantPropDistinctUDAFParam;
      for (int j = 0; j < udafAttr.udafParams.size(); j++) {
        int argPos = getColInfoPos(udafAttr.udafParams.get(j), gbInfo);
        rsUDAFParamColInfo = rsColInfoLst.get(argPos);
        String rsUDAFParamName = rsUDAFParamColInfo.getInternalName();

        if (udafAttr.isDistinctUDAF && lastReduceKeyColName != null) {
          rsUDAFParamName = Utilities.ReduceField.KEY.name() + "." + lastReduceKeyColName + ":"
              + numDistinctUDFs + "." + SemanticAnalyzer.getColumnInternalName(j);
        }
        udafParam = new ExprNodeColumnDesc(rsUDAFParamColInfo.getType(), rsUDAFParamName,
            rsUDAFParamColInfo.getTabAlias(), rsUDAFParamColInfo.getIsVirtualCol());
        constantPropDistinctUDAFParam = SemanticAnalyzer
            .isConstantParameterInAggregationParameters(rsUDAFParamColInfo.getInternalName(),
                reduceValues);
        if (constantPropDistinctUDAFParam != null) {
          udafParam = constantPropDistinctUDAFParam;
        }
        aggParameters.add(udafParam);
      }
      indexToParameter.put(i, aggParameters);
      if (udafAttr.isDistinctUDAF) {
        numDistinctUDFs++;
      }
    }

    for (int index : indexToParameter.keySet()) {
      UDAFAttrs udafAttr = gbInfo.udafAttrs.get(index);
      Mode udafMode = SemanticAnalyzer.groupByDescModeToUDAFMode(gbMode, udafAttr.isDistinctUDAF);
      GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, udafMode,
          indexToParameter.get(index));
      aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
          udaf.genericUDAFEvaluator, udaf.convertedParameters, udafAttr.isDistinctUDAF, udafMode));
      if (useOriginalGBNames) {
        colOutputName = gbInfo.outputColNames.get(udafColStartPosInOriginalGB + index);
      } else {
        colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size()
            + aggregations.size() - 1);
      }
      colInfoLst.add(new ColumnInfo(colOutputName, udaf.returnType, "", false));
      outputColNames.add(colOutputName);
    }

    Operator rsGB1 = OperatorFactory.getAndMakeChild(new GroupByDesc(gbMode, outputColNames,
        gbKeys, aggregations, false, gbInfo.groupByMemoryUsage, gbInfo.memoryThreshold, null,
        false, -1, numDistinctUDFs > 0), new RowSchema(colInfoLst), rs);

    rsGB1.setColumnExprMap(colExprMap);

    return new OpAttr("", new HashSet<Integer>(), rsGB1);
  }

  private static int getColInfoPos(ExprNodeDesc aggExpr, GBInfo gbInfo) {
    // first see if it is a gb key
    int gbKeyIndex = ExprNodeDescUtils.indexOf(aggExpr, gbInfo.gbKeys);
    if (gbKeyIndex < 0) {
      // then check if it is a distinct key
      int distinctKeyIndex = ExprNodeDescUtils.indexOf(aggExpr, gbInfo.distExprNodes);
      if (distinctKeyIndex < 0) {
        // lastly it should be in deDupedNonDistIrefs
        int deDupValIndex = ExprNodeDescUtils.indexOf(aggExpr, gbInfo.deDupedNonDistIrefs);
        assert (deDupValIndex >= 0);
        return gbInfo.gbKeys.size() + gbInfo.distExprNodes.size() + deDupValIndex;
      } else {
        // aggExpr is part of the distinct key
        return gbInfo.gbKeys.size() + distinctKeyIndex;
      }
    } else {
      return gbKeyIndex;
    }
  }
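  // genMapSideGB below builds the hash-mode map-side GB: grouping keys plus any
  // distinct-UDAF arguments become GB keys (so distinct values are deduplicated
  // per mapper), while UDAF partials are computed under hash aggregation.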
  @SuppressWarnings("unchecked")
  private static OpAttr genMapSideGB(OpAttr inputOpAf, GBInfo gbAttrs) throws SemanticException {
    ArrayList<String> outputColNames = new ArrayList<String>();
    ArrayList<ColumnInfo> colInfoLst = new ArrayList<ColumnInfo>();
    Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
    Set<String> gbKeyColsAsNamesFrmIn = new HashSet<String>();
    String colOutputName = null;

    // 1. Build GB Keys, grouping set starting position
    // 1.1 First Add original GB Keys
    ArrayList<ExprNodeDesc> gbKeys = new ArrayList<ExprNodeDesc>();
    for (int i = 0; i < gbAttrs.gbKeys.size(); i++) {
      gbKeys.add(gbAttrs.gbKeys.get(i));
      colOutputName = SemanticAnalyzer.getColumnInternalName(i);
      colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.gbKeyTypes.get(i), "", false));
      outputColNames.add(colOutputName);
      gbKeyColsAsNamesFrmIn.add(gbAttrs.gbKeyColNamesInInput.get(i));
      colExprMap.put(colOutputName, gbKeys.get(i));
    }

    // 1.2. Adjust GroupingSet Position, GBKeys for GroupingSet Position if needed.
    // NOTE: GroupingID is added to the map side GB only if the GrpSet doesn't
    // require an additional MR Job
    int groupingSetsPosition = -1;
    boolean inclGrpID = inclGrpSetInMapSide(gbAttrs);
    if (inclGrpID) {
      groupingSetsPosition = gbKeys.size();
      addGrpSetCol(true, null, false, gbKeys, outputColNames, colInfoLst, colExprMap);
    }

    // 1.3. Add all distinct params
    // NOTE: distinct expr cannot be part of the GB key (we assume plan
    // gen would have prevented it)
    for (int i = 0; i < gbAttrs.distExprNodes.size(); i++) {
      if (!gbKeyColsAsNamesFrmIn.contains(gbAttrs.distExprNames.get(i))) {
        gbKeys.add(gbAttrs.distExprNodes.get(i));
        colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() - 1);
        colInfoLst.add(new ColumnInfo(colOutputName, gbAttrs.distExprTypes.get(i), "", false));
        outputColNames.add(colOutputName);
        gbKeyColsAsNamesFrmIn.add(gbAttrs.distExprNames.get(i));
        colExprMap.put(colOutputName, gbKeys.get(gbKeys.size() - 1));
      }
    }

    // 2. Build Aggregations
    ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
    for (UDAFAttrs udafAttr : gbAttrs.udafAttrs) {
      Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH,
          udafAttr.isDistinctUDAF);
      aggregations.add(new AggregationDesc(udafAttr.udafName.toLowerCase(),
          udafAttr.udafEvaluator, udafAttr.udafParams, udafAttr.isDistinctUDAF, amode));
      GenericUDAFInfo udafInfo;
      try {
        udafInfo = SemanticAnalyzer.getGenericUDAFInfo(udafAttr.udafEvaluator, amode,
            udafAttr.udafParams);
      } catch (SemanticException e) {
        throw new RuntimeException(e);
      }
      colOutputName = SemanticAnalyzer.getColumnInternalName(gbKeys.size() + aggregations.size()
          - 1);
      colInfoLst.add(new ColumnInfo(colOutputName, udafInfo.returnType, "", false));
      outputColNames.add(colOutputName);
    }

    // 3. Create GB
    @SuppressWarnings("rawtypes")
    Operator gbOp = OperatorFactory.getAndMakeChild(new GroupByDesc(GroupByDesc.Mode.HASH,
        outputColNames, gbKeys, aggregations, false, gbAttrs.groupByMemoryUsage,
        gbAttrs.memoryThreshold, gbAttrs.grpSets, inclGrpID, groupingSetsPosition,
        gbAttrs.containsDistinctAggr), new RowSchema(colInfoLst), inputOpAf.inputs.get(0));

    // 4. Setup Expr Col Map
    // NOTE: UDAFs are not included in the ExprColMap
    gbOp.setColumnExprMap(colExprMap);

    return new OpAttr("", new HashSet<Integer>(), gbOp);
  }

  private static void addGrpSetCol(boolean createConstantExpr, String grpSetIDExprName,
      boolean addReducePrefixToColInfoName, List<ExprNodeDesc> exprLst,
      List<String> outputColumnNames, List<ColumnInfo> colInfoLst,
      Map<String, ExprNodeDesc> colExprMap) throws SemanticException {
    String outputColName = null;
    ExprNodeDesc grpSetColExpr = null;

    if (createConstantExpr) {
      grpSetColExpr = new ExprNodeConstantDesc("0");
    } else {
      grpSetColExpr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, grpSetIDExprName,
          null, false);
    }
    exprLst.add(grpSetColExpr);

    outputColName = SemanticAnalyzer.getColumnInternalName(exprLst.size() - 1);
    outputColumnNames.add(outputColName);
    String internalColName = outputColName;
    if (addReducePrefixToColInfoName) {
      internalColName = Utilities.ReduceField.KEY.toString() + "." + outputColName;
    }
    colInfoLst.add(new ColumnInfo(internalColName, grpSetColExpr.getTypeInfo(), null, true));
    colExprMap.put(internalColName, grpSetColExpr);
  }
  /**
   * Get Reduce Keys for the RS following a MapSide GB
   *
   * @param inOp
   *          the input operator; its schema is assumed to supply a deduped list of exprs
   * @param outputKeyColumnNames
   * @param colExprMap
   * @return List of ExprNodeDesc of ReduceKeys
   * @throws SemanticException
   */
  private static ArrayList<ExprNodeDesc> getReduceKeysForRS(Operator inOp, int startPos,
      int endPos, List<String> outputKeyColumnNames, boolean addOnlyOneKeyColName,
      ArrayList<ColumnInfo> colInfoLst, Map<String, ExprNodeDesc> colExprMap,
      boolean addEmptyTabAlias, boolean setColToNonVirtual) throws SemanticException {
    ArrayList<ExprNodeDesc> reduceKeys = null;
    if (endPos < 0) {
      reduceKeys = new ArrayList<ExprNodeDesc>();
    } else {
      reduceKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, startPos, endPos, addEmptyTabAlias,
          setColToNonVirtual);
      int outColNameIndx = startPos;
      for (int i = 0; i < reduceKeys.size(); ++i) {
        String outputColName = SemanticAnalyzer.getColumnInternalName(outColNameIndx);
        outColNameIndx++;
        if (!addOnlyOneKeyColName || i == 0) {
          outputKeyColumnNames.add(outputColName);
        }

        // TODO: Verify if this is needed (Why can't it always be null/empty?)
        String tabAlias = addEmptyTabAlias ? "" : null;
        ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.KEY.toString() + "."
            + outputColName, reduceKeys.get(i).getTypeInfo(), tabAlias, false);
        colInfoLst.add(colInfo);
        colExprMap.put(colInfo.getInternalName(), reduceKeys.get(i));
      }
    }

    return reduceKeys;
  }

  /**
   * Get Value Keys for the RS following a MapSide GB
   *
   * @param inOp
   *          the MapSide GB
   * @param outputKeyColumnNames
   * @param colExprMap
   * @return List of ExprNodeDesc of Values
   * @throws SemanticException
   */
  private static ArrayList<ExprNodeDesc> getValueKeysForRS(Operator inOp, int aggStartPos,
      List<String> outputKeyColumnNames, ArrayList<ColumnInfo> colInfoLst,
      Map<String, ExprNodeDesc> colExprMap, boolean addEmptyTabAlias,
      boolean setColToNonVirtual) throws SemanticException {
    List<ColumnInfo> mapGBColInfoLst = inOp.getSchema().getSignature();
    ArrayList<ExprNodeDesc> valueKeys = null;
    if (aggStartPos >= mapGBColInfoLst.size()) {
      valueKeys = new ArrayList<ExprNodeDesc>();
    } else {
      valueKeys = ExprNodeDescUtils.genExprNodeDesc(inOp, aggStartPos,
          mapGBColInfoLst.size() - 1, true, setColToNonVirtual);
      for (int i = 0; i < valueKeys.size(); ++i) {
        String outputColName = SemanticAnalyzer.getColumnInternalName(i);
        outputKeyColumnNames.add(outputColName);
        // TODO: Verify if this is needed (Why can't it always be null/empty?)
        String tabAlias = addEmptyTabAlias ? "" : null;
        ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.VALUE.toString() + "."
            + outputColName, valueKeys.get(i).getTypeInfo(), tabAlias, false);
        colInfoLst.add(colInfo);
        colExprMap.put(colInfo.getInternalName(), valueKeys.get(i));
      }
    }

    return valueKeys;
  }

  // TODO: Implement this
  private static ExprNodeDesc propConstDistUDAFParams() {
    return null;
  }
}