/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.correlation;

import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVECONVERTJOIN;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.PTFOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.optimizer.Transform;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * If two reduce sink operators share the same partition/sort columns and order,
 * they can be merged. This should happen after map join optimization because map
 * join optimization will remove reduce sink operators.
 *
 * This optimizer removes/replaces the child RS (not the parent), which is the
 * safer approach for DefaultGraphWalker.
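 *
 * An illustrative example (a hypothetical plan, not taken from this file): in
 *   SELECT key, SUM(value) FROM (SELECT key, value FROM src CLUSTER BY key) t GROUP BY key
 * both the CLUSTER BY and the GROUP BY introduce a ReduceSinkOperator keyed on
 * "key"; after deduplication the child RS is removed and the query can run in
 * a single reduce stage.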
 */
public class ReduceSinkDeDuplication extends Transform {

  protected static final Logger LOG = LoggerFactory.getLogger(ReduceSinkDeDuplication.class);

  private static final String RS = ReduceSinkOperator.getOperatorName();
  private static final String GBY = GroupByOperator.getOperatorName();
  private static final String JOIN = JoinOperator.getOperatorName();

  protected ParseContext pGraphContext;

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    pGraphContext = pctx;

    // generate pruned column list for all relevant operators
    ReduceSinkDeduplicateProcCtx cppCtx = new ReduceSinkDeduplicateProcCtx(pGraphContext);

    // for auto-converted map joins, it is not safe to dedup here (TODO)
    boolean mergeJoins = !pctx.getConf().getBoolVar(HIVECONVERTJOIN) &&
        !pctx.getConf().getBoolVar(HIVECONVERTJOINNOCONDITIONALTASK) &&
        !pctx.getConf().getBoolVar(ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ) &&
        !pctx.getConf().getBoolVar(ConfVars.HIVEDYNAMICPARTITIONHASHJOIN);

    // If multiple rules can be matched with the same cost, the last rule will be
    // chosen as the processor; see DefaultRuleDispatcher#dispatch()
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", RS + "%.*%" + RS + "%"),
        ReduceSinkDeduplicateProcFactory.getReducerReducerProc());
    opRules.put(new RuleRegExp("R2", RS + "%" + GBY + "%.*%" + RS + "%"),
        ReduceSinkDeduplicateProcFactory.getGroupbyReducerProc());
    if (mergeJoins) {
      opRules.put(new RuleRegExp("R3", JOIN + "%.*%" + RS + "%"),
          ReduceSinkDeduplicateProcFactory.getJoinReducerProc());
    }
    // TODO RS+JOIN

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(
        ReduceSinkDeduplicateProcFactory.getDefaultProc(), opRules, cppCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top op nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pGraphContext.getTopOps().values());
    ogw.startWalking(topNodes, null);

    return pGraphContext;
  }

  protected class ReduceSinkDeduplicateProcCtx extends AbstractCorrelationProcCtx {
    public ReduceSinkDeduplicateProcCtx(ParseContext pctx) {
      super(pctx);
    }
  }

  static class ReduceSinkDeduplicateProcFactory {

    public static NodeProcessor getReducerReducerProc() {
      return new ReducerReducerProc();
    }

    public static NodeProcessor getGroupbyReducerProc() {
      return new GroupbyReducerProc();
    }

    public static NodeProcessor getJoinReducerProc() {
      return new JoinReducerProc();
    }

    public static NodeProcessor getDefaultProc() {
      return new DefaultProc();
    }
  }

  /*
   * does nothing
   */
  static class DefaultProc implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      return null;
    }
  }
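  // Shared entry point for the concrete processors below: process(Node, ...)
  // inspects the single child of the matched ReduceSinkOperator and routes to
  // one of two abstract callbacks, depending on whether the child RS feeds a
  // GroupByOperator (the ...-cRS-cGBY shape) or a SelectOperator (the plain
  // ...-cRS shape). A JoinOperator below the child RS is not supported.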
  public abstract static class AbsctractReducerReducerProc implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      ReduceSinkDeduplicateProcCtx dedupCtx = (ReduceSinkDeduplicateProcCtx) procCtx;
      if (dedupCtx.hasBeenRemoved((Operator<?>) nd)) {
        return false;
      }
      ReduceSinkOperator cRS = (ReduceSinkOperator) nd;
      Operator<?> child = CorrelationUtilities.getSingleChild(cRS);
      if (child instanceof JoinOperator) {
        return false; // not supported
      }
      if (child instanceof GroupByOperator) {
        GroupByOperator cGBY = (GroupByOperator) child;
        if (!CorrelationUtilities.hasGroupingSet(cRS) && !cGBY.getConf().isGroupingSetsPresent()) {
          return process(cRS, cGBY, dedupCtx);
        }
        return false;
      }
      if (child instanceof SelectOperator) {
        return process(cRS, dedupCtx);
      }
      return false;
    }

    protected abstract Object process(ReduceSinkOperator cRS,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException;

    protected abstract Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException;

    // for the JOIN-RS case, it is generally not possible to merge if the child
    // has fewer key/partition columns than the parents
    protected boolean merge(ReduceSinkOperator cRS, JoinOperator pJoin, int minReducer)
        throws SemanticException {
      List<Operator<?>> parents = pJoin.getParentOperators();
      ReduceSinkOperator[] pRSs = parents.toArray(new ReduceSinkOperator[parents.size()]);
      ReduceSinkDesc cRSc = cRS.getConf();
      for (ReduceSinkOperator pRS : pRSs) {
        ReduceSinkDesc pRSc = pRS.getConf();
        if (cRSc.getKeyCols().size() != pRSc.getKeyCols().size()) {
          return false;
        }
        if (cRSc.getPartitionCols().size() != pRSc.getPartitionCols().size()) {
          return false;
        }
        Integer moveReducerNumTo = checkNumReducer(cRSc.getNumReducers(), pRSc.getNumReducers());
        if (moveReducerNumTo == null ||
            (moveReducerNumTo > 0 && cRSc.getNumReducers() < minReducer)) {
          return false;
        }
        Integer moveRSOrderTo = checkOrder(true, cRSc.getOrder(), pRSc.getOrder(),
            cRSc.getNullOrder(), pRSc.getNullOrder());
        if (moveRSOrderTo == null) {
          return false;
        }
      }
      boolean[] sorted = CorrelationUtilities.getSortedTags(pJoin);
      int cKeySize = cRSc.getKeyCols().size();
      for (int i = 0; i < cKeySize; i++) {
        ExprNodeDesc cexpr = cRSc.getKeyCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getKeyCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      int cPartSize = cRSc.getPartitionCols().size();
      for (int i = 0; i < cPartSize; i++) {
        ExprNodeDesc cexpr = cRSc.getPartitionCols().get(i);
        ExprNodeDesc[] pexprs = new ExprNodeDesc[pRSs.length];
        for (int tag = 0; tag < pRSs.length; tag++) {
          pexprs[tag] = pRSs[tag].getConf().getPartitionCols().get(i);
        }
        int found = CorrelationUtilities.indexOf(cexpr, pexprs, cRS, pRSs, sorted);
        if (found != i) {
          return false;
        }
      }
      for (ReduceSinkOperator pRS : pRSs) {
        pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
      }
      return true;
    }

    /**
     * The current RS dedup removes/replaces the child RS. For key columns,
     * sorting order, and the number of reducers, the more specific part of the
     * child RS configuration is copied to the parent RS. For partitioning
     * columns, if both the child RS and the parent RS have been assigned
     * partitioning columns, the more general set is chosen. If the parent RS
     * has not been assigned any partitioning column, the partitioning columns
     * (if any) of the child RS are used.
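     *
     * For example (hypothetical column sets): if the parent RS sorts on (a, b)
     * and partitions on (a, b), while the child RS sorts on (a, b, c) and
     * partitions on (a), the merged parent RS sorts on (a, b, c) (the more
     * specific key list) but partitions on (a) (the more general set), which
     * still routes every row group required by either operator to a single
     * reducer.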
     */
    protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minReducer)
        throws SemanticException {
      int[] result = extractMergeDirections(cRS, pRS, minReducer);
      if (result == null) {
        return false;
      }

      if (result[0] > 0) {
        // The sorting columns of the child RS are more specific than
        // those of the parent RS. Assign the sorting columns of the child RS
        // to the parent RS.
        List<ExprNodeDesc> childKCs = cRS.getConf().getKeyCols();
        pRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(childKCs, cRS, pRS));
      }

      if (result[1] < 0) {
        // The partitioning columns of the parent RS are more specific than
        // those of the child RS.
        List<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
        if (childPCs != null && !childPCs.isEmpty()) {
          // If the partitioning columns of the child RS are assigned,
          // assign them to the parent RS.
          pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
        }
      } else if (result[1] > 0) {
        // The partitioning columns of the child RS are more specific than
        // those of the parent RS.
        List<ExprNodeDesc> parentPCs = pRS.getConf().getPartitionCols();
        if (parentPCs == null || parentPCs.isEmpty()) {
          // If the partitioning columns of the parent RS are not assigned,
          // assign the partitioning columns of the child RS to the parent RS.
          ArrayList<ExprNodeDesc> childPCs = cRS.getConf().getPartitionCols();
          pRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(childPCs, cRS, pRS));
        }
      }

      if (result[2] > 0) {
        // The sorting order of the child RS is more specific than
        // that of the parent RS. Assign the sorting order of the child RS
        // to the parent RS.
        if (result[0] <= 0) {
          // The sorting columns of the parent RS are more specific than those
          // of the child RS, but the sorting order of the child RS is more
          // specific than that of the parent RS.
          throw new SemanticException("Sorting columns and order don't match. " +
              "Try set " + HiveConf.ConfVars.HIVEOPTREDUCEDEDUPLICATION + "=false;");
        }
        pRS.getConf().setOrder(cRS.getConf().getOrder());
        pRS.getConf().setNullOrder(cRS.getConf().getNullOrder());
      } else {
        // The sorting order of the parent RS is more specific, or they are equal.
        // Copy the order from the child RS, and then fill in the order of the
        // remaining columns with the one taken from the parent RS.
        StringBuilder order = new StringBuilder(cRS.getConf().getOrder());
        StringBuilder orderNull = new StringBuilder(cRS.getConf().getNullOrder());
        order.append(pRS.getConf().getOrder().substring(order.length()));
        orderNull.append(pRS.getConf().getNullOrder().substring(orderNull.length()));
        pRS.getConf().setOrder(order.toString());
        pRS.getConf().setNullOrder(orderNull.toString());
      }

      if (result[3] > 0) {
        // The number of reducers of the child RS is more specific than
        // that of the parent RS. Assign the number of reducers of the child RS
        // to the parent RS.
        pRS.getConf().setNumReducers(cRS.getConf().getNumReducers());
      }

      if (result[4] > 0) {
        // This case happens only when the pRS key is empty, in which case we
        // can use the number of distribution keys and the key serialization
        // info from cRS.
        if (pRS.getConf().getKeyCols() != null && pRS.getConf().getKeyCols().size() == 0 &&
            cRS.getConf().getKeyCols() != null && cRS.getConf().getKeyCols().size() == 0) {
          // As setNumDistributionKeys is a subset of key cols, the size should
          // be 0 too. This condition may be too strict; we may extend it in
          // the future.
          TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(new ArrayList<FieldSchema>(),
              pRS.getConf().getOrder(), pRS.getConf().getNullOrder());
          pRS.getConf().setKeySerializeInfo(keyTable);
        }
      }
      return true;
    }
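    // A hypothetical illustration of the direction vector consumed by merge()
    // above and produced by extractMergeDirections() below: {1, -1, 0, 0, 1}
    // reads as "the child's key columns are more specific (index 0), the
    // parent's partition columns are more specific (index 1), the orders and
    // reducer counts are interchangeable (indexes 2 and 3), and the child's
    // distribution-key setting is the more specific one (index 4)".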
    /**
     * Returns merge directions between two RSs for each criterion (ordering,
     * number of reducers, reducer keys, partition keys). Returns null if any
     * category is not mergeable.
     *
     * The value at each index can be -1, 0, or 1:
     *   0 means the two configurations in the category are the same;
     *  -1 means the configuration of the parent RS is more specific;
     *   1 means the configuration of the child RS is more specific.
     */
    private int[] extractMergeDirections(ReduceSinkOperator cRS, ReduceSinkOperator pRS,
        int minReducer) throws SemanticException {
      ReduceSinkDesc cConf = cRS.getConf();
      ReduceSinkDesc pConf = pRS.getConf();
      // If there is a PTF between cRS and pRS we cannot ignore the order direction
      final boolean checkStrictEquality = isStrictEqualityNeeded(cRS, pRS);
      Integer moveRSOrderTo = checkOrder(checkStrictEquality, cConf.getOrder(), pConf.getOrder(),
          cConf.getNullOrder(), pConf.getNullOrder());
      if (moveRSOrderTo == null) {
        return null;
      }
      // if cRS is being used for distinct, the two reduce sinks are incompatible
      if (cConf.getDistinctColumnIndices().size() >= 2) {
        return null;
      }
      Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
      if (moveReducerNumTo == null ||
          (moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer)) {
        return null;
      }
      List<ExprNodeDesc> ckeys = cConf.getKeyCols();
      List<ExprNodeDesc> pkeys = pConf.getKeyCols();
      Integer moveKeyColTo = checkExprs(ckeys, pkeys, cRS, pRS);
      if (moveKeyColTo == null) {
        return null;
      }
      List<ExprNodeDesc> cpars = cConf.getPartitionCols();
      List<ExprNodeDesc> ppars = pConf.getPartitionCols();
      Integer movePartitionColTo = checkExprs(cpars, ppars, cRS, pRS);
      if (movePartitionColTo == null) {
        return null;
      }
      Integer moveNumDistKeyTo = checkNumDistributionKey(cConf.getNumDistributionKeys(),
          pConf.getNumDistributionKeys());
      return new int[] {moveKeyColTo, movePartitionColTo, moveRSOrderTo,
          moveReducerNumTo, moveNumDistKeyTo};
    }

    private boolean isStrictEqualityNeeded(ReduceSinkOperator cRS, ReduceSinkOperator pRS) {
      Operator<? extends OperatorDesc> parent = cRS.getParentOperators().get(0);
      while (parent != pRS) {
        assert parent.getNumParent() == 1;
        if (parent instanceof PTFOperator) {
          return true;
        }
        parent = parent.getParentOperators().get(0);
      }
      return false;
    }

    private Integer checkNumDistributionKey(int cnd, int pnd) {
      // The number of distribution keys of cRS is chosen only when numDistKeys
      // of pRS is 0 or less. In all other cases, the distribution of the keys
      // is based on pRS, which is more generic than cRS.
      // Examples:
      // case 1: if the pRS sort key is (a, b), the cRS sort key is (a, b, c),
      // and the numbers of distribution keys are 2 and 3 respectively, then
      // after the merge the sort keys will be (a, b, c) while the number of
      // distribution keys will be 2.
      // case 2: if the pRS sort key is empty with 0 distribution keys, and the
      // cRS sort key is (a, b) with 2 distribution keys, then after the merge
      // the new sort key will be (a, b) and the number of distribution keys
      // will be 2.
      if (pnd <= 0) {
        return 1;
      }
      return 0;
    }

    /**
     * The overlapping part of the keys should be the same between parent and
     * child. And if the child has more keys than the parent, the
     * non-overlapping part of the keys should be backtrackable to the parent.
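     *
     * For example (hypothetical key lists): child keys (a, b, c) against
     * parent keys (a, b) are mergeable when the backtracked child prefix
     * (a, b) matches the parent's (a, b) and c can itself be backtracked to an
     * expression over the parent's input; the result is then 1 (child more
     * specific).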
     */
    private Integer checkExprs(List<ExprNodeDesc> ckeys, List<ExprNodeDesc> pkeys,
        ReduceSinkOperator cRS, ReduceSinkOperator pRS) throws SemanticException {
      // If ckeys or pkeys contain constant node expressions, avoid the merge.
      if (ckeys != null) {
        for (ExprNodeDesc ck : ckeys) {
          if (ck instanceof ExprNodeConstantDesc) {
            return null;
          }
        }
      }
      if (pkeys != null) {
        for (ExprNodeDesc pk : pkeys) {
          if (pk instanceof ExprNodeConstantDesc) {
            return null;
          }
        }
      }
      Integer moveKeyColTo = 0;
      if (ckeys == null || ckeys.isEmpty()) {
        if (pkeys != null && !pkeys.isEmpty()) {
          moveKeyColTo = -1;
        }
      } else {
        if (pkeys == null || pkeys.isEmpty()) {
          for (ExprNodeDesc ckey : ckeys) {
            if (ExprNodeDescUtils.backtrack(ckey, cRS, pRS) == null) {
              // cKey is not present in parent
              return null;
            }
          }
          moveKeyColTo = 1;
        } else {
          moveKeyColTo = sameKeys(ckeys, pkeys, cRS, pRS);
        }
      }
      return moveKeyColTo;
    }

    // backtrack the key expressions of the child to the parent and compare
    // them with the parent's
    protected Integer sameKeys(List<ExprNodeDesc> cexprs, List<ExprNodeDesc> pexprs,
        Operator<?> child, Operator<?> parent) throws SemanticException {
      int common = Math.min(cexprs.size(), pexprs.size());
      int limit = Math.max(cexprs.size(), pexprs.size());
      int i = 0;
      for (; i < common; i++) {
        ExprNodeDesc pexpr = pexprs.get(i);
        ExprNodeDesc cexpr = ExprNodeDescUtils.backtrack(cexprs.get(i), child, parent);
        if (cexpr == null || !pexpr.isSame(cexpr)) {
          return null;
        }
      }
      for (; i < limit; i++) {
        if (cexprs.size() > pexprs.size()) {
          if (ExprNodeDescUtils.backtrack(cexprs.get(i), child, parent) == null) {
            // cKey is not present in parent
            return null;
          }
        }
      }
      return Integer.valueOf(cexprs.size()).compareTo(pexprs.size());
    }

    protected Integer checkOrder(boolean checkStrictEquality, String corder, String porder,
        String cNullOrder, String pNullOrder) {
      if (corder == null || corder.trim().equals("")) {
        if (porder == null || porder.trim().equals("")) {
          return 0;
        }
        return -1;
      }
      if (porder == null || porder.trim().equals("")) {
        return 1;
      }
      assert corder.length() == cNullOrder.length();
      assert porder.length() == pNullOrder.length();
      corder = corder.trim();
      porder = porder.trim();
      if (checkStrictEquality) {
        // the order of the overlapping keys should be exactly the same
        cNullOrder = cNullOrder.trim();
        pNullOrder = pNullOrder.trim();
        int target = Math.min(corder.length(), porder.length());
        if (!corder.substring(0, target).equals(porder.substring(0, target)) ||
            !cNullOrder.substring(0, target).equals(pNullOrder.substring(0, target))) {
          return null;
        }
      }
      return Integer.valueOf(corder.length()).compareTo(porder.length());
    }

    /**
     * If the number of reducers for an RS is -1, the RS can have any number of
     * reducers. This is generally true except for order-by or forced bucketing
     * cases. If neither number of reducers is -1, the two numbers should be
     * the same.
     */
    protected Integer checkNumReducer(int creduce, int preduce) {
      if (creduce < 0) {
        if (preduce < 0) {
          return 0;
        }
        return -1;
      }
      if (preduce < 0) {
        return 1;
      }
      if (creduce != preduce) {
        return null;
      }
      return 0;
    }
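    // The "aggressive" variant below goes further than merge(): instead of
    // replacing the child RS with a SELECT, it rewrites the child RS in terms
    // of the parent RS's input and removes the parent RS together with any
    // intermediate SelectOperators. It only applies when the path between the
    // two reduce sinks is exactly pRS-SEL*-cRS and every key, partition,
    // value, and bucket column of cRS can be backtracked to pRS.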
    protected boolean aggressiveDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      assert cRS.getNumParent() == 1;

      ReduceSinkDesc cConf = cRS.getConf();
      ReduceSinkDesc pConf = pRS.getConf();
      List<ExprNodeDesc> cKeys = cConf.getKeyCols();
      List<ExprNodeDesc> pKeys = pConf.getKeyCols();

      // Check that on the path between cRS and pRS there are only Select
      // operators, i.e. the sequence must be pRS-SEL*-cRS
      Operator<? extends OperatorDesc> parent = cRS.getParentOperators().get(0);
      while (parent != pRS) {
        assert parent.getNumParent() == 1;
        if (!(parent instanceof SelectOperator)) {
          return false;
        }
        parent = parent.getParentOperators().get(0);
      }

      // If the child keys are null or empty, bail out
      if (cKeys == null || cKeys.isEmpty()) {
        return false;
      }

      // If the parent keys are null or empty, bail out
      if (pKeys == null || pKeys.isEmpty()) {
        return false;
      }

      // Backtrack the key columns of cRS to pRS;
      // if any column cannot be backtracked, bail out
      List<ExprNodeDesc> cKeysInParentRS = ExprNodeDescUtils.backtrack(cKeys, cRS, pRS);
      for (ExprNodeDesc pexpr : cKeysInParentRS) {
        if (pexpr == null) {
          return false;
        }
      }
      cRS.getConf().setKeyCols(ExprNodeDescUtils.backtrack(cKeysInParentRS, cRS, pRS));

      // Backtrack the partition columns of cRS to pRS;
      // if any column cannot be backtracked, bail out
      List<ExprNodeDesc> cPartitionInParentRS = ExprNodeDescUtils.backtrack(
          cConf.getPartitionCols(), cRS, pRS);
      for (ExprNodeDesc pexpr : cPartitionInParentRS) {
        if (pexpr == null) {
          return false;
        }
      }
      cRS.getConf().setPartitionCols(ExprNodeDescUtils.backtrack(cPartitionInParentRS, cRS, pRS));

      // Backtrack the value columns of cRS to pRS;
      // if any column cannot be backtracked, bail out
      List<ExprNodeDesc> cValueInParentRS = ExprNodeDescUtils.backtrack(
          cConf.getValueCols(), cRS, pRS);
      for (ExprNodeDesc pexpr : cValueInParentRS) {
        if (pexpr == null) {
          return false;
        }
      }
      cRS.getConf().setValueCols(ExprNodeDescUtils.backtrack(cValueInParentRS, cRS, pRS));

      // Backtrack the bucket columns of cRS (if any) to pRS;
      // if any column cannot be backtracked, bail out
      if (cConf.getBucketCols() != null) {
        List<ExprNodeDesc> cBucketInParentRS = ExprNodeDescUtils.backtrack(
            cConf.getBucketCols(), cRS, pRS);
        for (ExprNodeDesc pexpr : cBucketInParentRS) {
          if (pexpr == null) {
            return false;
          }
        }
        cRS.getConf().setBucketCols(ExprNodeDescUtils.backtrack(cBucketInParentRS, cRS, pRS));
      }

      // Update the column expression map
      for (Entry<String, ExprNodeDesc> e : cRS.getColumnExprMap().entrySet()) {
        e.setValue(ExprNodeDescUtils.backtrack(e.getValue(), cRS, pRS));
      }

      // Replace pRS with cRS and remove the operator sequence from pRS to cRS;
      // recall that the sequence must be pRS-SEL*-cRS
      parent = cRS.getParentOperators().get(0);
      while (parent != pRS) {
        dedupCtx.addRemovedOperator(parent);
        parent = parent.getParentOperators().get(0);
      }
      dedupCtx.addRemovedOperator(pRS);
      cRS.getParentOperators().clear();
      for (Operator<? extends OperatorDesc> op : pRS.getParentOperators()) {
        op.replaceChild(pRS, cRS);
        cRS.getParentOperators().add(op);
      }
      pRS.getParentOperators().clear();
      pRS.getChildOperators().clear();
      return true;
    }
  }
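  // Example of the pRS-pGBY-cRS shape handled by GroupbyReducerProc below
  // (an illustrative query, not taken from this file):
  //   SELECT key, COUNT(*) FROM src GROUP BY key ORDER BY key
  // The ORDER BY introduces a second ReduceSinkOperator after the group-by;
  // when the two sinks are mergeable, the child RS is replaced with a SELECT
  // and the sort is performed by the group-by stage instead.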
  static class GroupbyReducerProc extends AbsctractReducerReducerProc {

    // pRS-pGBY-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      GroupByOperator pGBY = CorrelationUtilities.findPossibleParent(
          cRS, GroupByOperator.class, dedupCtx.trustScript());
      if (pGBY == null) {
        return false;
      }
      ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
          pGBY, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }

    // pRS-pGBY-cRS-cGBY
    @Override
    public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      GroupByOperator pGBY = CorrelationUtilities.findPossibleParent(
          start, GroupByOperator.class, dedupCtx.trustScript());
      if (pGBY == null) {
        return false;
      }
      ReduceSinkOperator pRS =
          CorrelationUtilities.getSingleParent(pGBY, ReduceSinkOperator.class);
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        CorrelationUtilities.removeReduceSinkForGroupBy(
            cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }
  }

  static class JoinReducerProc extends AbsctractReducerReducerProc {

    // pRS-pJOIN-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      JoinOperator pJoin = CorrelationUtilities.findPossibleParent(
          cRS, JoinOperator.class, dedupCtx.trustScript());
      if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
        pJoin.getConf().setFixedAsSorted(true);
        CorrelationUtilities.replaceReduceSinkWithSelectOperator(
            cRS, dedupCtx.getPctx(), dedupCtx);
        ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
            pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
        if (pRS != null) {
          pRS.getConf().setDeduplicated(true);
        }
        return true;
      }
      return false;
    }

    // pRS-pJOIN-cRS-cGBY
    @Override
    public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      JoinOperator pJoin = CorrelationUtilities.findPossibleParent(
          start, JoinOperator.class, dedupCtx.trustScript());
      if (pJoin != null && merge(cRS, pJoin, dedupCtx.minReducer())) {
        pJoin.getConf().setFixedAsSorted(true);
        CorrelationUtilities.removeReduceSinkForGroupBy(
            cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
            pJoin, ReduceSinkOperator.class, dedupCtx.trustScript());
        if (pRS != null) {
          pRS.getConf().setDeduplicated(true);
        }
        return true;
      }
      return false;
    }
  }
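  // ReducerReducerProc tries the two strategies in order: first the aggressive
  // pRS-SEL*-cRS collapse (which keeps cRS and drops pRS), then the normal
  // merge (which keeps pRS and replaces cRS with a SELECT).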
  static class ReducerReducerProc extends AbsctractReducerReducerProc {

    // pRS-cRS
    @Override
    public Object process(ReduceSinkOperator cRS, ReduceSinkDeduplicateProcCtx dedupCtx)
        throws SemanticException {
      ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
          cRS, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null) {
        // Try extended deduplication
        if (aggressiveDedup(cRS, pRS, dedupCtx)) {
          return true;
        }
        // Normal deduplication
        if (merge(cRS, pRS, dedupCtx.minReducer())) {
          CorrelationUtilities.replaceReduceSinkWithSelectOperator(
              cRS, dedupCtx.getPctx(), dedupCtx);
          pRS.getConf().setDeduplicated(true);
          return true;
        }
      }
      return false;
    }

    // pRS-cRS-cGBY
    @Override
    public Object process(ReduceSinkOperator cRS, GroupByOperator cGBY,
        ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
      Operator<?> start = CorrelationUtilities.getStartForGroupBy(cRS, dedupCtx);
      ReduceSinkOperator pRS = CorrelationUtilities.findPossibleParent(
          start, ReduceSinkOperator.class, dedupCtx.trustScript());
      if (pRS != null && merge(cRS, pRS, dedupCtx.minReducer())) {
        if (dedupCtx.getPctx().getConf().getBoolVar(HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
          return false;
        }
        CorrelationUtilities.removeReduceSinkForGroupBy(cRS, cGBY, dedupCtx.getPctx(), dedupCtx);
        pRS.getConf().setDeduplicated(true);
        return true;
      }
      return false;
    }
  }
}