/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer.physical; import static org.apache.hadoop.hive.ql.optimizer.physical.LlapDecider.LlapMode.all; import static org.apache.hadoop.hive.ql.optimizer.physical.LlapDecider.LlapMode.auto; import static org.apache.hadoop.hive.ql.optimizer.physical.LlapDecider.LlapMode.only; import static org.apache.hadoop.hive.ql.optimizer.physical.LlapDecider.LlapMode.map; import static org.apache.hadoop.hive.ql.optimizer.physical.LlapDecider.LlapMode.none; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.Deque; import java.util.EnumSet; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Stack; import com.google.common.base.Preconditions; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.exec.FilterOperator; import org.apache.hadoop.hive.ql.exec.FunctionInfo; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.ScriptOperator; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.tez.TezTask; import org.apache.hadoop.hive.ql.io.HiveInputFormat; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.GraphWalker; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.AggregationDesc; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.TezWork; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * LlapDecider takes care of tagging certain vertices in the execution graph as * "llap", which in turn causes them to be submitted to an llap daemon instead * of a regular yarn container. * * The actual algorithm used is driven by LLAP_EXECUTION_MODE. "all", "none" and * "map" mechanically tag those elements. "auto" tries to be smarter by looking * for suitable vertices. * * Regardless of the algorithm used, it's always ensured that there's not user * code that will be sent to the daemon (ie.: script operators, temporary * functions, etc) */ public class LlapDecider implements PhysicalPlanResolver { protected static transient final Logger LOG = LoggerFactory.getLogger(LlapDecider.class); private HiveConf conf; public enum LlapMode { map, // map operators only all, // all operators. Launch containers if user code etc prevents running inside llap. none, // no operators only, // Try running everything in llap, fail if that is not possible (non blessed user code, script, etc) auto // please hive, choose for me } private LlapMode mode; private final LlapClusterStateForCompile clusterState; public LlapDecider(LlapClusterStateForCompile clusterState) { this.clusterState = clusterState; } class LlapDecisionDispatcher implements Dispatcher { private final HiveConf conf; private final boolean doSkipUdfCheck; private final boolean arePermanentFnsAllowed; private final boolean shouldUber; private final float minReducersPerExec; private final int executorsPerNode; private List<MapJoinOperator> mapJoinOpList; private final Map<Rule, NodeProcessor> rules; public LlapDecisionDispatcher(PhysicalContext pctx, LlapMode mode) { conf = pctx.getConf(); doSkipUdfCheck = HiveConf.getBoolVar(conf, ConfVars.LLAP_SKIP_COMPILE_UDF_CHECK); arePermanentFnsAllowed = HiveConf.getBoolVar(conf, ConfVars.LLAP_ALLOW_PERMANENT_FNS); // Don't user uber in "all" mode - everything can go into LLAP, which is better than uber. shouldUber = HiveConf.getBoolVar(conf, ConfVars.LLAP_AUTO_ALLOW_UBER) && (mode != all); minReducersPerExec = HiveConf.getFloatVar( conf, ConfVars.TEZ_LLAP_MIN_REDUCER_PER_EXECUTOR); executorsPerNode = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_NUM_EXECUTORS); // TODO# hmm mapJoinOpList = new ArrayList<MapJoinOperator>(); rules = getRules(); } @Override public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException { @SuppressWarnings("unchecked") Task<? extends Serializable> currTask = (Task<? extends Serializable>) nd; if (currTask instanceof TezTask) { TezWork work = ((TezTask) currTask).getWork(); for (BaseWork w: work.getAllWork()) { handleWork(work, w); } } return null; } private void handleWork(TezWork tezWork, BaseWork work) throws SemanticException { boolean workCanBeDoneInLlap = evaluateWork(tezWork, work); LOG.debug( "Work " + work + " " + (workCanBeDoneInLlap ? "can" : "cannot") + " be done in LLAP"); if (workCanBeDoneInLlap) { for (MapJoinOperator graceMapJoinOp : mapJoinOpList) { LOG.debug("Disabling hybrid grace hash join in case of LLAP " + "and non-dynamic partition hash join."); graceMapJoinOp.getConf().setHybridHashJoin(false); } adjustAutoParallelism(work); convertWork(tezWork, work); } mapJoinOpList.clear(); } private void adjustAutoParallelism(BaseWork work) { if (minReducersPerExec <= 0 || !(work instanceof ReduceWork)) return; ReduceWork reduceWork = (ReduceWork)work; if (reduceWork.isAutoReduceParallelism() == false && reduceWork.isUniformDistribution() == false) { return; // Not based on ARP and cannot assume uniform distribution, bail. } clusterState.initClusterInfo(); int targetCount = 0; if (!clusterState.hasClusterInfo()) { LOG.warn("Cannot determine LLAP cluster information"); targetCount = (int)Math.ceil(minReducersPerExec * 1 * executorsPerNode); } else { targetCount = (int)Math.ceil(minReducersPerExec * (clusterState.getKnownExecutorCount() + clusterState.getNodeCountWithUnknownExecutors() * executorsPerNode)); } // We only increase the targets here. if (reduceWork.isAutoReduceParallelism()) { int newMin = Math.max(reduceWork.getMinReduceTasks(), targetCount); if (newMin < reduceWork.getMaxReduceTasks()) { reduceWork.setMinReduceTasks(newMin); reduceWork.getEdgePropRef().setAutoReduce(conf, true, newMin, reduceWork.getMaxReduceTasks(), conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER)); } else { reduceWork.setAutoReduceParallelism(false); reduceWork.setNumReduceTasks(newMin); // TODO: is this correct? based on the same logic as HIVE-14200 reduceWork.getEdgePropRef().setAutoReduce(null, false, 0, 0, 0); } } else { // UNIFORM || AUTOPARALLEL (maxed out) reduceWork.setNumReduceTasks(Math.max(reduceWork.getNumReduceTasks(), targetCount)); } } private void convertWork(TezWork tezWork, BaseWork work) throws SemanticException { if (shouldUber) { // let's see if we can go one step further and just uber this puppy if (tezWork.getChildren(work).isEmpty() && work instanceof ReduceWork && ((ReduceWork) work).getNumReduceTasks() == 1) { LOG.info("Converting work to uber: {}", work); work.setUberMode(true); } } // always mark as llap work.setLlapMode(true); } private boolean evaluateWork(TezWork tezWork, BaseWork work) throws SemanticException { LOG.info("Evaluating work item: " + work.getName()); // no means no if (mode == none) { return false; } // first we check if we *can* run in llap. If we need to use // user code to do so (script/udf) we don't. /*if (work instanceof MapWork && ((MapWork)work).isUseOneNullRowInputFormat()) { // LLAP doesn't support file-based splits that this forces. return false; }*/ if (!evaluateOperators(work)) { LOG.info("some operators cannot be run in llap"); if (mode == only) { throw new RuntimeException("Cannot run all parts of query in llap. Failing since " + ConfVars.LLAP_EXECUTION_MODE.varname + " is set to " + only.name()); } return false; } // --- From here on out we choose whether we *want* to run in llap // if mode is all just run it if (EnumSet.of(all, only).contains(mode)) { LOG.info("LLAP mode set to '" + mode + "' so can convert any work."); return true; } // if map mode run iff work is map work if (mode == map) { return (work instanceof MapWork); } // --- From here we evaluate the auto mode assert mode == auto : "Mode must be " + auto.name() + " at this point"; // if parents aren't in llap neither should the child if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.LLAP_AUTO_ENFORCE_TREE) && !checkParentsInLlap(tezWork, work)) { LOG.info("Parent not in llap."); return false; } // only vectorized orc input is cached. so there's a reason to // limit to that for now. if (work instanceof MapWork && HiveConf.getBoolVar(conf, HiveConf.ConfVars.LLAP_AUTO_ENFORCE_VECTORIZED) && !checkInputsVectorized((MapWork) work)) { LOG.info("Inputs not vectorized."); return false; } // check if there's at least some degree of stats available if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.LLAP_AUTO_ENFORCE_STATS) && !checkPartialStatsAvailable(work)) { LOG.info("No column stats available."); return false; } // now let's take a look at input sizes long maxInput = HiveConf.getLongVar(conf, HiveConf.ConfVars.LLAP_AUTO_MAX_INPUT); long expectedInput = computeInputSize(work); if (maxInput >= 0 && (expectedInput > maxInput)) { LOG.info(String.format("Inputs too big (%d > %d)", expectedInput, maxInput)); return false; } // and finally let's check output sizes long maxOutput = HiveConf.getLongVar(conf, HiveConf.ConfVars.LLAP_AUTO_MAX_OUTPUT); long expectedOutput = computeOutputSize(work); if (maxOutput >= 0 && (expectedOutput > maxOutput)) { LOG.info(String.format("Outputs too big (%d > %d)", expectedOutput, maxOutput)); return false; } // couldn't convince you otherwise? well then let's llap. LOG.info("Can run work " + work.getName() + " in llap mode."); return true; } private boolean checkExpression(ExprNodeDesc expr) { Deque<ExprNodeDesc> exprs = new LinkedList<ExprNodeDesc>(); exprs.add(expr); while (!exprs.isEmpty()) { if (LOG.isDebugEnabled()) { LOG.debug(String.format("Checking '%s'",expr.getExprString())); } ExprNodeDesc cur = exprs.removeFirst(); if (cur == null) continue; if (cur.getChildren() != null) { exprs.addAll(cur.getChildren()); } if (!doSkipUdfCheck && cur instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc)cur; boolean isBuiltIn = FunctionRegistry.isBuiltInFuncExpr(funcDesc); if (!isBuiltIn) { if (!arePermanentFnsAllowed) { LOG.info("Not a built-in function: " + cur.getExprString() + " (permanent functions are disabled)"); return false; } if (!FunctionRegistry.isPermanentFunction(funcDesc)) { LOG.info("Not a built-in or permanent function: " + cur.getExprString()); return false; } } } } return true; } private boolean checkAggregator(AggregationDesc agg) throws SemanticException { if (LOG.isDebugEnabled()) { LOG.debug(String.format("Checking '%s'", agg.getExprString())); } boolean result = checkExpressions(agg.getParameters()); FunctionInfo fi = FunctionRegistry.getFunctionInfo(agg.getGenericUDAFName()); result = result && (fi != null) && fi.isNative(); if (!result) { LOG.info("Aggregator is not native: " + agg.getExprString()); } return result; } private boolean checkExpressions(Collection<ExprNodeDesc> exprs) { for (ExprNodeDesc expr : exprs) { if (!checkExpression(expr)) return false; } return true; } private boolean checkAggregators(Collection<AggregationDesc> aggs) { try { for (AggregationDesc agg: aggs) { if (!checkAggregator(agg)) return false; } } catch (SemanticException e) { LOG.warn("Exception testing aggregators.",e); return false; } return true; } private Map<Rule, NodeProcessor> getRules() { Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); opRules.put(new RuleRegExp("No scripts", ScriptOperator.getOperatorName() + "%"), new NodeProcessor() { @Override public Object process(Node n, Stack<Node> s, NodeProcessorCtx c, Object... os) { LOG.debug("Cannot run operator [" + n + "] in llap mode."); return new Boolean(false); } }); opRules.put(new RuleRegExp("No user code in fil", FilterOperator.getOperatorName() + "%"), new NodeProcessor() { @Override public Object process(Node n, Stack<Node> s, NodeProcessorCtx c, Object... os) { ExprNodeDesc expr = ((FilterOperator)n).getConf().getPredicate(); Boolean retval = new Boolean(checkExpression(expr)); if (!retval) { LOG.info("Cannot run filter operator [" + n + "] in llap mode"); } return new Boolean(retval); } }); opRules.put(new RuleRegExp("No user code in gby", GroupByOperator.getOperatorName() + "%"), new NodeProcessor() { @Override public Object process(Node n, Stack<Node> s, NodeProcessorCtx c, Object... os) { @SuppressWarnings("unchecked") List<AggregationDesc> aggs = ((Operator<GroupByDesc>) n).getConf().getAggregators(); Boolean retval = new Boolean(checkAggregators(aggs)); if (!retval) { LOG.info("Cannot run group by operator [" + n + "] in llap mode"); } return new Boolean(retval); } }); opRules.put(new RuleRegExp("No user code in select", SelectOperator.getOperatorName() + "%"), new NodeProcessor() { @Override public Object process(Node n, Stack<Node> s, NodeProcessorCtx c, Object... os) { @SuppressWarnings({ "unchecked" }) List<ExprNodeDesc> exprs = ((Operator<SelectDesc>) n).getConf().getColList(); Boolean retval = new Boolean(checkExpressions(exprs)); if (!retval) { LOG.info("Cannot run select operator [" + n + "] in llap mode"); } return new Boolean(retval); } }); if (!conf.getBoolVar(HiveConf.ConfVars.LLAP_ENABLE_GRACE_JOIN_IN_LLAP)) { opRules.put( new RuleRegExp("Disable grace hash join if LLAP mode and not dynamic partition hash join", MapJoinOperator.getOperatorName() + "%"), new NodeProcessor() { @Override public Object process(Node n, Stack<Node> s, NodeProcessorCtx c, Object... os) { MapJoinOperator mapJoinOp = (MapJoinOperator) n; if (mapJoinOp.getConf().isHybridHashJoin() && !(mapJoinOp.getConf().isDynamicPartitionHashJoin())) { mapJoinOpList.add((MapJoinOperator) n); } return new Boolean(true); } }); } return opRules; } private boolean evaluateOperators(BaseWork work) throws SemanticException { // lets take a look at the operators. we're checking for user // code in those. we will not run that in llap. Dispatcher disp = new DefaultRuleDispatcher(null, rules, null); GraphWalker ogw = new DefaultGraphWalker(disp); ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(work.getAllRootOperators()); HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>(); ogw.startWalking(topNodes, nodeOutput); for (Node n : nodeOutput.keySet()) { if (nodeOutput.get(n) != null) { if (!((Boolean)nodeOutput.get(n))) { return false; } } } return true; } private boolean checkParentsInLlap(TezWork tezWork, BaseWork base) { for (BaseWork w: tezWork.getParents(base)) { if (!w.getLlapMode()) { LOG.info("Not all parents are run in llap"); return false; } } return true; } private boolean checkInputsVectorized(MapWork mapWork) { boolean mayWrap = HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED); for (PartitionDesc pd : mapWork.getPathToPartitionInfo().values()) { if (Utilities.isInputFileFormatVectorized(pd) || (mayWrap && HiveInputFormat.canWrapForLlap(pd.getInputFileFormatClass(), true))) { continue; } LOG.info("Input format: " + pd.getInputFileFormatClassName() + ", doesn't provide vectorized input"); return false; } return true; } private boolean checkPartialStatsAvailable(BaseWork base) { for (Operator<?> o: base.getAllRootOperators()) { if (o.getStatistics().getColumnStatsState() == Statistics.State.NONE) { return false; } } return true; } private long computeEdgeSize(BaseWork base, boolean input) { long size = 0; for (Operator<?> o: (input ? base.getAllRootOperators() : base.getAllLeafOperators())) { if (o.getStatistics() == null) { // return worst case if unknown return Long.MAX_VALUE; } long currSize = o.getStatistics().getDataSize(); if ((currSize < 0) || ((Long.MAX_VALUE - size) < currSize)) { // overflow return Long.MAX_VALUE; } size += currSize; } return size; } private long computeInputSize(BaseWork base) { return computeEdgeSize(base, true); } private long computeOutputSize(BaseWork base) { return computeEdgeSize(base, false); } } @Override public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { this.conf = pctx.getConf(); this.mode = LlapMode.valueOf(HiveConf.getVar(conf, HiveConf.ConfVars.LLAP_EXECUTION_MODE)); Preconditions.checkState(this.mode != null, "Unrecognized LLAP mode configuration: " + HiveConf.getVar(conf, HiveConf.ConfVars.LLAP_EXECUTION_MODE)); LOG.info("llap mode: " + this.mode); if (mode == none) { LOG.info("LLAP disabled."); return pctx; } // create dispatcher and graph walker Dispatcher disp = new LlapDecisionDispatcher(pctx, mode); TaskGraphWalker ogw = new TaskGraphWalker(disp); // get all the tasks nodes from root task ArrayList<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(pctx.getRootTasks()); // begin to walk through the task tree. ogw.startWalking(topNodes, null); return pctx; } }