/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.planner.fragment;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.util.DrillStringUtils;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.ops.QueryContext;
import org.apache.drill.exec.physical.MinorFragmentEndpoint;
import org.apache.drill.exec.physical.PhysicalOperatorSetupException;
import org.apache.drill.exec.physical.base.AbstractPhysicalVisitor;
import org.apache.drill.exec.physical.base.Exchange.ParallelizationDependency;
import org.apache.drill.exec.physical.base.FragmentRoot;
import org.apache.drill.exec.physical.base.PhysicalOperator;
import org.apache.drill.exec.physical.base.Receiver;
import org.apache.drill.exec.planner.PhysicalPlanReader;
import org.apache.drill.exec.planner.fragment.Fragment.ExchangeFragmentPair;
import org.apache.drill.exec.planner.fragment.Materializer.IndexedFragmentNode;
import org.apache.drill.exec.proto.BitControl.Collector;
import org.apache.drill.exec.proto.BitControl.PlanFragment;
import org.apache.drill.exec.proto.BitControl.QueryContextInformation;
import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
import org.apache.drill.exec.proto.ExecProtos.FragmentHandle;
import org.apache.drill.exec.proto.UserBitShared.QueryId;
import org.apache.drill.exec.rpc.user.UserSession;
import org.apache.drill.exec.server.options.OptionList;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.work.QueryWorkUnit;
import org.apache.drill.exec.work.foreman.ForemanSetupException;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
/**
* The simple parallelizer determines the level of parallelization of a plan based on the cost of the underlying
* operations. It doesn't take into account system load or other factors. Based on the cost of the query, the
* parallelization for each major fragment will be determined. Once the amount of parallelization is done, assignment
* is done based on round robin assignment ordered by operator affinity (locality) to available execution Drillbits.
*/
public class SimpleParallelizer implements ParallelizationParameters {
static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SimpleParallelizer.class);
private final long parallelizationThreshold;
private final int maxWidthPerNode;
private final int maxGlobalWidth;
private final double affinityFactor;
public SimpleParallelizer(QueryContext context) {
OptionManager optionManager = context.getOptions();
long sliceTarget = optionManager.getOption(ExecConstants.SLICE_TARGET).num_val;
this.parallelizationThreshold = sliceTarget > 0 ? sliceTarget : 1;
this.maxWidthPerNode = optionManager.getOption(ExecConstants.MAX_WIDTH_PER_NODE_KEY).num_val.intValue();
this.maxGlobalWidth = optionManager.getOption(ExecConstants.MAX_WIDTH_GLOBAL_KEY).num_val.intValue();
this.affinityFactor = optionManager.getOption(ExecConstants.AFFINITY_FACTOR_KEY).float_val.intValue();
}
public SimpleParallelizer(long parallelizationThreshold, int maxWidthPerNode, int maxGlobalWidth, double affinityFactor) {
this.parallelizationThreshold = parallelizationThreshold;
this.maxWidthPerNode = maxWidthPerNode;
this.maxGlobalWidth = maxGlobalWidth;
this.affinityFactor = affinityFactor;
}
@Override
public long getSliceTarget() {
return parallelizationThreshold;
}
@Override
public int getMaxWidthPerNode() {
return maxWidthPerNode;
}
@Override
public int getMaxGlobalWidth() {
return maxGlobalWidth;
}
@Override
public double getAffinityFactor() {
return affinityFactor;
}
/**
* Generate a set of assigned fragments based on the provided fragment tree. Do not allow parallelization stages
* to go beyond the global max width.
*
* @param options Option list
* @param foremanNode The driving/foreman node for this query. (this node)
* @param queryId The queryId for this query.
* @param activeEndpoints The list of endpoints to consider for inclusion in planning this query.
* @param reader Tool used to read JSON plans
* @param rootFragment The root node of the PhysicalPlan that we will be parallelizing.
* @param session UserSession of user who launched this query.
* @param queryContextInfo Info related to the context when query has started.
* @return The list of generated PlanFragment protobuf objects to be assigned out to the individual nodes.
* @throws ExecutionSetupException
*/
public QueryWorkUnit getFragments(OptionList options, DrillbitEndpoint foremanNode, QueryId queryId,
Collection<DrillbitEndpoint> activeEndpoints, PhysicalPlanReader reader, Fragment rootFragment,
UserSession session, QueryContextInformation queryContextInfo) throws ExecutionSetupException {
final PlanningSet planningSet = getFragmentsHelper(activeEndpoints, rootFragment);
return generateWorkUnit(
options, foremanNode, queryId, reader, rootFragment, planningSet, session, queryContextInfo);
}
/**
* Create multiple physical plans from original query planning, it will allow execute them eventually independently
* @param options
* @param foremanNode
* @param queryId
* @param activeEndpoints
* @param reader
* @param rootFragment
* @param session
* @param queryContextInfo
* @return
* @throws ExecutionSetupException
*/
public List<QueryWorkUnit> getSplitFragments(OptionList options, DrillbitEndpoint foremanNode, QueryId queryId,
Collection<DrillbitEndpoint> activeEndpoints, PhysicalPlanReader reader, Fragment rootFragment,
UserSession session, QueryContextInformation queryContextInfo) throws ExecutionSetupException {
// no op
throw new UnsupportedOperationException("Use children classes");
}
/**
* Helper method to reuse the code for QueryWorkUnit(s) generation
* @param activeEndpoints
* @param rootFragment
* @return
* @throws ExecutionSetupException
*/
protected PlanningSet getFragmentsHelper(Collection<DrillbitEndpoint> activeEndpoints, Fragment rootFragment) throws ExecutionSetupException {
PlanningSet planningSet = new PlanningSet();
initFragmentWrappers(rootFragment, planningSet);
final Set<Wrapper> leafFragments = constructFragmentDependencyGraph(planningSet);
// Start parallelizing from leaf fragments
for (Wrapper wrapper : leafFragments) {
parallelizeFragment(wrapper, planningSet, activeEndpoints);
}
return planningSet;
}
// For every fragment, create a Wrapper in PlanningSet.
@VisibleForTesting
public void initFragmentWrappers(Fragment rootFragment, PlanningSet planningSet) {
planningSet.get(rootFragment);
for(ExchangeFragmentPair fragmentPair : rootFragment) {
initFragmentWrappers(fragmentPair.getNode(), planningSet);
}
}
/**
* Based on the affinity of the Exchange that separates two fragments, setup fragment dependencies.
*
* @param planningSet
* @return Returns a list of leaf fragments in fragment dependency graph.
*/
private static Set<Wrapper> constructFragmentDependencyGraph(PlanningSet planningSet) {
// Set up dependency of fragments based on the affinity of exchange that separates the fragments.
for(Wrapper currentFragmentWrapper : planningSet) {
ExchangeFragmentPair sendingExchange = currentFragmentWrapper.getNode().getSendingExchangePair();
if (sendingExchange != null) {
ParallelizationDependency dependency = sendingExchange.getExchange().getParallelizationDependency();
Wrapper receivingFragmentWrapper = planningSet.get(sendingExchange.getNode());
if (dependency == ParallelizationDependency.RECEIVER_DEPENDS_ON_SENDER) {
receivingFragmentWrapper.addFragmentDependency(currentFragmentWrapper);
} else if (dependency == ParallelizationDependency.SENDER_DEPENDS_ON_RECEIVER) {
currentFragmentWrapper.addFragmentDependency(receivingFragmentWrapper);
}
}
}
// Identify leaf fragments. Leaf fragments are fragments that have no other fragments depending on them for
// parallelization info. First assume all fragments are leaf fragments. Go through the fragments one by one and
// remove the fragment on which the current fragment depends on.
final Set<Wrapper> roots = Sets.newHashSet();
for(Wrapper w : planningSet) {
roots.add(w);
}
for(Wrapper wrapper : planningSet) {
final List<Wrapper> fragmentDependencies = wrapper.getFragmentDependencies();
if (fragmentDependencies != null && fragmentDependencies.size() > 0) {
for(Wrapper dependency : fragmentDependencies) {
if (roots.contains(dependency)) {
roots.remove(dependency);
}
}
}
}
return roots;
}
/**
* Helper method for parallelizing a given fragment. Dependent fragments are parallelized first before
* parallelizing the given fragment.
*/
private void parallelizeFragment(Wrapper fragmentWrapper, PlanningSet planningSet,
Collection<DrillbitEndpoint> activeEndpoints) throws PhysicalOperatorSetupException {
// If the fragment is already parallelized, return.
if (fragmentWrapper.isEndpointsAssignmentDone()) {
return;
}
// First parallelize fragments on which this fragment depends on.
final List<Wrapper> fragmentDependencies = fragmentWrapper.getFragmentDependencies();
if (fragmentDependencies != null && fragmentDependencies.size() > 0) {
for(Wrapper dependency : fragmentDependencies) {
parallelizeFragment(dependency, planningSet, activeEndpoints);
}
}
// Find stats. Stats include various factors including cost of physical operators, parallelizability of
// work in physical operator and affinity of physical operator to certain nodes.
fragmentWrapper.getNode().getRoot().accept(new StatsCollector(planningSet), fragmentWrapper);
fragmentWrapper.getStats().getDistributionAffinity()
.getFragmentParallelizer()
.parallelizeFragment(fragmentWrapper, this, activeEndpoints);
}
protected QueryWorkUnit generateWorkUnit(OptionList options, DrillbitEndpoint foremanNode, QueryId queryId,
PhysicalPlanReader reader, Fragment rootNode, PlanningSet planningSet,
UserSession session, QueryContextInformation queryContextInfo) throws ExecutionSetupException {
List<PlanFragment> fragments = Lists.newArrayList();
PlanFragment rootFragment = null;
FragmentRoot rootOperator = null;
// now we generate all the individual plan fragments and associated assignments. Note, we need all endpoints
// assigned before we can materialize, so we start a new loop here rather than utilizing the previous one.
for (Wrapper wrapper : planningSet) {
Fragment node = wrapper.getNode();
final PhysicalOperator physicalOperatorRoot = node.getRoot();
boolean isRootNode = rootNode == node;
if (isRootNode && wrapper.getWidth() != 1) {
throw new ForemanSetupException(String.format("Failure while trying to setup fragment. " +
"The root fragment must always have parallelization one. In the current case, the width was set to %d.",
wrapper.getWidth()));
}
// a fragment is self driven if it doesn't rely on any other exchanges.
boolean isLeafFragment = node.getReceivingExchangePairs().size() == 0;
// Create a minorFragment for each major fragment.
for (int minorFragmentId = 0; minorFragmentId < wrapper.getWidth(); minorFragmentId++) {
IndexedFragmentNode iNode = new IndexedFragmentNode(minorFragmentId, wrapper);
wrapper.resetAllocation();
PhysicalOperator op = physicalOperatorRoot.accept(Materializer.INSTANCE, iNode);
Preconditions.checkArgument(op instanceof FragmentRoot);
FragmentRoot root = (FragmentRoot) op;
// get plan as JSON
String plan;
String optionsData;
try {
plan = reader.writeJson(root);
optionsData = reader.writeJson(options);
} catch (JsonProcessingException e) {
throw new ForemanSetupException("Failure while trying to convert fragment into json.", e);
}
FragmentHandle handle = FragmentHandle //
.newBuilder() //
.setMajorFragmentId(wrapper.getMajorFragmentId()) //
.setMinorFragmentId(minorFragmentId) //
.setQueryId(queryId) //
.build();
PlanFragment fragment = PlanFragment.newBuilder() //
.setForeman(foremanNode) //
.setFragmentJson(plan) //
.setHandle(handle) //
.setAssignment(wrapper.getAssignedEndpoint(minorFragmentId)) //
.setLeafFragment(isLeafFragment) //
.setContext(queryContextInfo)
.setMemInitial(wrapper.getInitialAllocation())//
.setMemMax(wrapper.getMaxAllocation())
.setOptionsJson(optionsData)
.setCredentials(session.getCredentials())
.addAllCollector(CountRequiredFragments.getCollectors(root))
.build();
if (isRootNode) {
if (logger.isDebugEnabled()) {
logger.debug("Root fragment:\n {}", DrillStringUtils.unescapeJava(fragment.toString()));
}
rootFragment = fragment;
rootOperator = root;
} else {
if (logger.isDebugEnabled()) {
logger.debug("Remote fragment:\n {}", DrillStringUtils.unescapeJava(fragment.toString()));
}
fragments.add(fragment);
}
}
}
return new QueryWorkUnit(rootOperator, rootFragment, fragments);
}
/**
* Designed to setup initial values for arriving fragment accounting.
*/
protected static class CountRequiredFragments extends AbstractPhysicalVisitor<Void, List<Collector>, RuntimeException> {
private static final CountRequiredFragments INSTANCE = new CountRequiredFragments();
public static List<Collector> getCollectors(PhysicalOperator root) {
List<Collector> collectors = Lists.newArrayList();
root.accept(INSTANCE, collectors);
return collectors;
}
@Override
public Void visitReceiver(Receiver receiver, List<Collector> collectors) throws RuntimeException {
List<MinorFragmentEndpoint> endpoints = receiver.getProvidingEndpoints();
List<Integer> list = new ArrayList<>(endpoints.size());
for (MinorFragmentEndpoint ep : endpoints) {
list.add(ep.getId());
}
collectors.add(Collector.newBuilder()
.setIsSpooling(receiver.isSpooling())
.setOppositeMajorFragmentId(receiver.getOppositeMajorFragmentId())
.setSupportsOutOfOrder(receiver.supportsOutOfOrderExchange())
.addAllIncomingMinorFragment(list)
.build());
return null;
}
@Override
public Void visitOp(PhysicalOperator op, List<Collector> collectors) throws RuntimeException {
for (PhysicalOperator o : op) {
o.accept(this, collectors);
}
return null;
}
}
}