/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.execution.scheduler;

import com.facebook.presto.connector.ConnectorId;
import com.facebook.presto.execution.NodeTaskMap;
import com.facebook.presto.execution.RemoteTask;
import com.facebook.presto.metadata.InternalNodeManager;
import com.facebook.presto.metadata.Split;
import com.facebook.presto.spi.HostAddress;
import com.facebook.presto.spi.Node;
import com.facebook.presto.sql.planner.NodePartitionMap;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Multimap;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.MoreExecutors;
import io.airlift.stats.CounterStat;

import javax.annotation.PreDestroy;
import javax.inject.Inject;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.facebook.presto.execution.scheduler.NodeSchedulerConfig.NetworkTopologyType;
import static com.facebook.presto.spi.NodeState.ACTIVE;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static com.google.common.util.concurrent.Futures.immediateFuture;
import static io.airlift.concurrent.MoreFutures.whenAnyComplete;
import static java.util.Objects.requireNonNull;

public class NodeScheduler
{
    private final NetworkLocationCache networkLocationCache;
    private final List<CounterStat> topologicalSplitCounters;
    private final List<String> networkLocationSegmentNames;
    private final InternalNodeManager nodeManager;
    private final int minCandidates;
    private final boolean includeCoordinator;
    private final int maxSplitsPerNode;
    private final int maxPendingSplitsPerTask;
    private final NodeTaskMap nodeTaskMap;
    private final boolean useNetworkTopology;

    @Inject
    public NodeScheduler(NetworkTopology networkTopology, InternalNodeManager nodeManager, NodeSchedulerConfig config, NodeTaskMap nodeTaskMap)
    {
        this(new NetworkLocationCache(networkTopology), networkTopology, nodeManager, config, nodeTaskMap);
    }

    public NodeScheduler(
            NetworkLocationCache networkLocationCache,
            NetworkTopology networkTopology,
            InternalNodeManager nodeManager,
            NodeSchedulerConfig config,
            NodeTaskMap nodeTaskMap)
    {
        this.networkLocationCache = networkLocationCache;
        this.nodeManager = nodeManager;
        this.minCandidates = config.getMinCandidates();
        this.includeCoordinator = config.isIncludeCoordinator();
        this.maxSplitsPerNode = config.getMaxSplitsPerNode();
        this.maxPendingSplitsPerTask = config.getMaxPendingSplitsPerTask();
        this.nodeTaskMap = requireNonNull(nodeTaskMap, "nodeTaskMap is null");
        checkArgument(maxSplitsPerNode > maxPendingSplitsPerTask, "maxSplitsPerNode must be > maxPendingSplitsPerTask");
        this.useNetworkTopology = !config.getNetworkTopology().equals(NetworkTopologyType.LEGACY);

        ImmutableList.Builder<CounterStat> builder = ImmutableList.builder();
        if (useNetworkTopology) {
            networkLocationSegmentNames = ImmutableList.copyOf(networkTopology.getLocationSegmentNames());
            for (int i = 0; i < networkLocationSegmentNames.size() + 1; i++) {
                builder.add(new CounterStat());
            }
        }
        else {
            networkLocationSegmentNames = ImmutableList.of();
        }
        topologicalSplitCounters = builder.build();
    }

    @PreDestroy
    public void stop()
    {
        networkLocationCache.stop();
    }

    public Map<String, CounterStat> getTopologicalSplitCounters()
    {
        ImmutableMap.Builder<String, CounterStat> counters = ImmutableMap.builder();
        for (int i = 0; i < topologicalSplitCounters.size(); i++) {
            counters.put(i == 0 ? "all" : networkLocationSegmentNames.get(i - 1), topologicalSplitCounters.get(i));
        }
        return counters.build();
    }

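    /**
     * Creates a {@link NodeSelector} for scheduling splits on the active nodes of the given connector,
     * or on all active nodes when {@code connectorId} is null. The selector reads from a memoized
     * {@link NodeMap} snapshot of the cluster that is refreshed at most every 5 seconds. A
     * {@link TopologyAwareNodeSelector} is returned when a non-legacy network topology is configured,
     * otherwise a {@link SimpleNodeSelector} is used.
     */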
    public NodeSelector createNodeSelector(ConnectorId connectorId)
    {
        // this supplier is thread-safe. TODO: this logic should probably move to the scheduler since the choice of which node to run on
        // should be made as close as possible to when the split is about to be scheduled
        Supplier<NodeMap> nodeMap = Suppliers.memoizeWithExpiration(() -> {
            ImmutableSetMultimap.Builder<HostAddress, Node> byHostAndPort = ImmutableSetMultimap.builder();
            ImmutableSetMultimap.Builder<InetAddress, Node> byHost = ImmutableSetMultimap.builder();
            ImmutableSetMultimap.Builder<NetworkLocation, Node> workersByNetworkPath = ImmutableSetMultimap.builder();

            Set<Node> nodes;
            if (connectorId != null) {
                nodes = nodeManager.getActiveConnectorNodes(connectorId);
            }
            else {
                nodes = nodeManager.getNodes(ACTIVE);
            }

            Set<String> coordinatorNodeIds = nodeManager.getCoordinators().stream()
                    .map(Node::getNodeIdentifier)
                    .collect(toImmutableSet());

            for (Node node : nodes) {
                if (useNetworkTopology && (includeCoordinator || !coordinatorNodeIds.contains(node.getNodeIdentifier()))) {
                    NetworkLocation location = networkLocationCache.get(node.getHostAndPort());
                    for (int i = 0; i <= location.getSegments().size(); i++) {
                        workersByNetworkPath.put(location.subLocation(0, i), node);
                    }
                }
                try {
                    byHostAndPort.put(node.getHostAndPort(), node);

                    InetAddress host = InetAddress.getByName(node.getHttpUri().getHost());
                    byHost.put(host, node);
                }
                catch (UnknownHostException e) {
                    // ignore
                }
            }

            return new NodeMap(byHostAndPort.build(), byHost.build(), workersByNetworkPath.build(), coordinatorNodeIds);
        }, 5, TimeUnit.SECONDS);

        if (useNetworkTopology) {
            return new TopologyAwareNodeSelector(
                    nodeManager,
                    nodeTaskMap,
                    includeCoordinator,
                    nodeMap,
                    minCandidates,
                    maxSplitsPerNode,
                    maxPendingSplitsPerTask,
                    topologicalSplitCounters,
                    networkLocationSegmentNames,
                    networkLocationCache);
        }
        else {
            return new SimpleNodeSelector(nodeManager, nodeTaskMap, includeCoordinator, nodeMap, minCandidates, maxSplitsPerNode, maxPendingSplitsPerTask);
        }
    }

    public static List<Node> selectNodes(int limit, Iterator<Node> candidates)
    {
        checkArgument(limit > 0, "limit must be at least 1");

        List<Node> selected = new ArrayList<>(limit);
        while (selected.size() < limit && candidates.hasNext()) {
            selected.add(candidates.next());
        }

        return selected;
    }

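    /**
     * Returns an iterator over the nodes of the given {@link NodeMap} in random order, optionally
     * excluding coordinators. Illustrative usage only (not taken from a caller of this class):
     * <pre>{@code
     * // pick up to 10 random worker nodes, skipping coordinators
     * List<Node> workers = selectNodes(10, randomizedNodes(nodeMap, false));
     * }</pre>
     */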
    public static ResettableRandomizedIterator<Node> randomizedNodes(NodeMap nodeMap, boolean includeCoordinator)
    {
        ImmutableList<Node> nodes = nodeMap.getNodesByHostAndPort().values().stream()
                .filter(node -> includeCoordinator || !nodeMap.getCoordinatorNodeIds().contains(node.getNodeIdentifier()))
                .collect(toImmutableList());
        return new ResettableRandomizedIterator<>(nodes);
    }

    public static List<Node> selectExactNodes(NodeMap nodeMap, List<HostAddress> hosts, boolean includeCoordinator)
    {
        Set<Node> chosen = new LinkedHashSet<>();
        Set<String> coordinatorIds = nodeMap.getCoordinatorNodeIds();

        for (HostAddress host : hosts) {
            nodeMap.getNodesByHostAndPort().get(host).stream()
                    .filter(node -> includeCoordinator || !coordinatorIds.contains(node.getNodeIdentifier()))
                    .forEach(chosen::add);

            InetAddress address;
            try {
                address = host.toInetAddress();
            }
            catch (UnknownHostException e) {
                // skip hosts that don't resolve
                continue;
            }

            // consider a split with a host but no port as being accessible by all nodes on that host
            if (!host.hasPort()) {
                nodeMap.getNodesByHost().get(address).stream()
                        .filter(node -> includeCoordinator || !coordinatorIds.contains(node.getNodeIdentifier()))
                        .forEach(chosen::add);
            }
        }

        // if no nodes were chosen and coordinators are excluded, fall back to the coordinator(s) on the requested hosts
        if (chosen.isEmpty() && !includeCoordinator) {
            for (HostAddress host : hosts) {
                // The stream below could also filter on coordinatorIds.contains(node.getNodeIdentifier()),
                // but the filter would be redundant: chosen is empty at this point, so every node matching
                // these hosts is already a coordinator.
                nodeMap.getNodesByHostAndPort().get(host).stream()
                        .forEach(chosen::add);

                InetAddress address;
                try {
                    address = host.toInetAddress();
                }
                catch (UnknownHostException e) {
                    // skip hosts that don't resolve
                    continue;
                }

                // consider a split with a host but no port as being accessible by all nodes on that host
                if (!host.hasPort()) {
                    nodeMap.getNodesByHost().get(address).stream()
                            .forEach(chosen::add);
                }
            }
        }

        return ImmutableList.copyOf(chosen);
    }

    public static SplitPlacementResult selectDistributionNodes(
            NodeMap nodeMap,
            NodeTaskMap nodeTaskMap,
            int maxSplitsPerNode,
            int maxPendingSplitsPerTask,
            Set<Split> splits,
            List<RemoteTask> existingTasks,
            NodePartitionMap partitioning)
    {
        Multimap<Node, Split> assignments = HashMultimap.create();
        NodeAssignmentStats assignmentStats = new NodeAssignmentStats(nodeTaskMap, nodeMap, existingTasks);

        Set<Node> blockedNodes = new HashSet<>();
        for (Split split : splits) {
            // node placement is forced by the partitioning
            Node node = partitioning.getNode(split);

            // if node is full, don't schedule now, which will push back on the scheduling of splits
            if (assignmentStats.getTotalSplitCount(node) < maxSplitsPerNode || assignmentStats.getQueuedSplitCountForStage(node) < maxPendingSplitsPerTask) {
                assignments.put(node, split);
                assignmentStats.addAssignedSplit(node);
            }
            else {
                blockedNodes.add(node);
            }
        }

        ListenableFuture<?> blocked = toWhenHasSplitQueueSpaceFuture(blockedNodes, existingTasks, calculateLowWatermark(maxPendingSplitsPerTask));
        return new SplitPlacementResult(blocked, ImmutableMultimap.copyOf(assignments));
    }

    public static int calculateLowWatermark(int maxPendingSplitsPerTask)
    {
        return (int) Math.ceil(maxPendingSplitsPerTask / 2.0);
    }

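    /**
     * Returns a future that completes as soon as any task running on one of the blocked nodes
     * reports split queue space at the given {@code spaceThreshold}, allowing scheduling to resume.
     * Completes immediately if there are no blocked nodes or no existing tasks for them.
     */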
    public static ListenableFuture<?> toWhenHasSplitQueueSpaceFuture(Set<Node> blockedNodes, List<RemoteTask> existingTasks, int spaceThreshold)
    {
        if (blockedNodes.isEmpty()) {
            return immediateFuture(null);
        }
        Map<String, RemoteTask> nodeToTaskMap = new HashMap<>();
        for (RemoteTask task : existingTasks) {
            nodeToTaskMap.put(task.getNodeId(), task);
        }
        List<ListenableFuture<?>> blockedFutures = blockedNodes.stream()
                .map(Node::getNodeIdentifier)
                .map(nodeToTaskMap::get)
                .filter(Objects::nonNull)
                .map(remoteTask -> remoteTask.whenSplitQueueHasSpace(spaceThreshold))
                .collect(toImmutableList());
        if (blockedFutures.isEmpty()) {
            return immediateFuture(null);
        }
        return getFirstCompleteAndCancelOthers(blockedFutures);
    }

    public static ListenableFuture<?> toWhenHasSplitQueueSpaceFuture(List<RemoteTask> existingTasks, int spaceThreshold)
    {
        if (existingTasks.isEmpty()) {
            return immediateFuture(null);
        }
        List<ListenableFuture<?>> stateChangeFutures = existingTasks.stream()
                .map(remoteTask -> remoteTask.whenSplitQueueHasSpace(spaceThreshold))
                .collect(toImmutableList());
        return getFirstCompleteAndCancelOthers(stateChangeFutures);
    }

    private static ListenableFuture<?> getFirstCompleteAndCancelOthers(List<ListenableFuture<?>> blockedFutures)
    {
        // wait for the first task to unblock and then cancel all futures to free up resources
        ListenableFuture<?> result = whenAnyComplete(blockedFutures);
        result.addListener(
                () -> {
                    for (ListenableFuture<?> blockedFuture : blockedFutures) {
                        blockedFuture.cancel(true);
                    }
                },
                MoreExecutors.directExecutor());
        return result;
    }
}