/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.llap.tezplugins; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.NavigableMap; import java.util.Random; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.DelayQueue; import java.util.concurrent.Delayed; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import com.google.common.collect.Sets; import com.google.common.util.concurrent.Futures; import org.apache.commons.lang.mutable.MutableInt; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.common.JvmPauseMonitor; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.llap.metrics.LlapMetricsSystem; import org.apache.hadoop.hive.llap.metrics.MetricsUtils; import org.apache.hadoop.hive.llap.registry.ServiceInstance; import org.apache.hadoop.hive.llap.registry.ServiceInstanceSet; import org.apache.hadoop.hive.llap.registry.ServiceInstanceStateChangeListener; import org.apache.hadoop.hive.llap.registry.ServiceRegistry; import org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance; import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService; import org.apache.hadoop.hive.llap.tezplugins.helpers.MonotonicClock; import org.apache.hadoop.hive.llap.tezplugins.scheduler.LoggingFutureCallback; import org.apache.hadoop.hive.llap.tezplugins.metrics.LlapTaskSchedulerMetrics; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeReport; import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; import 
org.apache.hadoop.yarn.util.Clock; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.tez.common.TezUtils; import org.apache.tez.dag.api.TezUncheckedException; import org.apache.tez.serviceplugins.api.DagInfo; import org.apache.tez.serviceplugins.api.ServicePluginErrorDefaults; import org.apache.tez.serviceplugins.api.TaskAttemptEndReason; import org.apache.tez.serviceplugins.api.TaskScheduler; import org.apache.tez.serviceplugins.api.TaskSchedulerContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class LlapTaskSchedulerService extends TaskScheduler { private static final Logger LOG = LoggerFactory.getLogger(LlapTaskSchedulerService.class); private static final TaskStartComparator TASK_INFO_COMPARATOR = new TaskStartComparator(); private final Configuration conf; // interface into the registry service private ServiceInstanceSet activeInstances; // Tracks all instances, including ones which have been disabled in the past. // LinkedHashMap to provide the same iteration order when selecting a random host. @VisibleForTesting final Map<String, NodeInfo> instanceToNodeMap = new LinkedHashMap<>(); // TODO Ideally, remove elements from this once it's known that no tasks are linked to the instance (all deallocated) // Tracks tasks which could not be allocated immediately. @VisibleForTesting // Tasks are tracked in the order requests come in, at different priority levels. // TODO HIVE-13538 For tasks at the same priority level, it may be worth attempting to schedule tasks with // locality information before those without locality information final TreeMap<Priority, List<TaskInfo>> pendingTasks = new TreeMap<>(new Comparator<Priority>() { @Override public int compare(Priority o1, Priority o2) { return o1.getPriority() - o2.getPriority(); } }); // Tracks running and queued tasks. Cleared after a task completes. private final ConcurrentMap<Object, TaskInfo> knownTasks = new ConcurrentHashMap<>(); // Tracks tasks which are running. Useful for selecting a task to preempt based on when it started. private final TreeMap<Integer, TreeSet<TaskInfo>> runningTasks = new TreeMap<>(); // Queue for disabled nodes. Nodes make it out of this queue when their expiration timeout is hit. 
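// Ordering in this queue is driven by NodeInfo implementing Delayed: getDelay() is computed from expireTimeMillis, so the NodeEnablerCallable's poll() only returns a node once its blacklist timeout has elapsed.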
@VisibleForTesting final DelayQueue<NodeInfo> disabledNodesQueue = new DelayQueue<>(); @VisibleForTesting final DelayQueue<TaskInfo> delayedTaskQueue = new DelayQueue<>(); private volatile boolean dagRunning = false; private final ContainerFactory containerFactory; @VisibleForTesting final Clock clock; private final ListeningExecutorService nodeEnabledExecutor; private final NodeEnablerCallable nodeEnablerCallable = new NodeEnablerCallable(); private final ListeningExecutorService delayedTaskSchedulerExecutor; @VisibleForTesting final DelayedTaskSchedulerCallable delayedTaskSchedulerCallable; private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); private final ReentrantReadWriteLock.ReadLock readLock = lock.readLock(); private final ReentrantReadWriteLock.WriteLock writeLock = lock.writeLock(); private final Lock scheduleLock = new ReentrantLock(); private final Condition scheduleCondition = scheduleLock.newCondition(); private final AtomicBoolean pendingScheduleInvocations = new AtomicBoolean(false); private final ListeningExecutorService schedulerExecutor; private final SchedulerCallable schedulerCallable = new SchedulerCallable(); private final AtomicBoolean isStopped = new AtomicBoolean(false); // Tracks total pending preemptions. private final AtomicInteger pendingPreemptions = new AtomicInteger(0); // Tracks pending preemptions per host, using the hostname || Always to be accessed inside a lock private final Map<String, MutableInt> pendingPreemptionsPerHost = new HashMap<>(); private final NodeBlacklistConf nodeBlacklistConf; private final LocalityDelayConf localityDelayConf; private final int numSchedulableTasksPerNode; // when there are no live nodes in the cluster and this timeout elapses the query is failed private final long timeout; private final Lock timeoutLock = new ReentrantLock(); private final ScheduledExecutorService timeoutExecutor; private final ScheduledExecutorService scheduledLoggingExecutor; private final SchedulerTimeoutMonitor timeoutMonitor; private ScheduledFuture<?> timeoutFuture; private final AtomicReference<ScheduledFuture<?>> timeoutFutureRef = new AtomicReference<>(null); private final AtomicInteger assignedTaskCounter = new AtomicInteger(0); private final LlapRegistryService registry = new LlapRegistryService(false); private volatile ListenableFuture<Void> nodeEnablerFuture; private volatile ListenableFuture<Void> delayedTaskSchedulerFuture; private volatile ListenableFuture<Void> schedulerFuture; @VisibleForTesting private final AtomicInteger dagCounter = new AtomicInteger(1); // Statistics to track allocations // All of stats variables are visible for testing. 
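// dagStats is swapped for a fresh instance in dagComplete() under the write lock, and is read under the read lock by the periodic logging task scheduled in start().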
@VisibleForTesting StatsPerDag dagStats = new StatsPerDag(); private final LlapTaskSchedulerMetrics metrics; private final JvmPauseMonitor pauseMonitor; private final Random random = new Random(); public LlapTaskSchedulerService(TaskSchedulerContext taskSchedulerContext) { this(taskSchedulerContext, new MonotonicClock(), true); } @VisibleForTesting public LlapTaskSchedulerService(TaskSchedulerContext taskSchedulerContext, Clock clock, boolean initMetrics) { super(taskSchedulerContext); this.clock = clock; this.delayedTaskSchedulerCallable = createDelayedTaskSchedulerCallable(); try { this.conf = TezUtils.createConfFromUserPayload(taskSchedulerContext.getInitialUserPayload()); } catch (IOException e) { throw new TezUncheckedException( "Failed to parse user payload for " + LlapTaskSchedulerService.class.getSimpleName(), e); } this.containerFactory = new ContainerFactory(taskSchedulerContext.getApplicationAttemptId(), taskSchedulerContext.getCustomClusterIdentifier()); // TODO HIVE-13483 Get all of these properties from the registry. This will need to take care of different instances // publishing potentially different values when we support changing configurations dynamically. // For now, this can simply be fetched from a single registry instance. this.nodeBlacklistConf = new NodeBlacklistConf( HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_REENABLE_MIN_TIMEOUT_MS, TimeUnit.MILLISECONDS), HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_REENABLE_MAX_TIMEOUT_MS, TimeUnit.MILLISECONDS), HiveConf.getFloatVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_DISABLE_BACK_OFF_FACTOR)); this.numSchedulableTasksPerNode = HiveConf.getIntVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NUM_SCHEDULABLE_TASKS_PER_NODE); long localityDelayMs = HiveConf .getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_LOCALITY_DELAY, TimeUnit.MILLISECONDS); this.localityDelayConf = new LocalityDelayConf(localityDelayMs); this.timeoutMonitor = new SchedulerTimeoutMonitor(); this.timeout = HiveConf.getTimeVar(conf, ConfVars.LLAP_DAEMON_TASK_SCHEDULER_TIMEOUT_SECONDS, TimeUnit.MILLISECONDS); this.timeoutExecutor = Executors.newSingleThreadScheduledExecutor( new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapTaskSchedulerTimeoutMonitor") .build()); this.timeoutFuture = null; this.scheduledLoggingExecutor = Executors.newSingleThreadScheduledExecutor( new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapTaskSchedulerTimedLogThread") .build()); String instanceId = HiveConf.getTrimmedVar(conf, ConfVars.LLAP_DAEMON_SERVICE_HOSTS); Preconditions.checkNotNull(instanceId, ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname + " must be defined"); ExecutorService executorServiceRaw = Executors.newSingleThreadExecutor( new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapSchedulerNodeEnabler").build()); nodeEnabledExecutor = MoreExecutors.listeningDecorator(executorServiceRaw); ExecutorService delayedTaskSchedulerExecutorRaw = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapSchedulerDelayedTaskHandler") .build()); delayedTaskSchedulerExecutor = MoreExecutors.listeningDecorator(delayedTaskSchedulerExecutorRaw); ExecutorService schedulerExecutorServiceRaw = Executors.newSingleThreadExecutor( new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapScheduler").build()); schedulerExecutor = MoreExecutors.listeningDecorator(schedulerExecutorServiceRaw); if (initMetrics && !conf.getBoolean(ConfVars.HIVE_IN_TEST.varname, false)) { // Initialize the metrics system 
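// Metrics and the pause monitor are only created when initMetrics is true and HIVE_IN_TEST is not set; otherwise both stay null and callers null-check before updating them.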
LlapMetricsSystem.initialize("LlapTaskScheduler"); this.pauseMonitor = new JvmPauseMonitor(conf); pauseMonitor.start(); String displayName = "LlapTaskSchedulerMetrics-" + MetricsUtils.getHostName(); String sessionId = conf.get("llap.daemon.metrics.sessionid"); // TODO: Not sure about the use of this. Should we instead use workerIdentity as sessionId? this.metrics = LlapTaskSchedulerMetrics.create(displayName, sessionId); } else { this.metrics = null; this.pauseMonitor = null; } String hostsString = HiveConf.getVar(conf, ConfVars.LLAP_DAEMON_SERVICE_HOSTS); LOG.info( "Running with configuration: hosts={}, numSchedulableTasksPerNode={}, nodeBlacklistConf={}, localityConf={}", hostsString, numSchedulableTasksPerNode, nodeBlacklistConf, localityDelayConf); } @Override public void initialize() { registry.init(conf); } @Override public void start() throws IOException { writeLock.lock(); try { scheduledLoggingExecutor.schedule(new Callable<Void>() { @Override public Void call() throws Exception { readLock.lock(); try { if (dagRunning) { LOG.info("Stats for current dag: {}", dagStats); } } finally { readLock.unlock(); } return null; } }, 10000L, TimeUnit.MILLISECONDS); nodeEnablerFuture = nodeEnabledExecutor.submit(nodeEnablerCallable); Futures.addCallback(nodeEnablerFuture, new LoggingFutureCallback("NodeEnablerThread", LOG)); delayedTaskSchedulerFuture = delayedTaskSchedulerExecutor.submit(delayedTaskSchedulerCallable); Futures.addCallback(delayedTaskSchedulerFuture, new LoggingFutureCallback("DelayedTaskSchedulerThread", LOG)); schedulerFuture = schedulerExecutor.submit(schedulerCallable); Futures.addCallback(schedulerFuture, new LoggingFutureCallback("SchedulerThread", LOG)); registry.start(); registry.registerStateChangeListener(new NodeStateChangeListener()); activeInstances = registry.getInstances(); for (ServiceInstance inst : activeInstances.getAll()) { addNode(new NodeInfo(inst, nodeBlacklistConf, clock, numSchedulableTasksPerNode, metrics), inst); } } finally { writeLock.unlock(); } } @VisibleForTesting public void setServiceInstanceSet(ServiceInstanceSet serviceInstanceSet) { this.activeInstances = serviceInstanceSet; } private class NodeStateChangeListener implements ServiceInstanceStateChangeListener { private final Logger LOG = LoggerFactory.getLogger(NodeStateChangeListener.class); @Override public void onCreate(ServiceInstance serviceInstance) { LOG.info("Added node with identity: {} as a result of registry callback", serviceInstance.getWorkerIdentity()); addNode(new NodeInfo(serviceInstance, nodeBlacklistConf, clock, numSchedulableTasksPerNode, metrics), serviceInstance); } @Override public void onUpdate(ServiceInstance serviceInstance) { // TODO In what situations will this be invoked? LOG.warn( "Not expecting updates from the registry. Received update for instance={}. Ignoring", serviceInstance); } @Override public void onRemove(ServiceInstance serviceInstance) { NodeReport nodeReport = constructNodeReport(serviceInstance, false); LOG.info("Sending out nodeReport for onRemove: {}", nodeReport); getContext().nodesUpdated(Collections.singletonList(nodeReport)); instanceToNodeMap.remove(serviceInstance.getWorkerIdentity()); LOG.info("Removed node with identity: {} due to RegistryNotification. currentActiveInstances={}", serviceInstance.getWorkerIdentity(), activeInstances.size()); if (metrics != null) { metrics.setClusterNodeCount(activeInstances.size()); } // if there are no more nodes.
Signal timeout monitor to start timer if (activeInstances.size() == 0) { LOG.info("No node found. Signalling scheduler timeout monitor thread to start timer."); startTimeoutMonitor(); } } } private void startTimeoutMonitor() { timeoutLock.lock(); try { // If timer is null, start a new one. // If timer has completed during previous invocation, start a new one. // If timer already started and is not completed, leaving it running without resetting it. if ((timeoutFuture == null || (timeoutFuture != null && timeoutFuture.isDone())) && activeInstances.size() == 0) { timeoutFuture = timeoutExecutor.schedule(timeoutMonitor, timeout, TimeUnit.MILLISECONDS); timeoutFutureRef.set(timeoutFuture); LOG.info("Scheduled timeout monitor task to run after {} ms", timeout); } else { LOG.info("Timeout monitor task not started. Timeout future state: {}, #instances: {}", timeoutFuture == null ? "null" : timeoutFuture.isDone(), activeInstances.size()); } } finally { timeoutLock.unlock(); } } private void stopTimeoutMonitor() { timeoutLock.lock(); try { if (timeoutFuture != null && activeInstances.size() != 0 && timeoutFuture.cancel(false)) { timeoutFutureRef.set(null); LOG.info("Stopped timeout monitor task"); } else { LOG.info("Timeout monitor task not stopped. Timeout future state: {}, #instances: {}", timeoutFuture == null ? "null" : timeoutFuture.isDone(), activeInstances.size()); } timeoutFuture = null; } finally { timeoutLock.unlock(); } } @Override public void shutdown() { writeLock.lock(); try { if (!this.isStopped.getAndSet(true)) { scheduledLoggingExecutor.shutdownNow(); nodeEnablerCallable.shutdown(); if (nodeEnablerFuture != null) { nodeEnablerFuture.cancel(true); } nodeEnabledExecutor.shutdownNow(); timeoutExecutor.shutdown(); if (timeoutFuture != null) { timeoutFuture.cancel(true); timeoutFuture = null; } timeoutExecutor.shutdownNow(); delayedTaskSchedulerCallable.shutdown(); if (delayedTaskSchedulerFuture != null) { delayedTaskSchedulerFuture.cancel(true); } delayedTaskSchedulerExecutor.shutdownNow(); schedulerCallable.shutdown(); if (schedulerFuture != null) { schedulerFuture.cancel(true); } schedulerExecutor.shutdownNow(); if (registry != null) { registry.stop(); } if (pauseMonitor != null) { pauseMonitor.stop(); } if (metrics != null) { LlapMetricsSystem.shutdown(); } } } finally { writeLock.unlock(); } } @Override public Resource getTotalResources() { int memory = 0; int vcores = 0; readLock.lock(); try { int numInstancesFound = 0; for (ServiceInstance inst : activeInstances.getAll()) { Resource r = inst.getResource(); memory += r.getMemory(); vcores += r.getVirtualCores(); numInstancesFound++; } if (LOG.isDebugEnabled()) { LOG.debug("GetTotalResources: numInstancesFound={}, totalMem={}, totalVcores={}", numInstancesFound, memory, vcores); } } finally { readLock.unlock(); } return Resource.newInstance(memory, vcores); } /** * The difference between this and getTotalResources() is that this only gives currently free * resource instances, while the other lists all the instances that may become available in a * while. 
*/ @Override public Resource getAvailableResources() { // need a state store eventually for current state & measure backoffs int memory = 0; int vcores = 0; readLock.lock(); try { int numInstancesFound = 0; for (ServiceInstance inst : activeInstances.getAll()) { NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity()); if (nodeInfo != null && !nodeInfo.isDisabled()) { Resource r = inst.getResource(); memory += r.getMemory(); vcores += r.getVirtualCores(); numInstancesFound++; } } if (LOG.isDebugEnabled()) { LOG.debug("GetAvailableResources: numInstancesFound={}, totalMem={}, totalVcores={}", numInstancesFound, memory, vcores); } } finally { readLock.unlock(); } return Resource.newInstance(memory, vcores); } @Override public int getClusterNodeCount() { readLock.lock(); try { return activeInstances.getAll().size(); } finally { readLock.unlock(); } } @Override public void dagComplete() { // This is effectively DAG completed, and can be used to reset statistics being tracked. LOG.info("DAG: " + dagCounter.get() + " completed. Scheduling stats: " + dagStats); dagCounter.incrementAndGet(); if (metrics != null) { metrics.incrCompletedDagCount(); } writeLock.lock(); try { dagRunning = false; dagStats = new StatsPerDag(); int pendingCount = 0; for (Entry<Priority, List<TaskInfo>> entry : pendingTasks.entrySet()) { if (entry.getValue() != null) { pendingCount += entry.getValue().size(); } } int runningCount = 0; for (Entry<Integer, TreeSet<TaskInfo>> entry : runningTasks.entrySet()) { if (entry.getValue() != null) { runningCount += entry.getValue().size(); } } LOG.info( "DAG reset. Current knownTaskCount={}, pendingTaskCount={}, runningTaskCount={}", knownTasks.size(), pendingCount, runningCount); } finally { writeLock.unlock(); } // TODO Cleanup pending tasks etc, so that the next dag is not affected. } @Override public void blacklistNode(NodeId nodeId) { LOG.info("BlacklistNode not supported"); // TODO Disable blacklisting in Tez when using LLAP, until this is properly supported. // Blacklisting can cause containers to move to a terminating state, which can cause attempt to be marked as failed. // This becomes problematic when we set #allowedFailures to 0 // TODO HIVE-13484 What happens when we try scheduling a task on a node that Tez at this point thinks is blacklisted. } @Override public void unblacklistNode(NodeId nodeId) { LOG.info("unBlacklistNode not supported"); // TODO: See comments under blacklistNode. } @Override public void allocateTask(Object task, Resource capability, String[] hosts, String[] racks, Priority priority, Object containerSignature, Object clientCookie) { TaskInfo taskInfo = new TaskInfo(localityDelayConf, clock, task, clientCookie, priority, capability, hosts, racks, clock.getTime()); LOG.info("Received allocateRequest. task={}, priority={}, capability={}, hosts={}", task, priority, capability, Arrays.toString(hosts)); writeLock.lock(); try { dagRunning = true; dagStats.registerTaskRequest(hosts, racks); } finally { writeLock.unlock(); } addPendingTask(taskInfo); trySchedulingPendingTasks(); } @Override public void allocateTask(Object task, Resource capability, ContainerId containerId, Priority priority, Object containerSignature, Object clientCookie) { // Container affinity can be implemented as Host affinity for LLAP. Not required until // 1:1 edges are used in Hive. TaskInfo taskInfo = new TaskInfo(localityDelayConf, clock, task, clientCookie, priority, capability, null, null, clock.getTime()); LOG.info("Received allocateRequest. 
task={}, priority={}, capability={}, containerId={}", task, priority, capability, containerId); writeLock.lock(); try { dagRunning = true; dagStats.registerTaskRequest(null, null); } finally { writeLock.unlock(); } addPendingTask(taskInfo); trySchedulingPendingTasks(); } // This may be invoked before a container is ever assigned to a task. allocateTask... app decides // the task is no longer required, and asks for a de-allocation. @Override public boolean deallocateTask(Object task, boolean taskSucceeded, TaskAttemptEndReason endReason, String diagnostics) { if (LOG.isDebugEnabled()) { LOG.debug("Processing deallocateTask for task={}, taskSucceeded={}, endReason={}", task, taskSucceeded, endReason); } writeLock.lock(); // Updating several local structures TaskInfo taskInfo; try { taskInfo = unregisterTask(task); if (taskInfo == null) { LOG.error("Could not determine ContainerId for task: " + task + " . Could have hit a race condition. Ignoring." + " The query may hang since this \"unknown\" container is now taking up a slot permanently"); return false; } if (taskInfo.containerId == null) { if (taskInfo.getState() == TaskInfo.State.ASSIGNED) { LOG.error("Task: " + task + " assigned, but could not find the corresponding containerId." + " The query may hang since this \"unknown\" container is now taking up a slot permanently"); } else { LOG.info("Ignoring deallocate request for task " + task + " which hasn't been assigned to a container"); removePendingTask(taskInfo); } return false; } NodeInfo nodeInfo = taskInfo.assignedNode; assert nodeInfo != null; // endReason shows up as OTHER for CONTAINER_TIME_OUT LOG.info("Processing de-allocate request for task={}, state={}, endReason={}", taskInfo.task, taskInfo.getState(), endReason); // Re-enable the node if preempted if (taskInfo.getState() == TaskInfo.State.PREEMPTED) { unregisterPendingPreemption(taskInfo.assignedNode.getHost()); nodeInfo.registerUnsuccessfulTaskEnd(true); if (nodeInfo.isDisabled()) { // Re-enable the node, if a task completed due to preemption. Capacity has become available, // and we may have been able to communicate with the node. queueNodeForReEnablement(nodeInfo); } // In case of success, trigger a scheduling run for pending tasks. trySchedulingPendingTasks(); } else { if (taskSucceeded) { // The node may have been blacklisted at this point - which means it may not be in the // activeNodeList. nodeInfo.registerTaskSuccess(); if (nodeInfo.isDisabled()) { // Re-enable the node. If a task succeeded, a slot may have become available. // Also reset commFailures since a task was able to communicate back and indicate success. queueNodeForReEnablement(nodeInfo); } // In case of success, trigger a scheduling run for pending tasks. trySchedulingPendingTasks(); } else { // Task Failed nodeInfo.registerUnsuccessfulTaskEnd(false); // TODO Include EXTERNAL_PREEMPTION in this list? // TODO HIVE-16134. Differentiate between EXTERNAL_PREEMPTION_WAITQUEU vs EXTERNAL_PREEMPTION_FINISHABLE? if (endReason != null && EnumSet .of(TaskAttemptEndReason.EXECUTOR_BUSY, TaskAttemptEndReason.COMMUNICATION_ERROR) .contains(endReason)) { if (endReason == TaskAttemptEndReason.COMMUNICATION_ERROR) { dagStats.registerCommFailure(taskInfo.assignedNode.getHost()); } else if (endReason == TaskAttemptEndReason.EXECUTOR_BUSY) { dagStats.registerTaskRejected(taskInfo.assignedNode.getHost()); } } if (endReason != null && endReason == TaskAttemptEndReason.NODE_FAILED) { LOG.info( "Task {} ended on {} with a NODE_FAILED message." 
+ " A message should come in from the registry to disable this node unless" + " this was a temporary communication failure", task, nodeInfo.toShortString()); } boolean commFailure = endReason != null && endReason == TaskAttemptEndReason.COMMUNICATION_ERROR; disableNode(nodeInfo, commFailure); } } } finally { writeLock.unlock(); } getContext().containerBeingReleased(taskInfo.containerId); getContext().containerCompleted(taskInfo.task, ContainerStatus.newInstance(taskInfo.containerId, ContainerState.COMPLETE, "", 0)); return true; } @Override public Object deallocateContainer(ContainerId containerId) { if (LOG.isDebugEnabled()) { LOG.debug("Ignoring deallocateContainer for containerId: {}", containerId); } // Containers are not being tracked for re-use. // This is safe to ignore since a deallocate task will come in. return null; } @Override public void setShouldUnregister() { } @Override public boolean hasUnregistered() { // Nothing to do. No registration involved. return true; } /** * @param request the list of preferred hosts. null implies any host * @return */ private SelectHostResult selectHost(TaskInfo request) { String[] requestedHosts = request.requestedHosts; String requestedHostsDebugStr = Arrays.toString(requestedHosts); if (LOG.isDebugEnabled()) { LOG.debug("selectingHost for task={} on hosts={}", request.task, requestedHostsDebugStr); } long schedulerAttemptTime = clock.getTime(); readLock.lock(); // Read-lock. Not updating any stats at the moment. try { boolean shouldDelayForLocality = request.shouldDelayForLocality(schedulerAttemptTime); LOG.debug("ShouldDelayForLocality={} for task={} on hosts={}", shouldDelayForLocality, request.task, requestedHostsDebugStr); if (requestedHosts != null && requestedHosts.length > 0) { int prefHostCount = -1; boolean requestedHostsWillBecomeAvailable = false; for (String host : requestedHosts) { prefHostCount++; // Pick the first host always. Weak attempt at cache affinity. Set<ServiceInstance> instances = activeInstances.getByHost(host); if (!instances.isEmpty()) { for (ServiceInstance inst : instances) { NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity()); if (nodeInfo != null) { if (nodeInfo.canAcceptTask()) { // Successfully scheduled. LOG.info("Assigning {} when looking for {}." + " local=true FirstRequestedHost={}, #prefLocations={}", nodeInfo.toShortString(), host, (prefHostCount == 0), requestedHosts.length); return new SelectHostResult(nodeInfo); } else { // The node cannot accept a task at the moment. if (shouldDelayForLocality) { // Perform some checks on whether the node will become available or not. if (request.shouldForceLocality()) { requestedHostsWillBecomeAvailable = true; } else { if (nodeInfo.getEnableTime() > request.getLocalityDelayTimeout() && nodeInfo.isDisabled() && nodeInfo.hadCommFailure()) { LOG.debug("Host={} will not become available within requested timeout", nodeInfo); // This node will likely be activated after the task timeout expires. } else { // Worth waiting for the timeout. requestedHostsWillBecomeAvailable = true; } } } } } else { LOG.warn( "Null NodeInfo when attempting to get host with worker {}, and host {}", inst, host); // Leave requestedHostWillBecomeAvailable as is. If some other host is found - delay, // else ends up allocating to a random host immediately. } } } } // Check if forcing the location is required. 
if (shouldDelayForLocality) { if (requestedHostsWillBecomeAvailable) { if (LOG.isDebugEnabled()) { LOG.debug("Delaying local allocation for [" + request.task + "] when trying to allocate on [" + requestedHostsDebugStr + "]" + ". ScheduleAttemptTime=" + schedulerAttemptTime + ", taskDelayTimeout=" + request.getLocalityDelayTimeout()); } return SELECT_HOST_RESULT_DELAYED_LOCALITY; } else { if (LOG.isDebugEnabled()) { LOG.debug("Skipping local allocation for [" + request.task + "] when trying to allocate on [" + requestedHostsDebugStr + "] since none of these hosts are part of the known list"); } } } } /* fall through - miss in locality or no locality-requested */ Collection<ServiceInstance> instances = activeInstances.getAllInstancesOrdered(true); List<NodeInfo> allNodes = new ArrayList<>(instances.size()); List<NodeInfo> activeNodesWithFreeSlots = new ArrayList<>(); for (ServiceInstance inst : instances) { if (inst instanceof InactiveServiceInstance) { allNodes.add(null); } else { NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity()); if (nodeInfo == null) { allNodes.add(null); } else { allNodes.add(nodeInfo); if (nodeInfo.canAcceptTask()) { activeNodesWithFreeSlots.add(nodeInfo); } } } } if (allNodes.isEmpty()) { return SELECT_HOST_RESULT_DELAYED_RESOURCES; } // no locality-requested, randomly pick a node containing free slots if (requestedHosts == null || requestedHosts.length == 0) { if (LOG.isDebugEnabled()) { LOG.debug("No-locality requested. Selecting a random host for task={}", request.task); } return randomSelection(activeNodesWithFreeSlots); } // miss in locality request, try picking consistent location with fallback to random selection final String firstRequestedHost = requestedHosts[0]; int requestedHostIdx = -1; for (int i = 0; i < allNodes.size(); i++) { NodeInfo nodeInfo = allNodes.get(i); if (nodeInfo != null) { if (nodeInfo.getHost().equals(firstRequestedHost)){ requestedHostIdx = i; break; } } } // requested host died or unknown host requested, fallback to random selection. // TODO: At this point we don't know the slot number of the requested host, so can't rollover to next available if (requestedHostIdx == -1) { if (LOG.isDebugEnabled()) { LOG.debug("Requested node [{}] in consistent order does not exist. Falling back to random selection for " + "request {}", firstRequestedHost, request); } return randomSelection(activeNodesWithFreeSlots); } // requested host is still alive but cannot accept task, pick the next available host in consistent order for (int i = 0; i < allNodes.size(); i++) { NodeInfo nodeInfo = allNodes.get((i + requestedHostIdx + 1) % allNodes.size()); // next node in consistent order died or does not have free slots, rollover to next if (nodeInfo == null || !nodeInfo.canAcceptTask()) { continue; } else { if (LOG.isDebugEnabled()) { LOG.debug("Assigning {} in consistent order when looking for first requested host, from #hosts={}," + " requestedHosts={}", nodeInfo.toShortString(), allNodes.size(), ((requestedHosts == null || requestedHosts.length == 0) ? 
"null" : requestedHostsDebugStr)); } return new SelectHostResult(nodeInfo); } } return SELECT_HOST_RESULT_DELAYED_RESOURCES; } finally { readLock.unlock(); } } private SelectHostResult randomSelection(final List<NodeInfo> nodesWithFreeSlots) { if (nodesWithFreeSlots.isEmpty()) { return SELECT_HOST_RESULT_DELAYED_RESOURCES; } NodeInfo randomNode = nodesWithFreeSlots.get(random.nextInt(nodesWithFreeSlots.size())); if (LOG.isInfoEnabled()) { LOG.info("Assigning {} when looking for any host, from #hosts={}, requestedHosts=null", randomNode.toShortString(), nodesWithFreeSlots.size()); } return new SelectHostResult(randomNode); } private void addNode(NodeInfo node, ServiceInstance serviceInstance) { // we have just added a new node. Signal timeout monitor to reset timer if (activeInstances.size() != 0 && timeoutFutureRef.get() != null) { LOG.info("New node added. Signalling scheduler timeout monitor thread to stop timer."); stopTimeoutMonitor(); } NodeReport nodeReport = constructNodeReport(serviceInstance, true); getContext().nodesUpdated(Collections.singletonList(nodeReport)); // When the same node goes away and comes back... the old entry will be lost - which means // we don't know how many fragments we have actually scheduled on this node. // Replacing it is the right thing to do though, since we expect the AM to kill all the fragments running on the node, via timeouts. // De-allocate messages coming in from the old node are sent to the NodeInfo instance for the old node. instanceToNodeMap.put(node.getNodeIdentity(), node); if (metrics != null) { metrics.setClusterNodeCount(activeInstances.size()); } // Trigger scheduling since a new node became available. LOG.info("Adding new node: {}. TotalNodeCount={}. activeInstances.size={}", node, instanceToNodeMap.size(), activeInstances.size()); trySchedulingPendingTasks(); } private void reenableDisabledNode(NodeInfo nodeInfo) { writeLock.lock(); try { LOG.info("Attempting to re-enable node: " + nodeInfo.toShortString()); if (activeInstances.getInstance(nodeInfo.getNodeIdentity()) != null) { nodeInfo.enableNode(); if (metrics != null) { metrics.setDisabledNodeCount(disabledNodesQueue.size()); } } else { if (LOG.isInfoEnabled()) { LOG.info( "Not re-enabling node: {}, since it is not present in the RegistryActiveNodeList", nodeInfo.toShortString()); } } } finally { writeLock.unlock(); } } /** * Updates relevant structures on the node, and fixes the position in the disabledNodeQueue * to facilitate the actual re-enablement of the node. * @param nodeInfo the node to be re-enabled */ private void queueNodeForReEnablement(final NodeInfo nodeInfo) { if ( disabledNodesQueue.remove(nodeInfo)) { LOG.info("Queueing node for re-enablement: {}", nodeInfo.toShortString()); nodeInfo.resetExpireInformation(); disabledNodesQueue.add(nodeInfo); } } private void disableNode(NodeInfo nodeInfo, boolean isCommFailure) { writeLock.lock(); try { if (nodeInfo == null || nodeInfo.isDisabled()) { if (LOG.isDebugEnabled()) { if (nodeInfo != null) { LOG.debug("Node: " + nodeInfo.toShortString() + " already disabled, or invalid. Not doing anything."); } else { LOG.debug("Ignoring disableNode invocation for null NodeInfo"); } } } else { nodeInfo.disableNode(isCommFailure); // TODO: handle task to container map events in case of hard failures disabledNodesQueue.add(nodeInfo); if (metrics != null) { metrics.setDisabledNodeCount(disabledNodesQueue.size()); } // Trigger a scheduling run - in case there's some task which was waiting for this node to // become available. 
trySchedulingPendingTasks(); } } finally { writeLock.unlock(); } } private static NodeReport constructNodeReport(ServiceInstance serviceInstance, boolean healthy) { NodeReport nodeReport = NodeReport.newInstance(NodeId .newInstance(serviceInstance.getHost(), serviceInstance.getRpcPort()), healthy ? NodeState.RUNNING : NodeState.LOST, serviceInstance.getServicesAddress(), null, null, null, 0, "", 0l); return nodeReport; } private void addPendingTask(TaskInfo taskInfo) { writeLock.lock(); try { List<TaskInfo> tasksAtPriority = pendingTasks.get(taskInfo.priority); if (tasksAtPriority == null) { tasksAtPriority = new LinkedList<>(); pendingTasks.put(taskInfo.priority, tasksAtPriority); } // Delayed tasks will not kick in right now. That will happen in the scheduling loop. tasksAtPriority.add(taskInfo); knownTasks.putIfAbsent(taskInfo.task, taskInfo); if (metrics != null) { metrics.incrPendingTasksCount(); } if (LOG.isInfoEnabled()) { LOG.info("PendingTasksInfo={}", constructPendingTaskCountsLogMessage()); } } finally { writeLock.unlock(); } } /* Remove a task from the pending list */ private void removePendingTask(TaskInfo taskInfo) { writeLock.lock(); try { Priority priority = taskInfo.priority; List<TaskInfo> taskInfoList = pendingTasks.get(priority); if (taskInfoList == null || taskInfoList.isEmpty() || !taskInfoList.remove(taskInfo)) { LOG.warn("Could not find task: " + taskInfo.task + " in pending list, at priority: " + priority); } } finally { writeLock.unlock(); } } /* Register a running task into the runningTasks structure */ private void registerRunningTask(TaskInfo taskInfo) { writeLock.lock(); try { int priority = taskInfo.priority.getPriority(); TreeSet<TaskInfo> tasksAtpriority = runningTasks.get(priority); if (tasksAtpriority == null) { tasksAtpriority = new TreeSet<>(TASK_INFO_COMPARATOR); runningTasks.put(priority, tasksAtpriority); } tasksAtpriority.add(taskInfo); if (metrics != null) { metrics.decrPendingTasksCount(); } } finally { writeLock.unlock(); } } /* Unregister a task from the known and running structures */ private TaskInfo unregisterTask(Object task) { writeLock.lock(); try { TaskInfo taskInfo = knownTasks.remove(task); if (taskInfo != null) { if (taskInfo.getState() == TaskInfo.State.ASSIGNED) { // Remove from the running list. int priority = taskInfo.priority.getPriority(); Set<TaskInfo> tasksAtPriority = runningTasks.get(priority); Preconditions.checkState(tasksAtPriority != null, "runningTasks should contain an entry if the task was in running state. Caused by task: {}", task); tasksAtPriority.remove(taskInfo); if (tasksAtPriority.isEmpty()) { runningTasks.remove(priority); } } } else { LOG.warn("Could not find TaskInfo for task: {}. 
Not removing it from the running set", task); } return taskInfo; } finally { writeLock.unlock(); } } private enum ScheduleResult { // Successfully scheduled SCHEDULED, // Delayed to find a local match DELAYED_LOCALITY, // Delayed due to temporary resource availability DELAYED_RESOURCES, // Inadequate total resources - will never succeed / wait for new executors to become available INADEQUATE_TOTAL_RESOURCES, } @VisibleForTesting protected void schedulePendingTasks() { writeLock.lock(); try { if (LOG.isDebugEnabled()) { LOG.debug("ScheduleRun: {}", constructPendingTaskCountsLogMessage()); } Iterator<Entry<Priority, List<TaskInfo>>> pendingIterator = pendingTasks.entrySet().iterator(); Resource totalResource = getTotalResources(); while (pendingIterator.hasNext()) { Entry<Priority, List<TaskInfo>> entry = pendingIterator.next(); List<TaskInfo> taskListAtPriority = entry.getValue(); Iterator<TaskInfo> taskIter = taskListAtPriority.iterator(); boolean scheduledAllAtPriority = true; while (taskIter.hasNext()) { // TODO Optimization: Add a check to see if there's any capacity available. No point in // walking through all active nodes, if they don't have potential capacity. TaskInfo taskInfo = taskIter.next(); if (taskInfo.getNumPreviousAssignAttempts() == 1) { dagStats.registerDelayedAllocation(); } taskInfo.triedAssigningTask(); ScheduleResult scheduleResult = scheduleTask(taskInfo, totalResource); if (LOG.isDebugEnabled()) { LOG.debug("ScheduleResult for Task: {} = {}", taskInfo, scheduleResult); } if (scheduleResult == ScheduleResult.SCHEDULED) { taskIter.remove(); } else { if (scheduleResult == ScheduleResult.INADEQUATE_TOTAL_RESOURCES) { LOG.info("Inadequate total resources before scheduling pending tasks." + " Signalling scheduler timeout monitor thread to start timer."); startTimeoutMonitor(); // TODO Nothing else should be done for this task. Move on. } // Try pre-empting a task so that a higher priority task can take it's place. // Preempt only if there's no pending preemptions to avoid preempting twice for a task. String[] potentialHosts; if (scheduleResult == ScheduleResult.DELAYED_LOCALITY) { // Add the task to the delayed task queue if it does not already exist. maybeAddToDelayedTaskQueue(taskInfo); // Try preempting a lower priority task in any case. // preempt only on specific hosts, if no preemptions already exist on those. potentialHosts = taskInfo.requestedHosts; //Protect against a bad location being requested. if (potentialHosts == null || potentialHosts.length == 0) { potentialHosts = null; } } else { // preempt on any host. potentialHosts = null; } // At this point we're dealing with all return types, except ScheduleResult.SCHEDULED. if (potentialHosts != null) { if (LOG.isDebugEnabled()) { LOG.debug("Attempting to preempt on requested host for task={}, potentialHosts={}", taskInfo, Arrays.toString(potentialHosts)); } // Preempt on specific host boolean shouldPreempt = true; for (String host : potentialHosts) { // Preempt only if there are no pending preemptions on the same host // When the premption registers, the request at the highest priority will be given the slot, // even if the initial preemption was caused by some other task. // TODO Maybe register which task the preemption was for, to avoid a bad non-local allocation. MutableInt pendingHostPreemptions = pendingPreemptionsPerHost.get(host); if (pendingHostPreemptions != null && pendingHostPreemptions.intValue() > 0) { shouldPreempt = false; LOG.debug( "Not preempting for task={}. 
Found an existing preemption request on host={}, pendingPreemptionCount={}", taskInfo.task, host, pendingHostPreemptions.intValue()); break; } } if (shouldPreempt) { if (LOG.isDebugEnabled()) { LOG.debug("Attempting to preempt for {} on potential hosts={}. TotalPendingPreemptions={}", taskInfo.task, Arrays.toString(potentialHosts), pendingPreemptions.get()); } preemptTasks(entry.getKey().getPriority(), 1, potentialHosts); } else { if (LOG.isDebugEnabled()) { LOG.debug("Not preempting for {} on potential hosts={}. An existing preemption request exists", taskInfo.task, Arrays.toString(potentialHosts)); } } } else { // Either DELAYED_RESOURCES or DELAYED_LOCALITY with an unknown requested host. // Request for a preemption if there's none pending. If a single preemption is pending, // and this is the next task to be assigned, it will be assigned once that slot becomes available. LOG.debug("Attempting to preempt on any host for task={}, pendingPreemptions={}", taskInfo.task, pendingPreemptions.get()); if (pendingPreemptions.get() == 0) { if (LOG.isDebugEnabled()) { LOG.debug( "Attempting to preempt for task={}, priority={} on any available host", taskInfo.task, taskInfo.priority); } preemptTasks(entry.getKey().getPriority(), 1, null); } else { if (LOG.isDebugEnabled()) { LOG.debug( "Skipping preemption since there are {} pending preemption request. For task={}", pendingPreemptions.get(), taskInfo); } } } // Since there was an allocation failure - don't try assigning tasks at the next priority. scheduledAllAtPriority = false; // Don't break if this allocation failure was a result of a LOCALITY_DELAY. Others could still be allocated. if (scheduleResult != ScheduleResult.DELAYED_LOCALITY) { break; } } // end of else - i.e. could not allocate } // end of loop over pending tasks if (taskListAtPriority.isEmpty()) { // Remove the entry, if there's nothing left at the specific priority level pendingIterator.remove(); } if (!scheduledAllAtPriority) { LOG.debug( "Unable to schedule all requests at priority={}. Skipping subsequent priority levels", entry.getKey()); // Don't attempt scheduling for additional priorities break; } } } finally { writeLock.unlock(); } } private String constructPendingTaskCountsLogMessage() { StringBuilder sb = new StringBuilder(); int totalCount = 0; sb.append("numPriorityLevels=").append(pendingTasks.size()).append(". "); Iterator<Entry<Priority, List<TaskInfo>>> pendingIterator = pendingTasks.entrySet().iterator(); while (pendingIterator.hasNext()) { Entry<Priority, List<TaskInfo>> entry = pendingIterator.next(); int count = entry.getValue() == null ? 0 : entry.getValue().size(); sb.append("[p=").append(entry.getKey().toString()).append(",c=").append(count).append("]"); totalCount += count; } sb.append(". totalPendingTasks=").append(totalCount); sb.append(". 
delayedTaskQueueSize=").append(delayedTaskQueue.size()); return sb.toString(); } private ScheduleResult scheduleTask(TaskInfo taskInfo, Resource totalResource) { Preconditions.checkNotNull(totalResource, "totalResource can not be null"); // If there's no memory available, fail if (totalResource.getMemory() <= 0) { return SELECT_HOST_RESULT_INADEQUATE_TOTAL_CAPACITY.scheduleResult; } SelectHostResult selectHostResult = selectHost(taskInfo); if (selectHostResult.scheduleResult == ScheduleResult.SCHEDULED) { NodeInfo nodeInfo = selectHostResult.nodeInfo; Container container = containerFactory.createContainer(nodeInfo.getResourcePerExecutor(), taskInfo.priority, nodeInfo.getHost(), nodeInfo.getRpcPort(), nodeInfo.getServiceAddress()); writeLock.lock(); // While updating local structures try { // The canAccept part of this log message does not account for this allocation. assignedTaskCounter.incrementAndGet(); LOG.info("Assigned #{}, task={} on node={}, to container={}", assignedTaskCounter.get(), taskInfo, nodeInfo.toShortString(), container.getId()); dagStats.registerTaskAllocated(taskInfo.requestedHosts, taskInfo.requestedRacks, nodeInfo.getHost()); taskInfo.setAssignmentInfo(nodeInfo, container.getId(), clock.getTime()); registerRunningTask(taskInfo); nodeInfo.registerTaskScheduled(); } finally { writeLock.unlock(); } getContext().taskAllocated(taskInfo.task, taskInfo.clientCookie, container); } return selectHostResult.scheduleResult; } // Removes tasks from the runningList and sends out a preempt request to the system. // Subsequent tasks will be scheduled again once the de-allocate request for the preempted // task is processed. private void preemptTasks(int forPriority, int numTasksToPreempt, String []potentialHosts) { Set<String> preemptHosts = null; writeLock.lock(); List<TaskInfo> preemptedTaskList = null; try { NavigableMap<Integer, TreeSet<TaskInfo>> orderedMap = runningTasks.descendingMap(); Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator = orderedMap.entrySet().iterator(); int preemptedCount = 0; while (iterator.hasNext() && preemptedCount < numTasksToPreempt) { Entry<Integer, TreeSet<TaskInfo>> entryAtPriority = iterator.next(); if (entryAtPriority.getKey() > forPriority) { if (potentialHosts != null && preemptHosts == null) { preemptHosts = Sets.newHashSet(potentialHosts); } Iterator<TaskInfo> taskInfoIterator = entryAtPriority.getValue().iterator(); while (taskInfoIterator.hasNext() && preemptedCount < numTasksToPreempt) { TaskInfo taskInfo = taskInfoIterator.next(); if (preemptHosts == null || preemptHosts.contains(taskInfo.assignedNode.getHost())) { // Candidate for preemption. preemptedCount++; LOG.info("preempting {} for task at priority {} with potentialHosts={}", taskInfo, forPriority, potentialHosts == null ? "" : Arrays.toString(potentialHosts)); taskInfo.setPreemptedInfo(clock.getTime()); if (preemptedTaskList == null) { preemptedTaskList = new LinkedList<>(); } dagStats.registerTaskPreempted(taskInfo.assignedNode.getHost()); preemptedTaskList.add(taskInfo); registerPendingPreemption(taskInfo.assignedNode.getHost()); // Remove from the runningTaskList taskInfoIterator.remove(); } } // Remove entire priority level if it's been emptied. if (entryAtPriority.getValue().isEmpty()) { iterator.remove(); } } else { // No tasks qualify as preemptable LOG.debug("No tasks qualify as killable to schedule tasks at priority {}. 
Current priority={}", forPriority, entryAtPriority.getKey()); break; } } } finally { writeLock.unlock(); } // Send out the preempted request outside of the lock. if (preemptedTaskList != null) { for (TaskInfo taskInfo : preemptedTaskList) { LOG.info("Preempting task {}", taskInfo); getContext().preemptContainer(taskInfo.containerId); // Preemption will finally be registered as a deallocateTask as a result of preemptContainer // That resets preemption info and allows additional tasks to be pre-empted if required. } } // The schedule loop will be triggered again when the deallocateTask request comes in for the // preempted task. } private void registerPendingPreemption(String host) { writeLock.lock(); try { pendingPreemptions.incrementAndGet(); if (metrics != null) { metrics.incrPendingPreemptionTasksCount(); } MutableInt val = pendingPreemptionsPerHost.get(host); if (val == null) { val = new MutableInt(0); pendingPreemptionsPerHost.put(host, val); } val.increment(); } finally { writeLock.unlock(); } } private void unregisterPendingPreemption(String host) { writeLock.lock(); try { pendingPreemptions.decrementAndGet(); if (metrics != null) { metrics.decrPendingPreemptionTasksCount(); } MutableInt val = pendingPreemptionsPerHost.get(host); Preconditions.checkNotNull(val); val.decrement(); // Not bothering with removing the entry. There's a limited number of hosts, and a good // chance that the entry will make it back in when the AM is used for a long duration. } finally { writeLock.unlock(); } } private void maybeAddToDelayedTaskQueue(TaskInfo taskInfo) { // There's no point adding a task with forceLocality set - since that will never exit the queue. // Add other tasks if they are not already in the queue. if (!taskInfo.shouldForceLocality() && !taskInfo.isInDelayedQueue()) { taskInfo.setInDelayedQueue(true); delayedTaskQueue.add(taskInfo); } } // ------ Inner classes defined after this point ------ @VisibleForTesting class DelayedTaskSchedulerCallable implements Callable<Void> { private final AtomicBoolean isShutdown = new AtomicBoolean(false); @Override public Void call() { while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) { try { TaskInfo taskInfo = getNextTask(); taskInfo.setInDelayedQueue(false); // Tasks can exist in the delayed queue even after they have been scheduled. // Trigger scheduling only if the task is still in PENDING state. 
processEvictedTask(taskInfo); } catch (InterruptedException e) { if (isShutdown.get()) { LOG.info("DelayedTaskScheduler thread interrupted after shutdown"); break; } else { LOG.warn("DelayedTaskScheduler thread interrupted before being shutdown"); throw new RuntimeException( "DelayedTaskScheduler thread interrupted without being shutdown", e); } } } return null; } public void shutdown() { isShutdown.set(true); } public TaskInfo getNextTask() throws InterruptedException { return delayedTaskQueue.take(); } public void processEvictedTask(TaskInfo taskInfo) { if (shouldScheduleTask(taskInfo)) { trySchedulingPendingTasks(); } } public boolean shouldScheduleTask(TaskInfo taskInfo) { return taskInfo.getState() == TaskInfo.State.PENDING; } } @VisibleForTesting DelayedTaskSchedulerCallable createDelayedTaskSchedulerCallable() { return new DelayedTaskSchedulerCallable(); } private class NodeEnablerCallable implements Callable<Void> { private final AtomicBoolean isShutdown = new AtomicBoolean(false); private static final long POLL_TIMEOUT = 10000L; @Override public Void call() { while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) { try { NodeInfo nodeInfo = disabledNodesQueue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS); if (nodeInfo != null) { // A node became available. Enable the node and try scheduling. reenableDisabledNode(nodeInfo); trySchedulingPendingTasks(); } } catch (InterruptedException e) { if (isShutdown.get()) { LOG.info("NodeEnabler thread interrupted after shutdown"); break; } else { LOG.warn("NodeEnabler thread interrupted without being shutdown"); throw new RuntimeException("NodeEnabler thread interrupted without being shutdown", e); } } } return null; } // Call this first, then send in an interrupt to the thread. public void shutdown() { isShutdown.set(true); } } private void trySchedulingPendingTasks() { scheduleLock.lock(); try { pendingScheduleInvocations.set(true); scheduleCondition.signal(); } finally { scheduleLock.unlock(); } } private class SchedulerTimeoutMonitor implements Runnable { private final Logger LOG = LoggerFactory.getLogger(SchedulerTimeoutMonitor.class); @Override public void run() { LOG.info("Reporting SERVICE_UNAVAILABLE error as no instances are running"); try { getContext().reportError(ServicePluginErrorDefaults.SERVICE_UNAVAILABLE, "No LLAP Daemons are running", getContext().getCurrentDagInfo()); } catch (Exception e) { DagInfo currentDagInfo = getContext().getCurrentDagInfo(); LOG.error("Exception when reporting SERVICE_UNAVAILABLE error for dag: {}", currentDagInfo == null ? "" : currentDagInfo.getName(), e); } } } private class SchedulerCallable implements Callable<Void> { private AtomicBoolean isShutdown = new AtomicBoolean(false); @Override public Void call() { while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) { scheduleLock.lock(); try { while (!pendingScheduleInvocations.get()) { scheduleCondition.await(); } } catch (InterruptedException e) { if (isShutdown.get()) { LOG.info("Scheduler thread interrupted after shutdown"); break; } else { LOG.warn("Scheduler thread interrupted without being shutdown"); throw new RuntimeException("Scheduler thread interrupted without being shutdown", e); } } finally { scheduleLock.unlock(); } // Set pending to false since scheduling is about to run. Any triggers up to this point // will be handled in the next run. // A new request may come in right after this is set to false, but before the actual scheduling. 
// This will be handled in this run, but will cause an immediate run after, which is harmless. // This is mainly to handle a trySchedue request while in the middle of a run - since the event // which triggered it may not be processed for all tasks in the run. pendingScheduleInvocations.set(false); // Schedule outside of the scheduleLock - which should only be used to wait on the condition. schedulePendingTasks(); } return null; } // Call this first, then send in an interrupt to the thread. public void shutdown() { isShutdown.set(true); } } // ------ Additional static classes defined after this point ------ @VisibleForTesting static class NodeInfo implements Delayed { private final NodeBlacklistConf blacklistConf; final ServiceInstance serviceInstance; private final Clock clock; long expireTimeMillis = -1; private long numSuccessfulTasks = 0; private long numSuccessfulTasksAtLastBlacklist = -1; float cumulativeBackoffFactor = 1.0f; // Indicates whether a node had a recent communication failure. // This is primarily for tracking and logging purposes for the moment. // TODO At some point, treat task rejection and communication failures differently. private boolean hadCommFailure = false; // Indicates whether a node is disabled - for whatever reason - commFailure, busy, etc. private boolean disabled = false; private int numPreemptedTasks = 0; private int numScheduledTasks = 0; private final int numSchedulableTasks; private final LlapTaskSchedulerMetrics metrics; private final Resource resourcePerExecutor; private final String shortStringBase; /** * Create a NodeInfo bound to a service instance * @param serviceInstance the associated serviceInstance * @param blacklistConf blacklist configuration * @param clock clock to use to obtain timing information * @param numSchedulableTasksConf number of schedulable tasks on the node. 
0 represents auto * detect based on the serviceInstance, -1 indicates unlimited capacity * @param metrics */ NodeInfo(ServiceInstance serviceInstance, NodeBlacklistConf blacklistConf, Clock clock, int numSchedulableTasksConf, final LlapTaskSchedulerMetrics metrics) { Preconditions.checkArgument(numSchedulableTasksConf >= -1, "NumSchedulableTasks must be >=-1"); this.serviceInstance = serviceInstance; this.blacklistConf = blacklistConf; this.clock = clock; this.metrics = metrics; int numVcores = serviceInstance.getResource().getVirtualCores(); int memoryPerInstance = serviceInstance.getResource().getMemory(); int memoryPerExecutor = (int)(memoryPerInstance / (double) numVcores); resourcePerExecutor = Resource.newInstance(memoryPerExecutor, 1); if (numSchedulableTasksConf == 0) { int pendingQueueCapacity = 0; String pendingQueueCapacityString = serviceInstance.getProperties() .get(ConfVars.LLAP_DAEMON_TASK_SCHEDULER_WAIT_QUEUE_SIZE.varname); LOG.info("Setting up node: {} with available capacity={}, pendingQueueSize={}, memory={}", serviceInstance, serviceInstance.getResource().getVirtualCores(), pendingQueueCapacityString, serviceInstance.getResource().getMemory()); if (pendingQueueCapacityString != null) { pendingQueueCapacity = Integer.parseInt(pendingQueueCapacityString); } this.numSchedulableTasks = numVcores + pendingQueueCapacity; } else { this.numSchedulableTasks = numSchedulableTasksConf; LOG.info("Setting up node: " + serviceInstance + " with schedulableCapacity=" + this.numSchedulableTasks); } if (metrics != null) { metrics.incrSchedulableTasksCount(numSchedulableTasks); } shortStringBase = setupShortStringBase(); } String getNodeIdentity() { return serviceInstance.getWorkerIdentity(); } String getHost() { return serviceInstance.getHost(); } int getRpcPort() { return serviceInstance.getRpcPort(); } String getServiceAddress() { return serviceInstance.getServicesAddress(); } public Resource getResourcePerExecutor() { return resourcePerExecutor; } void resetExpireInformation() { expireTimeMillis = -1; hadCommFailure = false; } void enableNode() { resetExpireInformation(); disabled = false; } void disableNode(boolean commFailure) { long duration = blacklistConf.minDelay; long currentTime = clock.getTime(); this.hadCommFailure = commFailure; disabled = true; if (numSuccessfulTasksAtLastBlacklist == numSuccessfulTasks) { // Relying on a task succeeding to reset the exponent. // There are no notifications on whether a task gets accepted or not. That would be the ideal trigger to // reset this. cumulativeBackoffFactor = cumulativeBackoffFactor * blacklistConf.backoffFactor; } else { // Was able to execute something before the last blacklist. Reset the exponent. cumulativeBackoffFactor = 1.0f; } long delayTime = (long) (duration * cumulativeBackoffFactor); if (delayTime > blacklistConf.maxDelay) { delayTime = blacklistConf.maxDelay; } if (LOG.isInfoEnabled()) { LOG.info("Disabling instance {} for {} milliseconds.
commFailure={}", toShortString(), delayTime, commFailure); } expireTimeMillis = currentTime + delayTime; numSuccessfulTasksAtLastBlacklist = numSuccessfulTasks; } void registerTaskScheduled() { numScheduledTasks++; if (metrics != null) { metrics.incrRunningTasksCount(); metrics.decrSchedulableTasksCount(); } } void registerTaskSuccess() { numSuccessfulTasks++; numScheduledTasks--; if (metrics != null) { metrics.incrSuccessfulTasksCount(); metrics.decrRunningTasksCount(); metrics.incrSchedulableTasksCount(); } } void registerUnsuccessfulTaskEnd(boolean wasPreempted) { numScheduledTasks--; if (metrics != null) { metrics.decrRunningTasksCount(); metrics.incrSchedulableTasksCount(); } if (wasPreempted) { numPreemptedTasks++; if (metrics != null) { metrics.incrPreemptedTasksCount(); } } } /** * @return the time at which this node will be re-enabled */ long getEnableTime() { return expireTimeMillis; } public boolean isDisabled() { return disabled; } boolean hadCommFailure() { return hadCommFailure; } boolean _canAccepInternal() { return !hadCommFailure && !disabled &&(numSchedulableTasks == -1 || ((numSchedulableTasks - numScheduledTasks) > 0)); } int canAcceptCounter = 0; /* Returning true does not guarantee that the task will run, considering other queries may be running in the system. Also depends upon the capacity usage configuration */ boolean canAcceptTask() { boolean result = _canAccepInternal(); if (LOG.isTraceEnabled()) { LOG.trace(constructCanAcceptLogResult(result)); } if (canAcceptCounter == 10000) { canAcceptCounter++; LOG.info(constructCanAcceptLogResult(result)); canAcceptCounter = 0; } return result; } String constructCanAcceptLogResult(boolean result) { StringBuilder sb = new StringBuilder(); sb.append("Node[").append(serviceInstance.getHost()).append(":").append(serviceInstance.getRpcPort()) .append(", ").append(serviceInstance.getWorkerIdentity()).append("]: ") .append("canAcceptTask=").append(result) .append(", numScheduledTasks=").append(numScheduledTasks) .append(", numSchedulableTasks=").append(numSchedulableTasks) .append(", hadCommFailure=").append(hadCommFailure) .append(", disabled=").append(disabled); return sb.toString(); } @Override public long getDelay(TimeUnit unit) { return unit.convert(expireTimeMillis - clock.getTime(), TimeUnit.MILLISECONDS); } @Override public int compareTo(Delayed o) { NodeInfo other = (NodeInfo) o; if (other.expireTimeMillis > this.expireTimeMillis) { return -1; } else if (other.expireTimeMillis < this.expireTimeMillis) { return 1; } else { return 0; } } private String setupShortStringBase() { return "{" + serviceInstance.getHost() + ":" + serviceInstance.getRpcPort() + ", id=" + getNodeIdentity(); } @Override public String toString() { return "NodeInfo{" + "instance=" + serviceInstance + ", expireTimeMillis=" + expireTimeMillis + ", numSuccessfulTasks=" + numSuccessfulTasks + ", numSuccessfulTasksAtLastBlacklist=" + numSuccessfulTasksAtLastBlacklist + ", cumulativeBackoffFactor=" + cumulativeBackoffFactor + ", numSchedulableTasks=" + numSchedulableTasks + ", numScheduledTasks=" + numScheduledTasks + ", disabled=" + disabled + ", commFailures=" + hadCommFailure +'}'; } private String toShortString() { StringBuilder sb = new StringBuilder(); sb.append(", canAcceptTask=").append(_canAccepInternal()); sb.append(", st=").append(numScheduledTasks); sb.append(", ac=").append((numSchedulableTasks - numScheduledTasks)); sb.append(", commF=").append(hadCommFailure); sb.append(", disabled=").append(disabled); sb.append("}"); return shortStringBase + 
  @VisibleForTesting
  static class StatsPerDag {
    int numRequestedAllocations = 0;
    int numRequestsWithLocation = 0;
    int numRequestsWithoutLocation = 0;
    int numTotalAllocations = 0;
    int numLocalAllocations = 0;
    int numNonLocalAllocations = 0;
    int numAllocationsNoLocalityRequest = 0;
    int numRejectedTasks = 0;
    int numCommFailures = 0;
    int numDelayedAllocations = 0;
    int numPreemptedTasks = 0;
    Map<String, AtomicInteger> localityBasedNumAllocationsPerHost = new HashMap<>();
    Map<String, AtomicInteger> numAllocationsPerHost = new HashMap<>();

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("NumPreemptedTasks=").append(numPreemptedTasks).append(", ");
      sb.append("NumRequestedAllocations=").append(numRequestedAllocations).append(", ");
      sb.append("NumRequestsWithLocation=").append(numRequestsWithLocation).append(", ");
      sb.append("NumLocalAllocations=").append(numLocalAllocations).append(", ");
      sb.append("NumNonLocalAllocations=").append(numNonLocalAllocations).append(", ");
      sb.append("NumTotalAllocations=").append(numTotalAllocations).append(", ");
      sb.append("NumRequestsWithoutLocation=").append(numRequestsWithoutLocation).append(", ");
      sb.append("NumRejectedTasks=").append(numRejectedTasks).append(", ");
      sb.append("NumCommFailures=").append(numCommFailures).append(", ");
      sb.append("NumDelayedAllocations=").append(numDelayedAllocations).append(", ");
      sb.append("LocalityBasedAllocationsPerHost=").append(localityBasedNumAllocationsPerHost)
          .append(", ");
      sb.append("NumAllocationsPerHost=").append(numAllocationsPerHost);
      return sb.toString();
    }

    void registerTaskRequest(String[] requestedHosts, String[] requestedRacks) {
      numRequestedAllocations++;
      // TODO Change after HIVE-9987. For now, there's no rack matching.
      if (requestedHosts != null && requestedHosts.length != 0) {
        numRequestsWithLocation++;
      } else {
        numRequestsWithoutLocation++;
      }
    }

    void registerTaskAllocated(String[] requestedHosts, String[] requestedRacks,
        String allocatedHost) {
      // TODO Change after HIVE-9987. For now, there's no rack matching.
      if (requestedHosts != null && requestedHosts.length != 0) {
        Set<String> requestedHostSet = new HashSet<>(Arrays.asList(requestedHosts));
        if (requestedHostSet.contains(allocatedHost)) {
          numLocalAllocations++;
          _registerAllocationInHostMap(allocatedHost, localityBasedNumAllocationsPerHost);
        } else {
          numNonLocalAllocations++;
        }
      } else {
        numAllocationsNoLocalityRequest++;
      }
      numTotalAllocations++;
      _registerAllocationInHostMap(allocatedHost, numAllocationsPerHost);
    }

    // TODO Track stats of rejections etc per host
    void registerTaskPreempted(String host) {
      numPreemptedTasks++;
    }

    void registerCommFailure(String host) {
      numCommFailures++;
    }

    void registerTaskRejected(String host) {
      numRejectedTasks++;
    }

    void registerDelayedAllocation() {
      numDelayedAllocations++;
    }

    private void _registerAllocationInHostMap(String host, Map<String, AtomicInteger> hostMap) {
      AtomicInteger val = hostMap.get(host);
      if (val == null) {
        val = new AtomicInteger(0);
        hostMap.put(host, val);
      }
      val.incrementAndGet();
    }
  }
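  // Illustrative example of the locality accounting above (hypothetical hosts): a request for
  // hosts {"n1", "n2"} that lands on "n2" counts as a local allocation and increments the per-host
  // entry for "n2" in localityBasedNumAllocationsPerHost; the same request landing on "n3" counts
  // as non-local; a request with no host preference only increments numAllocationsNoLocalityRequest
  // and the overall totals.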
  // TODO There needs to be a mechanism to figure out different attempts for the same task. Delays
  // could potentially be changed based on this.
  @VisibleForTesting
  static class TaskInfo implements Delayed {

    enum State {
      PENDING, ASSIGNED, PREEMPTED
    }

    // IDs used to ensure two TaskInfos are different without using the underlying task instance.
    // Required for insertion into a TreeMap
    static final AtomicLong ID_GEN = new AtomicLong(0);
    final long uniqueId;
    final LocalityDelayConf localityDelayConf;
    final Clock clock;
    final Object task;
    final Object clientCookie;
    final Priority priority;
    final Resource capability;
    final String[] requestedHosts;
    final String[] requestedRacks;
    final long requestTime;
    final long localityDelayTimeout;
    long startTime;
    long preemptTime;
    ContainerId containerId;
    NodeInfo assignedNode;
    private State state = State.PENDING;
    boolean inDelayedQueue = false;

    private int numAssignAttempts = 0;

    // TaskInfo instances for two different tasks will not be the same. Only a single instance should
    // ever be created for a taskAttempt
    public TaskInfo(LocalityDelayConf localityDelayConf, Clock clock, Object task,
        Object clientCookie, Priority priority, Resource capability, String[] hosts, String[] racks,
        long requestTime) {
      this.localityDelayConf = localityDelayConf;
      this.clock = clock;
      this.task = task;
      this.clientCookie = clientCookie;
      this.priority = priority;
      this.capability = capability;
      this.requestedHosts = hosts;
      this.requestedRacks = racks;
      this.requestTime = requestTime;
      if (localityDelayConf.getNodeLocalityDelay() == -1) {
        localityDelayTimeout = Long.MAX_VALUE;
      } else if (localityDelayConf.getNodeLocalityDelay() == 0) {
        localityDelayTimeout = 0L;
      } else {
        localityDelayTimeout = requestTime + localityDelayConf.getNodeLocalityDelay();
      }
      this.uniqueId = ID_GEN.getAndIncrement();
    }

    synchronized void setAssignmentInfo(NodeInfo nodeInfo, ContainerId containerId, long startTime) {
      this.assignedNode = nodeInfo;
      this.containerId = containerId;
      this.startTime = startTime;
      this.state = State.ASSIGNED;
    }

    synchronized void setPreemptedInfo(long preemptTime) {
      this.state = State.PREEMPTED;
      this.preemptTime = preemptTime;
    }

    synchronized void setInDelayedQueue(boolean val) {
      this.inDelayedQueue = val;
    }

    synchronized void triedAssigningTask() {
      numAssignAttempts++;
    }

    synchronized int getNumPreviousAssignAttempts() {
      return numAssignAttempts;
    }

    synchronized State getState() {
      return state;
    }

    synchronized boolean isInDelayedQueue() {
      return inDelayedQueue;
    }

    boolean shouldDelayForLocality(long schedulerAttemptTime) {
      // getDelay <=0 means the task will be evicted from the queue.
      return localityDelayTimeout > schedulerAttemptTime;
    }

    boolean shouldForceLocality() {
      return localityDelayTimeout == Long.MAX_VALUE;
    }

    long getLocalityDelayTimeout() {
      return localityDelayTimeout;
    }
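    // Worked example of the locality delay handling above (hypothetical values, not defaults):
    // with nodeLocalityDelay=3000ms and requestTime=100000ms, localityDelayTimeout is 103000ms, so
    // shouldDelayForLocality(101000) returns true (keep waiting for a requested host) while
    // shouldDelayForLocality(103500) returns false (fall back to any host). A delay of -1 maps to
    // Long.MAX_VALUE, which shouldForceLocality() treats as "never fall back"; a delay of 0
    // disables the wait entirely.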
"assignedNode=" + assignedNode.toShortString() : "") + ", uniqueId=" + uniqueId + ", localityDelayTimeout=" + localityDelayTimeout + '}'; } @Override public long getDelay(TimeUnit unit) { return unit.convert(localityDelayTimeout - clock.getTime(), TimeUnit.MILLISECONDS); } @Override public int compareTo(Delayed o) { TaskInfo other = (TaskInfo) o; if (other.localityDelayTimeout > this.localityDelayTimeout) { return -1; } else if (other.localityDelayTimeout < this.localityDelayTimeout) { return 1; } else { return 0; } } } // Newer tasks first. private static class TaskStartComparator implements Comparator<TaskInfo> { @Override public int compare(TaskInfo o1, TaskInfo o2) { if (o1.startTime > o2.startTime) { return -1; } else if (o1.startTime < o2.startTime) { return 1; } else { // Comparing on time is not sufficient since two may be created at the same time, // in which case inserting into a TreeSet/Map would break if (o1.uniqueId > o2.uniqueId) { return -1; } else if (o1.uniqueId < o2.uniqueId) { return 1; } else { return 0; } } } } private static class SelectHostResult { final NodeInfo nodeInfo; final ScheduleResult scheduleResult; SelectHostResult(NodeInfo nodeInfo) { this.nodeInfo = nodeInfo; this.scheduleResult = ScheduleResult.SCHEDULED; } SelectHostResult(ScheduleResult scheduleResult) { this.nodeInfo = null; this.scheduleResult = scheduleResult; } } private static final SelectHostResult SELECT_HOST_RESULT_INADEQUATE_TOTAL_CAPACITY = new SelectHostResult(ScheduleResult.INADEQUATE_TOTAL_RESOURCES); private static final SelectHostResult SELECT_HOST_RESULT_DELAYED_LOCALITY = new SelectHostResult(ScheduleResult.DELAYED_LOCALITY); private static final SelectHostResult SELECT_HOST_RESULT_DELAYED_RESOURCES = new SelectHostResult(ScheduleResult.DELAYED_RESOURCES); private static final class NodeBlacklistConf { private final long minDelay; private final long maxDelay; private final float backoffFactor; public NodeBlacklistConf(long minDelay, long maxDelay, float backoffFactor) { this.minDelay = minDelay; this.maxDelay = maxDelay; this.backoffFactor = backoffFactor; } @Override public String toString() { return "NodeBlacklistConf{" + "minDelay=" + minDelay + ", maxDelay=" + maxDelay + ", backoffFactor=" + backoffFactor + '}'; } } @VisibleForTesting static final class LocalityDelayConf { private final long nodeLocalityDelay; public LocalityDelayConf(long nodeLocalityDelay) { this.nodeLocalityDelay = nodeLocalityDelay; } public long getNodeLocalityDelay() { return nodeLocalityDelay; } @Override public String toString() { return "LocalityDelayConf{" + "nodeLocalityDelay=" + nodeLocalityDelay + '}'; } } }