/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * <p/> * http://www.apache.org/licenses/LICENSE-2.0 * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.llap.daemon.impl; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.text.SimpleDateFormat; import java.util.Comparator; import java.util.Date; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.lang3.exception.ExceptionUtils; import org.apache.hadoop.hive.llap.daemon.FinishableStateUpdateHandler; import org.apache.hadoop.hive.llap.daemon.SchedulerFragmentCompletingListener; import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.FragmentRuntimeInfo; import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SignableVertexSpec; import org.apache.hadoop.hive.llap.metrics.LlapDaemonExecutorMetrics; import org.apache.hadoop.hive.llap.tezplugins.helpers.MonotonicClock; import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.yarn.util.Clock; import org.apache.tez.runtime.task.EndReason; import org.apache.tez.runtime.task.TaskRunner2Result; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.annotations.VisibleForTesting; import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; /** * Task executor service provides method for scheduling tasks. Tasks submitted to executor service * are submitted to wait queue for scheduling. Wait queue tasks are ordered based on the priority * of the task. The internal wait queue scheduler moves tasks from wait queue when executor slots * are available or when a higher priority task arrives and will schedule it for execution. * When pre-emption is enabled, the tasks from wait queue can replace(pre-empt) a running task. * The pre-empted task is reported back to the Application Master(AM) for it to be rescheduled. * <p/> * Because of the concurrent nature of task submission, the position of the task in wait queue is * held as long the scheduling of the task from wait queue (with or without pre-emption) is complete. * The order of pre-emption is based on the ordering in the pre-emption queue. All tasks that cannot * run to completion immediately (canFinish = false) are added to pre-emption queue. * <p/> * When all the executor threads are occupied and wait queue is full, the task scheduler will * return SubmissionState.REJECTED response * <p/> * Task executor service can be shut down which will terminated all running tasks and reject all * new tasks. Shutting down of the task executor service can be done gracefully or immediately. */ public class TaskExecutorService extends AbstractService implements Scheduler<TaskRunnerCallable>, SchedulerFragmentCompletingListener { private static final Logger LOG = LoggerFactory.getLogger(TaskExecutorService.class); private static final boolean isInfoEnabled = LOG.isInfoEnabled(); private static final boolean isDebugEnabled = LOG.isDebugEnabled(); private static final String TASK_EXECUTOR_THREAD_NAME_FORMAT = "Task-Executor-%d"; private static final String WAIT_QUEUE_SCHEDULER_THREAD_NAME_FORMAT = "Wait-Queue-Scheduler-%d"; private static final long PREEMPTION_KILL_GRACE_MS = 500; // 500ms private static final int PREEMPTION_KILL_GRACE_SLEEP_MS = 50; // 50ms private final AtomicBoolean isShutdown = new AtomicBoolean(false); // Thread pool for actual execution of work. private final ListeningExecutorService executorService; @VisibleForTesting final EvictingPriorityBlockingQueue<TaskWrapper> waitQueue; // Thread pool for taking entities off the wait queue. private final ListeningExecutorService waitQueueExecutorService; // Thread pool for callbacks on completion of execution of a work unit. private final ListeningExecutorService executionCompletionExecutorService; @VisibleForTesting final BlockingQueue<TaskWrapper> preemptionQueue; private final boolean enablePreemption; private final ThreadPoolExecutor threadPoolExecutor; private final AtomicInteger numSlotsAvailable; private final int maxParallelExecutors; private final Clock clock; // Tracks running fragments, and completing fragments. // Completing since we have a race in the AM being notified and the task actually // falling off, and the executor service being ready to schedule a new task. private final AtomicInteger runningFragmentCount = new AtomicInteger(0); @VisibleForTesting // Tracks known tasks. final ConcurrentMap<String, TaskWrapper> knownTasks = new ConcurrentHashMap<>(); private final Object lock = new Object(); private final LlapDaemonExecutorMetrics metrics; public TaskExecutorService(int numExecutors, int waitQueueSize, String waitQueueComparatorClassName, boolean enablePreemption, ClassLoader classLoader, final LlapDaemonExecutorMetrics metrics, Clock clock) { super(TaskExecutorService.class.getSimpleName()); LOG.info("TaskExecutorService is being setup with parameters: " + "numExecutors=" + numExecutors + ", waitQueueSize=" + waitQueueSize + ", waitQueueComparatorClassName=" + waitQueueComparatorClassName + ", enablePreemption=" + enablePreemption); final Comparator<TaskWrapper> waitQueueComparator = createComparator( waitQueueComparatorClassName); this.maxParallelExecutors = numExecutors; this.waitQueue = new EvictingPriorityBlockingQueue<>(waitQueueComparator, waitQueueSize); this.clock = clock == null ? new MonotonicClock() : clock; this.threadPoolExecutor = new ThreadPoolExecutor(numExecutors, // core pool size numExecutors, // max pool size 1, TimeUnit.MINUTES, new SynchronousQueue<Runnable>(), // direct hand-off new ExecutorThreadFactory(classLoader)); this.executorService = MoreExecutors.listeningDecorator(threadPoolExecutor); this.preemptionQueue = new PriorityBlockingQueue<>(numExecutors, new PreemptionQueueComparator()); this.enablePreemption = enablePreemption; this.numSlotsAvailable = new AtomicInteger(numExecutors); this.metrics = metrics; if (metrics != null) { metrics.setNumExecutorsAvailable(numSlotsAvailable.get()); } // single threaded scheduler for tasks from wait queue to executor threads ExecutorService wes = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder() .setDaemon(true).setNameFormat(WAIT_QUEUE_SCHEDULER_THREAD_NAME_FORMAT).build()); this.waitQueueExecutorService = MoreExecutors.listeningDecorator(wes); ExecutorService executionCompletionExecutorServiceRaw = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ExecutionCompletionThread #%d") .build()); executionCompletionExecutorService = MoreExecutors.listeningDecorator( executionCompletionExecutorServiceRaw); ListenableFuture<?> future = waitQueueExecutorService.submit(new WaitQueueWorker()); Futures.addCallback(future, new WaitQueueWorkerCallback()); } private Comparator<TaskWrapper> createComparator( String waitQueueComparatorClassName) { final Comparator<TaskWrapper> waitQueueComparator; try { Class<? extends Comparator> waitQueueComparatorClazz = (Class<? extends Comparator>) Class.forName(waitQueueComparatorClassName); Constructor<? extends Comparator> ctor = waitQueueComparatorClazz.getConstructor(null); waitQueueComparator = ctor.newInstance(null); } catch (ClassNotFoundException e) { throw new RuntimeException( "Failed to load wait queue comparator, class=" + waitQueueComparatorClassName, e); } catch (NoSuchMethodException e) { throw new RuntimeException("Failed to find constructor for wait queue comparator, class=" + waitQueueComparatorClassName, e); } catch (InvocationTargetException | InstantiationException | IllegalAccessException e) { throw new RuntimeException("Failed to find instantiate wait queue comparator, class=" + waitQueueComparatorClassName, e); } return waitQueueComparator; } @Override public void serviceStop() { shutDown(false); } private static final ThreadLocal<SimpleDateFormat> sdf = new ThreadLocal<SimpleDateFormat>() { @Override protected SimpleDateFormat initialValue() { return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); } }; @Override public int getNumActive() { int result = 0; for (Map.Entry<String, TaskWrapper> e : knownTasks.entrySet()) { TaskWrapper task = e.getValue(); if (task.isInWaitQueue()) continue; TaskRunnerCallable c = task.getTaskRunnerCallable(); // Count the tasks in intermediate state as waiting. if (c == null || c.getStartTime() == 0) continue; ++result; } return result; } @Override public Set<String> getExecutorsStatus() { // TODO Change this method to make the output easier to parse (parse programmatically) Set<String> result = new LinkedHashSet<>(); Set<String> running = new LinkedHashSet<>(); Set<String> waiting = new LinkedHashSet<>(); StringBuilder value = new StringBuilder(); for (Map.Entry<String, TaskWrapper> e : knownTasks.entrySet()) { boolean isWaiting; value.setLength(0); value.append(e.getKey()); TaskWrapper task = e.getValue(); boolean isFirst = true; TaskRunnerCallable c = task.getTaskRunnerCallable(); if (c != null && c.getVertexSpec() != null) { SignableVertexSpec fs = c.getVertexSpec(); value.append(isFirst ? " (" : ", ").append(c.getQueryId()) .append("/").append(fs.getVertexName()); isFirst = false; } value.append(isFirst ? " (" : ", "); if (task.isInWaitQueue()) { isWaiting = true; value.append("in queue"); } else if (c != null) { long startTime = c.getStartTime(); if (startTime != 0) { isWaiting = false; value.append("started at ").append(sdf.get().format(new Date(startTime))); } else { isWaiting = false; value.append("not started"); } } else { isWaiting = true; value.append("has no callable"); } if (task.isInPreemptionQueue()) { value.append(", ").append("preemptable"); } value.append(")"); if (isWaiting) { waiting.add(value.toString()); } else { running.add(value.toString()); } } result.addAll(waiting); result.addAll(running); return result; } /** * Worker that takes tasks from wait queue and schedule it for execution. */ private final class WaitQueueWorker implements Runnable { private TaskWrapper task; @Override public void run() { try { Long lastKillTimeMs = null; while (!isShutdown.get()) { RejectedExecutionException rejectedException = null; synchronized (lock) { // Since schedule() can be called from multiple threads, we peek the wait queue, try // scheduling the task and then remove the task if scheduling is successful. This // will make sure the task's place in the wait queue is held until it gets scheduled. task = waitQueue.peek(); if (task == null) { if (!isShutdown.get()) { lock.wait(); } continue; } // If the task cannot finish and if no slots are available then don't schedule it. // Also don't wait if we have a task and we just killed something to schedule it. // (numSlotsAvailable can go negative, if the callback after the thread completes is delayed) boolean shouldWait = numSlotsAvailable.get() <= 0 && lastKillTimeMs == null; if (task.getTaskRunnerCallable().canFinish()) { if (isDebugEnabled) { LOG.debug("Attempting to schedule task {}, canFinish={}. Current state: " + "preemptionQueueSize={}, numSlotsAvailable={}, waitQueueSize={}", task.getRequestId(), task.getTaskRunnerCallable().canFinish(), preemptionQueue.size(), numSlotsAvailable.get(), waitQueue.size()); } shouldWait = shouldWait && (enablePreemption == false || preemptionQueue.isEmpty()); } if (shouldWait) { if (!isShutdown.get()) { lock.wait(); } // Another task at a higher priority may have come in during the wait. Lookup the // queue again to pick up the task at the highest priority. continue; } try { tryScheduleUnderLock(task); // Wait queue could have been re-ordered in the mean time because of concurrent task // submission. So remove the specific task instead of the head task. if (waitQueue.remove(task)) { if (metrics != null) { metrics.setExecutorNumQueuedRequests(waitQueue.size()); } } lastKillTimeMs = null; // We have filled the spot we may have killed for (if any). } catch (RejectedExecutionException e) { rejectedException = e; } } // synchronized (lock) // Handle the rejection outside of the lock if (rejectedException != null) { if (lastKillTimeMs != null && (clock.getTime() - lastKillTimeMs) < PREEMPTION_KILL_GRACE_MS) { // We killed something, but still got rejected. Wait a bit to give a chance to our // previous victim to actually die. synchronized (lock) { lock.wait(PREEMPTION_KILL_GRACE_SLEEP_MS); } } else { if (isDebugEnabled && lastKillTimeMs != null) { LOG.debug("Grace period ended for the previous kill; preemtping more tasks"); } if (handleScheduleAttemptedRejection(task)) { lastKillTimeMs = clock.getTime(); // We killed something. } } } } } catch (InterruptedException e) { if (isShutdown.get()) { LOG.info(WAIT_QUEUE_SCHEDULER_THREAD_NAME_FORMAT + " thread has been interrupted after shutdown."); } else { LOG.warn(WAIT_QUEUE_SCHEDULER_THREAD_NAME_FORMAT + " interrupted without shutdown", e); throw new RuntimeException(e); } } } } private class WaitQueueWorkerCallback implements FutureCallback { @Override public void onSuccess(Object result) { if (isShutdown.get()) { LOG.info("Wait queue scheduler worker exited with success!"); } else { LOG.error("Wait queue scheduler worker exited with success!"); Thread.getDefaultUncaughtExceptionHandler().uncaughtException(Thread.currentThread(), new IllegalStateException("WaitQueue worked exited before shutdown")); } } @Override public void onFailure(Throwable t) { LOG.error("Wait queue scheduler worker exited with failure!", t); Thread.getDefaultUncaughtExceptionHandler().uncaughtException(Thread.currentThread(), t); } } @Override public SubmissionState schedule(TaskRunnerCallable task) { TaskWrapper taskWrapper = new TaskWrapper(task, this); SubmissionState result; TaskWrapper evictedTask; boolean canFinish; synchronized (lock) { // If the queue does not have capacity, it does not throw a Rejection. Instead it will // return the task with the lowest priority, which could be the task which is currently being processed. // TODO HIVE-11687 It's possible for a bunch of tasks to come in around the same time, without the // actual executor threads picking up any work. This will lead to unnecessary rejection of tasks. // The wait queue should be able to fit at least (waitQueue + currentFreeExecutor slots) if (LOG.isDebugEnabled()) { LOG.debug( "Offering to wait queue with: waitQueueSize={}, numSlotsAvailable={}, runningFragmentCount={} ", waitQueue.size(), numSlotsAvailable.get(), runningFragmentCount.get()); } canFinish = taskWrapper.getTaskRunnerCallable().canFinish(); evictedTask = waitQueue.offer(taskWrapper, maxParallelExecutors - runningFragmentCount.get()); // Finishable state is checked on the task, via an explicit query to the TaskRunnerCallable // null evicted task means offer accepted // evictedTask is not equal taskWrapper means current task is accepted and it evicted // some other task if (evictedTask == null || !evictedTask.equals(taskWrapper)) { knownTasks.put(taskWrapper.getRequestId(), taskWrapper); taskWrapper.setIsInWaitQueue(true); if (isDebugEnabled) { LOG.debug("{} added to wait queue. Current wait queue size={}", task.getRequestId(), waitQueue.size()); } result = evictedTask == null ? SubmissionState.ACCEPTED : SubmissionState.EVICTED_OTHER; if (isDebugEnabled && evictedTask != null) { LOG.debug("Eviction: {} {} {}", taskWrapper, result, evictedTask); } } else { if (isInfoEnabled) { LOG.info( "wait queue full, size={}. numSlotsAvailable={}, runningFragmentCount={}. {} not added", waitQueue.size(), numSlotsAvailable.get(), runningFragmentCount.get(), task.getRequestId()); } evictedTask.getTaskRunnerCallable().killTask(); result = SubmissionState.REJECTED; if (isDebugEnabled) { LOG.debug("{} is {} as wait queue is full", taskWrapper.getRequestId(), result); } if (metrics != null) { metrics.incrTotalRejectedRequests(); } return result; } // Register for notifications inside the lock. Should avoid races with unregisterForNotifications // happens in a different Submission thread. i.e. Avoid register running for this task // after some other submission has evicted it. boolean stateChanged = !taskWrapper.maybeRegisterForFinishedStateNotifications(canFinish); if (stateChanged) { if (isDebugEnabled) { LOG.debug("Finishable state of {} updated to {} during registration for state updates", taskWrapper.getRequestId(), !canFinish); } finishableStateUpdated(taskWrapper, !canFinish); } } // At this point, the task has been added into the queue. It may have caused an eviction for // some other task. // This registration has to be done after knownTasks has been populated. // Register for state change notifications so that the waitQueue can be re-ordered correctly // if the fragment moves in or out of the finishable state. if (isDebugEnabled) { LOG.debug("Wait Queue: {}", waitQueue); } if (evictedTask != null) { if (isInfoEnabled) { LOG.info("{} evicted from wait queue in favor of {} because of lower priority", evictedTask.getRequestId(), task.getRequestId()); } try { knownTasks.remove(evictedTask.getRequestId()); evictedTask.maybeUnregisterForFinishedStateNotifications(); evictedTask.setIsInWaitQueue(false); } finally { // This is dealing with tasks from a different submission, and cause the kill // to go out before the previous submissions has completed. Handled in the AM evictedTask.getTaskRunnerCallable().killTask(); } if (metrics != null) { metrics.incrTotalEvictedFromWaitQueue(); } } synchronized (lock) { lock.notifyAll(); } if (metrics != null) { metrics.setExecutorNumQueuedRequests(waitQueue.size()); } return result; } @Override public QueryIdentifier findQueryByFragment(String fragmentId) { synchronized (lock) { TaskWrapper taskWrapper = knownTasks.get(fragmentId); return taskWrapper == null ? null : taskWrapper.getTaskRunnerCallable() .getFragmentInfo().getQueryInfo().getQueryIdentifier(); } } @Override public void killFragment(String fragmentId) { synchronized (lock) { TaskWrapper taskWrapper = knownTasks.remove(fragmentId); // Can be null since the task may have completed meanwhile. if (taskWrapper != null) { if (taskWrapper.isInWaitQueue()) { if (isDebugEnabled) { LOG.debug("Removing {} from waitQueue", fragmentId); } taskWrapper.setIsInWaitQueue(false); if (waitQueue.remove(taskWrapper)) { if (metrics != null) { metrics.setExecutorNumQueuedRequests(waitQueue.size()); } } } if (taskWrapper.isInPreemptionQueue()) { if (isDebugEnabled) { LOG.debug("Removing {} from preemptionQueue", fragmentId); } removeFromPreemptionQueue(taskWrapper); } taskWrapper.getTaskRunnerCallable().killTask(); } else { LOG.info("Ignoring killFragment request for {} since it isn't known", fragmentId); } lock.notifyAll(); } } private static final class FragmentCompletion { public FragmentCompletion( State state, long completingTime) { this.state = state; this.completingTime = completingTime; } State state; long completingTime; } @VisibleForTesting final ConcurrentMap<String, FragmentCompletion> completingFragmentMap = new ConcurrentHashMap<>(); @Override public void fragmentCompleting(String fragmentId, State state) { int count = runningFragmentCount.decrementAndGet(); if (count < 0) { LOG.warn( "RunningFragmentCount went negative. Multiple calls for the same completion. Resetting to 0"); runningFragmentCount.set(0); } completingFragmentMap .put(fragmentId, new FragmentCompletion(state, clock.getTime())); } @VisibleForTesting /** Assumes the epic lock is already taken. */ void tryScheduleUnderLock(final TaskWrapper taskWrapper) throws RejectedExecutionException { if (isInfoEnabled) { LOG.info("Attempting to execute {}", taskWrapper); } ListenableFuture<TaskRunner2Result> future = executorService.submit( taskWrapper.getTaskRunnerCallable()); runningFragmentCount.incrementAndGet(); taskWrapper.setIsInWaitQueue(false); FutureCallback<TaskRunner2Result> wrappedCallback = createInternalCompletionListener( taskWrapper); // Callback on a separate thread so that when a task completes, the thread in the main queue // is actually available for execution and will not potentially result in a RejectedExecution Futures.addCallback(future, wrappedCallback, executionCompletionExecutorService); boolean canFinish = taskWrapper.getTaskRunnerCallable().canFinish(); if (isDebugEnabled) { LOG.debug("{} scheduled for execution. canFinish={}", taskWrapper.getRequestId(), canFinish); } // only tasks that cannot finish immediately are pre-emptable. In other words, if all inputs // to the tasks are not ready yet, the task is eligible for pre-emptable. if (enablePreemption) { if (!canFinish) { if (isInfoEnabled) { LOG.info("{} is not finishable. Adding it to pre-emption queue", taskWrapper.getRequestId()); } addToPreemptionQueue(taskWrapper); } } numSlotsAvailable.decrementAndGet(); if (metrics != null) { metrics.setNumExecutorsAvailable(numSlotsAvailable.get()); } } private boolean handleScheduleAttemptedRejection(TaskWrapper taskWrapper) { if (enablePreemption && taskWrapper.getTaskRunnerCallable().canFinish() && !preemptionQueue.isEmpty()) { if (isDebugEnabled) { LOG.debug("Preemption Queue: " + preemptionQueue); } while (true) { // Try to preempt until we have something. TaskWrapper pRequest = removeAndGetNextFromPreemptionQueue(); if (pRequest == null) { return false; // Woe us. } if (pRequest.getTaskRunnerCallable().canFinish()) { LOG.info("Removed {} from preemption queue, but not preempting since it's now finishable", pRequest.getRequestId()); continue; // Try something else. } if (isInfoEnabled) { LOG.info("Invoking kill task for {} due to pre-emption to run {}", pRequest.getRequestId(), taskWrapper.getRequestId()); } // The task will either be killed or is already in the process of completing, which will // trigger the next scheduling run, or result in available slots being higher than 0, // which will cause the scheduler loop to continue. pRequest.getTaskRunnerCallable().killTask(); // We've killed something and may want to wait for it to die. return true; } } return false; } private void finishableStateUpdated(TaskWrapper taskWrapper, boolean newFinishableState) { synchronized (lock) { if (taskWrapper.isInWaitQueue()) { // Re-order the wait queue LOG.debug("Re-ordering the wait queue since {} finishable state moved to {}", taskWrapper.getRequestId(), newFinishableState); boolean reInserted = waitQueue.reinsertIfExists(taskWrapper); if (!reInserted) { LOG.warn("Failed to remove {} from waitQueue", taskWrapper.getTaskRunnerCallable().getRequestId()); } } if (newFinishableState == true && taskWrapper.isInPreemptionQueue()) { LOG.debug("Removing {} from preemption queue because it's state changed to {}", taskWrapper.getRequestId(), newFinishableState); removeFromPreemptionQueue(taskWrapper); } else if (newFinishableState == false && !taskWrapper.isInPreemptionQueue() && !taskWrapper.isInWaitQueue()) { LOG.debug("Adding {} to preemption queue since finishable state changed to {}", taskWrapper.getRequestId(), newFinishableState); addToPreemptionQueue(taskWrapper); } lock.notifyAll(); } } private void addToPreemptionQueue(TaskWrapper taskWrapper) { synchronized (lock) { boolean added = preemptionQueue.offer(taskWrapper); if (!added) { LOG.warn("Failed to add element {} to preemption queue. Terminating", taskWrapper); Thread.getDefaultUncaughtExceptionHandler().uncaughtException(Thread.currentThread(), new IllegalStateException("Preemption queue full. Cannot proceed")); } taskWrapper.setIsInPreemptableQueue(true); if (metrics != null) { metrics.setExecutorNumPreemptableRequests(preemptionQueue.size()); } } } /** * Remove the specified taskWrapper from the preemption queue * @param taskWrapper the taskWrapper to be removed * @return true if the element existed in the queue and wasa removed, false otherwise */ private boolean removeFromPreemptionQueue(TaskWrapper taskWrapper) { synchronized (lock) { return removeFromPreemptionQueueUnlocked(taskWrapper); } } private boolean removeFromPreemptionQueueUnlocked(TaskWrapper taskWrapper) { boolean removed = preemptionQueue.remove(taskWrapper); taskWrapper.setIsInPreemptableQueue(false); if (metrics != null) { metrics.setExecutorNumPreemptableRequests(preemptionQueue.size()); } return removed; } private TaskWrapper removeAndGetNextFromPreemptionQueue() { TaskWrapper taskWrapper; synchronized (lock) { taskWrapper = preemptionQueue.poll(); if (taskWrapper != null) { taskWrapper.setIsInPreemptableQueue(false); if (metrics != null) { metrics.setExecutorNumPreemptableRequests(preemptionQueue.size()); } } } return taskWrapper; } @VisibleForTesting InternalCompletionListener createInternalCompletionListener(TaskWrapper taskWrapper) { return new InternalCompletionListener(taskWrapper); } @VisibleForTesting class InternalCompletionListener implements FutureCallback<TaskRunner2Result> { private final TaskWrapper taskWrapper; public InternalCompletionListener(TaskWrapper taskWrapper) { this.taskWrapper = taskWrapper; } // By the time either success / failed are called, the task itself knows that it has terminated, // and will ignore subsequent kill requests if they go out. // There's a race between removing the current task from the preemption queue and the actual scheduler // attempting to take an element from the preemption queue to make space for another task. // If the current element is removed to make space - that is OK, since the current task is completing and // will end up making space for execution. Any kill message sent out by the scheduler to the task will // be ignored, since the task knows it has completed (otherwise it would not be in this callback). // // If the task is removed from the queue as a result of this callback, and the scheduler happens to // be in the section where it's looking for a preemptible task - the scheuler may end up pulling the // next pre-emptible task and killing it (an extra preemption). // TODO: This potential extra preemption can be avoided by synchronizing the entire tryScheduling block.\ // This would essentially synchronize all operations - it would be better to see if there's an // approach where multiple locks could be used to avoid single threaded operation. // - It checks available and preempts (which could be this task) // - Or this task completes making space, and removing the need for preemption @Override public void onSuccess(TaskRunner2Result result) { if (LOG.isDebugEnabled()) { LOG.debug("Received successful completion for: {}", taskWrapper.getRequestId()); } updateFallOffStats(taskWrapper.getRequestId()); knownTasks.remove(taskWrapper.getRequestId()); taskWrapper.setIsInPreemptableQueue(false); taskWrapper.maybeUnregisterForFinishedStateNotifications(); updatePreemptionListAndNotify(result.getEndReason()); taskWrapper.getTaskRunnerCallable().getCallback().onSuccess(result); } @Override public void onFailure(Throwable t) { if (LOG.isDebugEnabled()) { LOG.debug("Received failed completion for: {}", taskWrapper.getRequestId()); } updateFallOffStats(taskWrapper.getRequestId()); knownTasks.remove(taskWrapper.getRequestId()); taskWrapper.setIsInPreemptableQueue(false); taskWrapper.maybeUnregisterForFinishedStateNotifications(); updatePreemptionListAndNotify(null); taskWrapper.getTaskRunnerCallable().getCallback().onFailure(t); LOG.error("Failed notification received: Stacktrace: " + ExceptionUtils.getStackTrace(t)); } private void updatePreemptionListAndNotify(EndReason reason) { // if this task was added to pre-emption list, remove it if (enablePreemption) { String state = reason == null ? "FAILED" : reason.name(); boolean removed = removeFromPreemptionQueueUnlocked(taskWrapper); if (removed && isInfoEnabled) { TaskRunnerCallable trc = taskWrapper.getTaskRunnerCallable(); LOG.info(TaskRunnerCallable.getTaskIdentifierString(trc.getRequest(), trc.getVertexSpec(), trc.getQueryId()) + " request " + state + "! Removed from preemption list."); } } numSlotsAvailable.incrementAndGet(); if (metrics != null) { metrics.setNumExecutorsAvailable(numSlotsAvailable.get()); } if (isDebugEnabled) { LOG.debug("Task {} complete. WaitQueueSize={}, numSlotsAvailable={}, preemptionQueueSize={}", taskWrapper.getRequestId(), waitQueue.size(), numSlotsAvailable.get(), preemptionQueue.size()); } synchronized (lock) { if (!waitQueue.isEmpty()) { lock.notifyAll(); } } } private void updateFallOffStats( String requestId) { long now = clock.getTime(); FragmentCompletion fragmentCompletion = completingFragmentMap.remove(requestId); if (fragmentCompletion == null) { LOG.warn( "Received onSuccess/onFailure for a fragment for which a completing message was not received: {}", requestId); // Happens due to AM side pre-emption, or the AM asking for a task to die. // There's no hooks at the moment to get information over. // For now - decrement the count to avoid accounting errors. runningFragmentCount.decrementAndGet(); // TODO: Extend TaskRunner2 or see if an API with callbacks will work } else { long timeTaken = now - fragmentCompletion.completingTime; switch (fragmentCompletion.state) { case SUCCESS: if (metrics != null) { metrics.addMetricsFallOffSuccessTimeLost(timeTaken); } break; case FAILED: if (metrics != null) { metrics.addMetricsFallOffFailedTimeLost(timeTaken); } break; case KILLED: if (metrics != null) { metrics.addMetricsFallOffKilledTimeLost(timeTaken); } break; } } } } public void shutDown(boolean awaitTermination) { if (!isShutdown.getAndSet(true)) { if (awaitTermination) { if (isDebugEnabled) { LOG.debug("awaitTermination: " + awaitTermination + " shutting down task executor" + " service gracefully"); } shutdownExecutor(waitQueueExecutorService); shutdownExecutor(executorService); shutdownExecutor(executionCompletionExecutorService); } else { if (isDebugEnabled) { LOG.debug("awaitTermination: " + awaitTermination + " shutting down task executor" + " service immediately"); } executorService.shutdownNow(); waitQueueExecutorService.shutdownNow(); executionCompletionExecutorService.shutdownNow(); } } } private void shutdownExecutor(ExecutorService executorService) { executorService.shutdown(); try { if (!executorService.awaitTermination(1, TimeUnit.MINUTES)) { executorService.shutdownNow(); } } catch (InterruptedException e) { executorService.shutdownNow(); } } @VisibleForTesting public static class PreemptionQueueComparator implements Comparator<TaskWrapper> { @Override public int compare(TaskWrapper t1, TaskWrapper t2) { TaskRunnerCallable o1 = t1.getTaskRunnerCallable(); TaskRunnerCallable o2 = t2.getTaskRunnerCallable(); FragmentRuntimeInfo fri1 = o1.getFragmentRuntimeInfo(); FragmentRuntimeInfo fri2 = o2.getFragmentRuntimeInfo(); if (fri1.getNumSelfAndUpstreamTasks() > fri2.getNumSelfAndUpstreamTasks()) { return 1; } else if (fri1.getNumSelfAndUpstreamTasks() < fri2.getNumSelfAndUpstreamTasks()) { return -1; } return 0; } } public static class TaskWrapper implements FinishableStateUpdateHandler { private final TaskRunnerCallable taskRunnerCallable; private final AtomicBoolean inWaitQueue = new AtomicBoolean(false); private final AtomicBoolean inPreemptionQueue = new AtomicBoolean(false); private final AtomicBoolean registeredForNotifications = new AtomicBoolean(false); private final TaskExecutorService taskExecutorService; public TaskWrapper(TaskRunnerCallable taskRunnerCallable, TaskExecutorService taskExecutorService) { this.taskRunnerCallable = taskRunnerCallable; this.taskExecutorService = taskExecutorService; } // Don't invoke from within a scheduler lock /** * * @param currentFinishableState * @return true if the state has not changed from currentFinishableState, false otherwise */ public boolean maybeRegisterForFinishedStateNotifications( boolean currentFinishableState) { if (!registeredForNotifications.getAndSet(true)) { return taskRunnerCallable.getFragmentInfo() .registerForFinishableStateUpdates(this, currentFinishableState); } else { // State has not changed / already registered for notifications. return true; } } // Don't invoke from within a scheduler lock public void maybeUnregisterForFinishedStateNotifications() { if (registeredForNotifications.getAndSet(false)) { taskRunnerCallable.getFragmentInfo().unregisterForFinishableStateUpdates(this); } } public TaskRunnerCallable getTaskRunnerCallable() { return taskRunnerCallable; } public boolean isInWaitQueue() { return inWaitQueue.get(); } public boolean isInPreemptionQueue() { return inPreemptionQueue.get(); } public void setIsInWaitQueue(boolean value) { this.inWaitQueue.set(value); } public void setIsInPreemptableQueue(boolean value) { this.inPreemptionQueue.set(value); } public String getRequestId() { return taskRunnerCallable.getRequestId(); } @Override public String toString() { return "TaskWrapper{" + "task=" + taskRunnerCallable.getRequestId() + ", inWaitQueue=" + inWaitQueue.get() + ", inPreemptionQueue=" + inPreemptionQueue.get() + ", registeredForNotifications=" + registeredForNotifications.get() + ", canFinish=" + taskRunnerCallable.canFinish() + ", firstAttemptStartTime=" + taskRunnerCallable.getFragmentRuntimeInfo().getFirstAttemptStartTime() + ", dagStartTime=" + taskRunnerCallable.getFragmentRuntimeInfo().getDagStartTime() + ", withinDagPriority=" + taskRunnerCallable.getFragmentRuntimeInfo().getWithinDagPriority() + ", vertexParallelism= " + taskRunnerCallable.getVertexSpec().getVertexParallelism() + ", selfAndUpstreamParallelism= " + taskRunnerCallable.getFragmentRuntimeInfo().getNumSelfAndUpstreamTasks() + ", selfAndUpstreamComplete= " + taskRunnerCallable.getFragmentRuntimeInfo().getNumSelfAndUpstreamCompletedTasks() + '}'; } // No task lock. But acquires lock on the scheduler @Override public void finishableStateUpdated(boolean finishableState) { // This method should not by synchronized. Can lead to deadlocks since it calls a sync method. // Meanwhile the scheduler could try updating states via a synchronized method. LOG.info("Received finishable state update for {}, state={}", taskRunnerCallable.getRequestId(), finishableState); taskExecutorService.finishableStateUpdated(this, finishableState); } // TaskWrapper is used in structures, as well as for ordering using Comparators // in the waitQueue. Avoid Object comparison. @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } TaskWrapper that = (TaskWrapper) o; return taskRunnerCallable.getRequestId() .equals(that.taskRunnerCallable.getRequestId()); } @Override public int hashCode() { return taskRunnerCallable.getRequestId().hashCode(); } } private static class ExecutorThreadFactory implements ThreadFactory { private final ClassLoader classLoader; private final ThreadFactory defaultFactory; private final AtomicLong count = new AtomicLong(0); public ExecutorThreadFactory(ClassLoader classLoader) { this.classLoader = classLoader; this.defaultFactory = Executors.defaultThreadFactory(); } @Override public Thread newThread(Runnable r) { Thread thread = defaultFactory.newThread(r); thread.setName(String.format(TASK_EXECUTOR_THREAD_NAME_FORMAT, count.getAndIncrement())); thread.setDaemon(true); thread.setContextClassLoader(classLoader); return thread; } } }