/** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.aurora.scheduler.scheduling; import java.lang.annotation.Retention; import java.lang.annotation.Target; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import javax.inject.Inject; import javax.inject.Qualifier; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; import com.google.common.eventbus.Subscribe; import org.apache.aurora.common.inject.TimedInterceptor.Timed; import org.apache.aurora.common.stats.Stats; import org.apache.aurora.scheduler.base.Query; import org.apache.aurora.scheduler.base.TaskGroupKey; import org.apache.aurora.scheduler.configuration.executor.ExecutorSettings; import org.apache.aurora.scheduler.events.PubsubEvent.EventSubscriber; import org.apache.aurora.scheduler.events.PubsubEvent.TaskStateChange; import org.apache.aurora.scheduler.filter.AttributeAggregate; import org.apache.aurora.scheduler.filter.SchedulingFilter.ResourceRequest; import org.apache.aurora.scheduler.preemptor.BiCache; import org.apache.aurora.scheduler.preemptor.Preemptor; import org.apache.aurora.scheduler.resources.ResourceBag; import org.apache.aurora.scheduler.state.TaskAssigner; import org.apache.aurora.scheduler.storage.Storage.MutableStoreProvider; import org.apache.aurora.scheduler.storage.entities.IAssignedTask; import org.apache.aurora.scheduler.storage.entities.IScheduledTask; import org.apache.aurora.scheduler.storage.entities.ITaskConfig; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static java.lang.annotation.ElementType.FIELD; import static java.lang.annotation.ElementType.METHOD; import static java.lang.annotation.ElementType.PARAMETER; import static java.lang.annotation.RetentionPolicy.RUNTIME; import static java.util.Objects.requireNonNull; import static java.util.stream.Collectors.toMap; import static org.apache.aurora.gen.ScheduleStatus.PENDING; import static org.apache.aurora.scheduler.resources.ResourceManager.bagFromResources; /** * Enables scheduling and preemption of tasks. */ public interface TaskScheduler extends EventSubscriber { /** * Attempts to schedule a task, possibly performing irreversible actions. * * @param storeProvider {@code MutableStoreProvider} instance to access data store. * @param taskIds The tasks to attempt to schedule. * @return Successfully scheduled task IDs. The caller should call schedule again if a given * task ID was not present in the result. */ Set<String> schedule(MutableStoreProvider storeProvider, Iterable<String> taskIds); /** * An asynchronous task scheduler. Scheduling of tasks is performed on a delay, where each task * backs off after a failed scheduling attempt. * <p> * Pending tasks are advertised to the scheduler via internal pubsub notifications. */ class TaskSchedulerImpl implements TaskScheduler { /** * Binding annotation for the time duration of reservations. */ @VisibleForTesting @Qualifier @Target({ FIELD, PARAMETER, METHOD }) @Retention(RUNTIME) public @interface ReservationDuration { } private static final Logger LOG = LoggerFactory.getLogger(TaskSchedulerImpl.class); private final TaskAssigner assigner; private final Preemptor preemptor; private final ExecutorSettings executorSettings; private final BiCache<String, TaskGroupKey> reservations; private final AtomicLong attemptsFired = Stats.exportLong("schedule_attempts_fired"); private final AtomicLong attemptsFailed = Stats.exportLong("schedule_attempts_failed"); private final AtomicLong attemptsNoMatch = Stats.exportLong("schedule_attempts_no_match"); @Inject TaskSchedulerImpl( TaskAssigner assigner, Preemptor preemptor, ExecutorSettings executorSettings, BiCache<String, TaskGroupKey> reservations) { this.assigner = requireNonNull(assigner); this.preemptor = requireNonNull(preemptor); this.executorSettings = requireNonNull(executorSettings); this.reservations = requireNonNull(reservations); } @Timed ("task_schedule_attempt") public Set<String> schedule(MutableStoreProvider store, Iterable<String> taskIds) { try { return scheduleTasks(store, taskIds); } catch (RuntimeException e) { // We catch the generic unchecked exception here to ensure tasks are not abandoned // if there is a transient issue resulting in an unchecked exception. LOG.warn("Task scheduling unexpectedly failed, will be retried", e); attemptsFailed.incrementAndGet(); // Return empty set for all task IDs to be retried later. // It's ok if some tasks were already assigned, those will be ignored in the next round. return ImmutableSet.of(); } } private Set<String> scheduleTasks(MutableStoreProvider store, Iterable<String> tasks) { ImmutableSet<String> taskIds = ImmutableSet.copyOf(tasks); String taskIdValues = Joiner.on(",").join(taskIds); LOG.debug("Attempting to schedule tasks {}", taskIdValues); ImmutableSet<IAssignedTask> assignedTasks = ImmutableSet.copyOf(Iterables.transform( store.getTaskStore().fetchTasks(Query.taskScoped(taskIds).byStatus(PENDING)), IScheduledTask::getAssignedTask)); if (Iterables.isEmpty(assignedTasks)) { LOG.warn("Failed to look up all tasks in a scheduling round: {}", taskIdValues); return taskIds; } Preconditions.checkState( assignedTasks.stream() .collect(Collectors.groupingBy(t -> t.getTask())) .entrySet() .size() == 1, "Found multiple task groups for %s", taskIdValues); Map<String, IAssignedTask> assignableTaskMap = assignedTasks.stream().collect(toMap(t -> t.getTaskId(), t -> t)); if (taskIds.size() != assignedTasks.size()) { LOG.warn("Failed to look up tasks " + Joiner.on(", ").join(Sets.difference(taskIds, assignableTaskMap.keySet()))); } // This is safe after all checks above. ITaskConfig task = assignedTasks.stream().findFirst().get().getTask(); AttributeAggregate aggregate = AttributeAggregate.getJobActiveState(store, task.getJob()); // Valid Docker tasks can have a container but no executor config ResourceBag overhead = ResourceBag.EMPTY; if (task.isSetExecutorConfig()) { overhead = executorSettings.getExecutorOverhead(task.getExecutorConfig().getName()) .orElseThrow( () -> new IllegalArgumentException("Cannot find executor configuration")); } Set<String> launched = assigner.maybeAssign( store, new ResourceRequest( task, bagFromResources(task.getResources()).add(overhead), aggregate), TaskGroupKey.from(task), assignedTasks, reservations.asMap()); attemptsFired.addAndGet(assignableTaskMap.size()); Set<String> failedToLaunch = Sets.difference(assignableTaskMap.keySet(), launched); failedToLaunch.forEach(taskId -> { // Task could not be scheduled. // TODO(maxim): Now that preemption slots are searched asynchronously, consider // retrying a launch attempt within the current scheduling round IFF a reservation is // available. maybePreemptFor(assignableTaskMap.get(taskId), aggregate, store); }); attemptsNoMatch.addAndGet(failedToLaunch.size()); // Return all successfully launched tasks as well as those weren't tried (not in PENDING). return Sets.union(launched, Sets.difference(taskIds, assignableTaskMap.keySet())); } private void maybePreemptFor( IAssignedTask task, AttributeAggregate jobState, MutableStoreProvider storeProvider) { if (!reservations.getByValue(TaskGroupKey.from(task.getTask())).isEmpty()) { return; } Optional<String> slaveId = preemptor.attemptPreemptionFor(task, jobState, storeProvider); if (slaveId.isPresent()) { reservations.put(slaveId.get(), TaskGroupKey.from(task.getTask())); } } @Subscribe public void taskChanged(final TaskStateChange stateChangeEvent) { if (Optional.of(PENDING).equals(stateChangeEvent.getOldState())) { IAssignedTask assigned = stateChangeEvent.getTask().getAssignedTask(); if (assigned.getSlaveId() != null) { reservations.remove(assigned.getSlaveId(), TaskGroupKey.from(assigned.getTask())); } } } } }