/** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.aurora.scheduler.reconciliation; import java.util.EnumSet; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import javax.inject.Inject; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.google.common.eventbus.Subscribe; import com.google.common.util.concurrent.AbstractIdleService; import org.apache.aurora.common.quantity.Amount; import org.apache.aurora.common.quantity.Time; import org.apache.aurora.common.stats.StatsProvider; import org.apache.aurora.gen.ScheduleStatus; import org.apache.aurora.scheduler.async.AsyncModule.AsyncExecutor; import org.apache.aurora.scheduler.async.DelayExecutor; import org.apache.aurora.scheduler.events.PubsubEvent.EventSubscriber; import org.apache.aurora.scheduler.events.PubsubEvent.TaskStateChange; import org.apache.aurora.scheduler.state.StateChangeResult; import org.apache.aurora.scheduler.state.StateManager; import org.apache.aurora.scheduler.storage.Storage; import org.apache.aurora.scheduler.storage.entities.IScheduledTask; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static java.util.Objects.requireNonNull; /** * Observes task transitions and identifies tasks that are 'stuck' in a transient state. Stuck * tasks will be transitioned to the LOST state. */ class TaskTimeout extends AbstractIdleService implements EventSubscriber { private static final Logger LOG = LoggerFactory.getLogger(TaskTimeout.class); @VisibleForTesting static final Amount<Long, Time> NOT_STARTED_RETRY = Amount.of(5L, Time.SECONDS); @VisibleForTesting static final String TIMED_OUT_TASKS_COUNTER = "timed_out_tasks"; @VisibleForTesting static final Optional<String> TIMEOUT_MESSAGE = Optional.of("Task timed out"); @VisibleForTesting static final Set<ScheduleStatus> TRANSIENT_STATES = EnumSet.of( ScheduleStatus.ASSIGNED, ScheduleStatus.PREEMPTING, ScheduleStatus.RESTARTING, ScheduleStatus.KILLING, ScheduleStatus.DRAINING); private final DelayExecutor executor; private final Storage storage; private final StateManager stateManager; private final Amount<Long, Time> timeout; private final AtomicLong timedOutTasks; @Inject TaskTimeout( @AsyncExecutor DelayExecutor executor, Storage storage, StateManager stateManager, Amount<Long, Time> timeout, StatsProvider statsProvider) { this.executor = requireNonNull(executor); this.storage = requireNonNull(storage); this.stateManager = requireNonNull(stateManager); this.timeout = requireNonNull(timeout); this.timedOutTasks = statsProvider.makeCounter(TIMED_OUT_TASKS_COUNTER); } private static boolean isTransient(ScheduleStatus status) { return TRANSIENT_STATES.contains(status); } @Override protected void startUp() { // No work to do here for startup, however we leverage the state tracking in // AbstractIdleService. } @Override protected void shutDown() { // Nothing to do for shutting down. } private class TimedOutTaskHandler implements Runnable { private final String taskId; private final ScheduleStatus newState; TimedOutTaskHandler(String taskId, ScheduleStatus newState) { this.taskId = taskId; this.newState = newState; } @Override public void run() { if (isRunning()) { Optional<IScheduledTask> task = storage.read( storeProvider -> storeProvider.getTaskStore().fetchTask(taskId)); // Double-Checked Locking: acquire storage write lock only if necessary if (task.isPresent() && task.get().getStatus() == newState) { // This query acts as a CAS by including the state that we expect the task to be in // if the timeout is still valid. Ideally, the future would have already been // canceled, but in the event of a state transition race, including transientState // prevents an unintended task timeout. // Note: This requires LOST transitions trigger Driver.killTask. StateChangeResult result = storage.write(storeProvider -> stateManager.changeState( storeProvider, taskId, Optional.of(newState), ScheduleStatus.LOST, TIMEOUT_MESSAGE)); if (result == StateChangeResult.SUCCESS) { LOG.info("Timeout reached for task " + taskId + ":" + taskId); timedOutTasks.incrementAndGet(); } } } else { // Our service is not yet started. We don't want to lose track of the task, so // we will try again later. LOG.debug("Retrying timeout of task {} in {}", taskId, NOT_STARTED_RETRY); // TODO(wfarner): This execution should not wait for a transaction, but a second executor // would be weird. executor.execute(this, NOT_STARTED_RETRY); } } } @Subscribe public void recordStateChange(TaskStateChange change) { if (isTransient(change.getNewState())) { executor.execute( new TimedOutTaskHandler(change.getTaskId(), change.getNewState()), timeout); } } }