package com.hubspot.singularity.mesos; import java.util.concurrent.ConcurrentHashMap; import org.apache.mesos.Protos; import org.apache.mesos.Protos.TaskState; import org.apache.mesos.Protos.TaskStatus.Reason; import org.apache.mesos.SchedulerDriver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codahale.metrics.Meter; import com.codahale.metrics.annotation.Timed; import com.google.common.base.Optional; import com.google.common.base.Strings; import com.google.common.collect.Multiset; import com.google.inject.Inject; import com.google.inject.Provider; import com.google.inject.Singleton; import com.google.inject.name.Named; import com.hubspot.mesos.JavaUtils; import com.hubspot.singularity.ExtendedTaskState; import com.hubspot.singularity.InvalidSingularityTaskIdException; import com.hubspot.singularity.SingularityCreateResult; import com.hubspot.singularity.SingularityMainModule; import com.hubspot.singularity.SingularityPendingDeploy; import com.hubspot.singularity.SingularityRequestWithState; import com.hubspot.singularity.SingularityTask; import com.hubspot.singularity.SingularityTaskHistoryUpdate; import com.hubspot.singularity.SingularityTaskId; import com.hubspot.singularity.SingularityTaskStatusHolder; import com.hubspot.singularity.config.SingularityConfiguration; import com.hubspot.singularity.data.DeployManager; import com.hubspot.singularity.data.RequestManager; import com.hubspot.singularity.data.TaskManager; import com.hubspot.singularity.data.transcoders.IdTranscoder; import com.hubspot.singularity.data.transcoders.SingularityTranscoderException; import com.hubspot.singularity.scheduler.SingularityHealthchecker; import com.hubspot.singularity.scheduler.SingularityNewTaskChecker; import com.hubspot.singularity.scheduler.SingularityScheduler; import com.hubspot.singularity.scheduler.SingularitySchedulerStateCache; import com.hubspot.singularity.sentry.SingularityExceptionNotifier; @Singleton public class SingularityMesosStatusUpdateHandler { private static final Logger LOG = LoggerFactory.getLogger(SingularityMesosStatusUpdateHandler.class); private final TaskManager taskManager; private final DeployManager deployManager; private final RequestManager requestManager; private final IdTranscoder<SingularityTaskId> taskIdTranscoder; private final SingularityExceptionNotifier exceptionNotifier; private final SingularityHealthchecker healthchecker; private final SingularityNewTaskChecker newTaskChecker; private final SingularitySlaveAndRackManager slaveAndRackManager; private final SingularityMesosExecutorInfoSupport logSupport; private final SingularityScheduler scheduler; private final Provider<SingularitySchedulerStateCache> stateCacheProvider; private final String serverId; private final SchedulerDriverSupplier schedulerDriverSupplier; private final SingularitySchedulerLock schedulerLock; private final SingularityConfiguration configuration; private final Multiset<Protos.TaskStatus.Reason> taskLostReasons; private final Meter lostTasksMeter; private final ConcurrentHashMap<Long, Long> statusUpdateDeltas; @Inject public SingularityMesosStatusUpdateHandler(TaskManager taskManager, DeployManager deployManager, RequestManager requestManager, IdTranscoder<SingularityTaskId> taskIdTranscoder, SingularityExceptionNotifier exceptionNotifier, SingularityHealthchecker healthchecker, SingularityNewTaskChecker newTaskChecker, SingularitySlaveAndRackManager slaveAndRackManager, SingularityMesosExecutorInfoSupport logSupport, SingularityScheduler scheduler, Provider<SingularitySchedulerStateCache> stateCacheProvider, @Named(SingularityMainModule.SERVER_ID_PROPERTY) String serverId, SchedulerDriverSupplier schedulerDriverSupplier, SingularitySchedulerLock schedulerLock, SingularityConfiguration configuration, @Named(SingularityMesosModule.TASK_LOST_REASONS_COUNTER) Multiset<Protos.TaskStatus.Reason> taskLostReasons, @Named(SingularityMainModule.LOST_TASKS_METER) Meter lostTasksMeter, @Named(SingularityMainModule.STATUS_UPDATE_DELTAS) ConcurrentHashMap<Long, Long> statusUpdateDeltas) { this.taskManager = taskManager; this.deployManager = deployManager; this.requestManager = requestManager; this.taskIdTranscoder = taskIdTranscoder; this.exceptionNotifier = exceptionNotifier; this.healthchecker = healthchecker; this.newTaskChecker = newTaskChecker; this.slaveAndRackManager = slaveAndRackManager; this.logSupport = logSupport; this.scheduler = scheduler; this.stateCacheProvider = stateCacheProvider; this.serverId = serverId; this.schedulerDriverSupplier = schedulerDriverSupplier; this.schedulerLock = schedulerLock; this.configuration = configuration; this.taskLostReasons = taskLostReasons; this.lostTasksMeter = lostTasksMeter; this.statusUpdateDeltas = statusUpdateDeltas; } /** * 1- we have a previous update, and this is a duplicate of it (ignore) 2- we don't have a * previous update, 2 cases: a - this task has already been destroyed (we can ignore it then) b - * we've never heard of this task (very unlikely since we first write a status into zk before we * launch a task) */ private boolean isDuplicateOrIgnorableStatusUpdate(Optional<SingularityTaskStatusHolder> previousTaskStatusHolder, final SingularityTaskStatusHolder newTaskStatusHolder) { if (!previousTaskStatusHolder.isPresent()) { return true; } if (!previousTaskStatusHolder.get().getTaskStatus().isPresent()) { // this is our launch state return false; } return previousTaskStatusHolder.get().getTaskStatus().get().getState() == newTaskStatusHolder.getTaskStatus().get().getState(); } private void saveNewTaskStatusHolder(SingularityTaskId taskIdObj, SingularityTaskStatusHolder newTaskStatusHolder, ExtendedTaskState taskState) { if (taskState.isDone()) { taskManager.deleteLastActiveTaskStatus(taskIdObj); } else { taskManager.saveLastActiveTaskStatus(newTaskStatusHolder); } } private Optional<SingularityTaskId> getTaskId(String taskId) { try { return Optional.of(taskIdTranscoder.fromString(taskId)); } catch (InvalidSingularityTaskIdException | SingularityTranscoderException e) { exceptionNotifier.notify(String.format("Unexpected taskId %s", taskId), e); LOG.error("Unexpected taskId {} ", taskId, e); return Optional.absent(); } } private Optional<String> getStatusMessage(Protos.TaskStatus status, Optional<SingularityTask> task) { if (status.hasMessage() && !Strings.isNullOrEmpty(status.getMessage())) { return Optional.of(status.getMessage()); } else if (status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY) { if (task.isPresent() && task.get().getTaskRequest().getDeploy().getResources().isPresent()) { if (task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb() > 0) { return Optional.of(String.format("Task exceeded one or more memory limits (%s MB mem, %s MB disk).", task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb(), task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb())); } else { return Optional.of(String.format("Task exceeded memory limit (%s MB mem).", task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb())); } } return Optional.of("Task exceeded memory limit"); } return Optional.absent(); } private SchedulerDriver getSchedulerDriver() { final Optional<SchedulerDriver> maybeSchedulerDriver = schedulerDriverSupplier.get(); if (!maybeSchedulerDriver.isPresent()) { throw new RuntimeException("scheduler driver not present!"); // TODO: how best to handle? } return maybeSchedulerDriver.get(); } private void unsafeProcessStatusUpdate(Protos.TaskStatus status) { final String taskId = status.getTaskId().getValue(); long timestamp = System.currentTimeMillis(); if (status.hasTimestamp()) { timestamp = (long) (status.getTimestamp() * 1000); } long now = System.currentTimeMillis(); long delta = now - timestamp; LOG.debug("Update: task {} is now {} ({}) at {} (delta: {})", taskId, status.getState(), status.getMessage(), timestamp, JavaUtils.durationFromMillis(delta)); statusUpdateDeltas.put(now, delta); final Optional<SingularityTaskId> maybeTaskId = getTaskId(taskId); if (!maybeTaskId.isPresent()) { return; } final SingularityTaskId taskIdObj = maybeTaskId.get(); final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(taskIdObj, Optional.of(status), System.currentTimeMillis(), serverId, Optional.<String>absent()); final Optional<SingularityTaskStatusHolder> previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(taskIdObj); final ExtendedTaskState taskState = ExtendedTaskState.fromTaskState(status.getState()); if (isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)) { LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj); saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState); return; } if (status.getState() == TaskState.TASK_LOST) { lostTasksMeter.mark(); if (configuration.getDisasterDetection().isEnabled()) { taskLostReasons.add(status.getReason()); } } final Optional<SingularityTask> task = taskManager.getTask(taskIdObj); final boolean isActiveTask = taskManager.isActiveTask(taskId); if (isActiveTask && !taskState.isDone()) { if (task.isPresent()) { final Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(taskIdObj.getRequestId()); Optional<SingularityRequestWithState> requestWithState = Optional.absent(); if (taskState == ExtendedTaskState.TASK_RUNNING) { requestWithState = requestManager.getRequest(taskIdObj.getRequestId()); healthchecker.enqueueHealthcheck(task.get(), pendingDeploy, requestWithState); } if (!pendingDeploy.isPresent() || !pendingDeploy.get().getDeployMarker().getDeployId().equals(taskIdObj.getDeployId())) { if (!requestWithState.isPresent()) { requestWithState = requestManager.getRequest(taskIdObj.getRequestId()); } newTaskChecker.enqueueNewTaskCheck(task.get(), requestWithState, healthchecker); } } else { final String message = String.format("Task %s is active but is missing task data", taskId); exceptionNotifier.notify(message); LOG.error(message); } } final Optional<String> statusMessage = getStatusMessage(status, task); final SingularityTaskHistoryUpdate taskUpdate = new SingularityTaskHistoryUpdate(taskIdObj, timestamp, taskState, statusMessage, status.hasReason() ? Optional.of(status.getReason().name()) : Optional.<String>absent()); final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(taskUpdate); logSupport.checkDirectoryAndContainerId(taskIdObj); if (taskState.isDone()) { healthchecker.cancelHealthcheck(taskId); newTaskChecker.cancelNewTaskCheck(taskId); taskManager.deleteKilledRecord(taskIdObj); SingularitySchedulerStateCache stateCache = stateCacheProvider.get(); slaveAndRackManager.checkStateAfterFinishedTask(taskIdObj, status.getSlaveId().getValue(), stateCache); scheduler.handleCompletedTask(task, taskIdObj, isActiveTask, timestamp, taskState, taskHistoryUpdateCreateResult, stateCache, status); } saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState); } @Timed public void processStatusUpdate(Protos.TaskStatus status) { long insideLock = 0; final long start = schedulerLock.lock("statusUpdate"); try { insideLock = System.currentTimeMillis(); unsafeProcessStatusUpdate(status); } finally { schedulerLock.unlock("statusUpdate", start); LOG.info("Processed status update for {} ({}) in {} (waited {} for lock)", status.getTaskId().getValue(), status.getState(), JavaUtils.duration(start), JavaUtils.durationFromMillis(insideLock - start)); } } }