package com.hubspot.singularity.scheduler; import java.util.Collection; import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import javax.inject.Singleton; import org.apache.commons.lang3.time.DurationFormatUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.codahale.metrics.annotation.Timed; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import com.google.inject.Inject; import com.google.inject.name.Named; import com.hubspot.deploy.HealthcheckOptions; import com.hubspot.singularity.ExtendedTaskState; import com.hubspot.singularity.HealthcheckProtocol; import com.hubspot.singularity.SingularityAbort; import com.hubspot.singularity.SingularityAction; import com.hubspot.singularity.SingularityMainModule; import com.hubspot.singularity.SingularityPendingDeploy; import com.hubspot.singularity.SingularityRequestWithState; import com.hubspot.singularity.SingularityTask; import com.hubspot.singularity.SingularityTaskHealthcheckResult; import com.hubspot.singularity.SingularityTaskHistoryUpdate; import com.hubspot.singularity.SingularityTaskId; import com.hubspot.singularity.config.SingularityConfiguration; import com.hubspot.singularity.data.DisasterManager; import com.hubspot.singularity.data.TaskManager; import com.hubspot.singularity.sentry.SingularityExceptionNotifier; import com.ning.http.client.AsyncHttpClient; import com.ning.http.client.PerRequestConfig; import com.ning.http.client.RequestBuilder; @SuppressWarnings("deprecation") @Singleton public class SingularityHealthchecker { private static final HealthcheckProtocol DEFAULT_HEALTH_CHECK_SCHEME = HealthcheckProtocol.HTTP; private static final Logger LOG = LoggerFactory.getLogger(SingularityHealthchecker.class); private final AsyncHttpClient http; private final SingularityConfiguration configuration; private final TaskManager taskManager; private final SingularityAbort abort; private final SingularityNewTaskChecker newTaskChecker; private final Map<String, ScheduledFuture<?>> taskIdToHealthcheck; private final ScheduledExecutorService executorService; private final SingularityExceptionNotifier exceptionNotifier; private final DisasterManager disasterManager; @Inject public SingularityHealthchecker(@Named(SingularityMainModule.HEALTHCHECK_THREADPOOL_NAME) ScheduledExecutorService executorService, AsyncHttpClient http, SingularityConfiguration configuration, SingularityNewTaskChecker newTaskChecker, TaskManager taskManager, SingularityAbort abort, SingularityExceptionNotifier exceptionNotifier, DisasterManager disasterManager) { this.http = http; this.configuration = configuration; this.newTaskChecker = newTaskChecker; this.taskManager = taskManager; this.abort = abort; this.exceptionNotifier = exceptionNotifier; this.taskIdToHealthcheck = Maps.newConcurrentMap(); this.executorService = executorService; this.disasterManager = disasterManager; } public void enqueueHealthcheck(SingularityTask task, boolean ignoreExisting, boolean inStartup, boolean isFirstCheck) { HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get(); final Optional<Integer> healthcheckMaxRetries = options.getMaxRetries().or(configuration.getHealthcheckMaxRetries()); Optional<Long> maybeRunningAt = getRunningAt(taskManager.getTaskHistoryUpdates(task.getTaskId())); if (maybeRunningAt.isPresent()) { final long durationSinceRunning = System.currentTimeMillis() - maybeRunningAt.get(); final int startupTimeout = options.getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds()); if (inStartup && durationSinceRunning > TimeUnit.SECONDS.toMillis(startupTimeout)) { LOG.debug("{} since running", durationSinceRunning); LOG.info("Not enqueuing new healthcheck for {}, has not responded to healthchecks before startup timeout of {}s", task.getTaskId(), startupTimeout); return; } } if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(task.getTaskId()) > healthcheckMaxRetries.get()) { LOG.info("Not enqueuing new healthcheck for {}, it has already attempted {} times", task.getTaskId(), healthcheckMaxRetries.get()); return; } ScheduledFuture<?> future = enqueueHealthcheckWithDelay(task, getDelaySeconds(task.getTaskId(), options, inStartup, isFirstCheck), inStartup); ScheduledFuture<?> existing = taskIdToHealthcheck.put(task.getTaskId().getId(), future); if (existing != null) { boolean canceledExisting = existing.cancel(false); if (!ignoreExisting) { LOG.warn("Found existing overlapping healthcheck for task {} - cancel success: {}", task.getTaskId(), canceledExisting); } } } private Optional<Long> getRunningAt(Collection<SingularityTaskHistoryUpdate> updates) { for (SingularityTaskHistoryUpdate update : updates) { if (update.getTaskState() == ExtendedTaskState.TASK_RUNNING) { return Optional.of(update.getTimestamp()); } } return Optional.absent(); } private int getDelaySeconds(SingularityTaskId taskId, HealthcheckOptions options, boolean inStartup, boolean isFirstCheck) { if (isFirstCheck && options.getStartupDelaySeconds().or(configuration.getStartupDelaySeconds()).isPresent()) { int delaySeconds = options.getStartupDelaySeconds().or(configuration.getStartupDelaySeconds()).get(); LOG.trace("Delaying first healthcheck %s seconds for task {}", delaySeconds, taskId); return delaySeconds; } else if (inStartup) { return options.getStartupIntervalSeconds().or(configuration.getStartupIntervalSeconds()); } else { return options.getIntervalSeconds().or(configuration.getHealthcheckIntervalSeconds()); } } @Timed public boolean enqueueHealthcheck(SingularityTask task, Optional<SingularityPendingDeploy> pendingDeploy, Optional<SingularityRequestWithState> request) { if (!shouldHealthcheck(task, request, pendingDeploy)) { return false; } Optional<SingularityTaskHealthcheckResult> lastHealthcheck = taskManager.getLastHealthcheck(task.getTaskId()); enqueueHealthcheck(task, true, true, !lastHealthcheck.isPresent()); return true; } public void checkHealthcheck(SingularityTask task) { if (!taskIdToHealthcheck.containsKey(task.getTaskId().getId())) { LOG.info("Enqueueing expected healthcheck for task {}", task.getTaskId()); Optional<SingularityTaskHealthcheckResult> lastHealthcheck = taskManager.getLastHealthcheck(task.getTaskId()); enqueueHealthcheck(task, false, true, !lastHealthcheck.isPresent()); } } @VisibleForTesting Collection<ScheduledFuture<?>> getHealthCheckFutures() { return taskIdToHealthcheck.values(); } public void markHealthcheckFinished(String taskId) { taskIdToHealthcheck.remove(taskId); } public boolean cancelHealthcheck(String taskId) { ScheduledFuture<?> future = taskIdToHealthcheck.remove(taskId); if (future == null) { return false; } boolean canceled = future.cancel(false); LOG.trace("Canceling healthcheck ({}) for task {}", canceled, taskId); return canceled; } private ScheduledFuture<?> enqueueHealthcheckWithDelay(final SingularityTask task, long delaySeconds, final boolean inStartup) { LOG.trace("Enqueuing a healthcheck for task {} with delay {}", task.getTaskId(), DurationFormatUtils.formatDurationHMS(TimeUnit.SECONDS.toMillis(delaySeconds))); return executorService.schedule(new Runnable() { @Override public void run() { try { asyncHealthcheck(task); } catch (Throwable t) { LOG.error("Uncaught throwable in async healthcheck", t); exceptionNotifier.notify(String.format("Uncaught throwable in async healthcheck (%s)", t.getMessage()), t, ImmutableMap.of("taskId", task.getTaskId().toString())); reEnqueueOrAbort(task, inStartup); } } }, delaySeconds, TimeUnit.SECONDS); } public void reEnqueueOrAbort(SingularityTask task, boolean inStartup) { try { enqueueHealthcheck(task, true, inStartup, false); } catch (Throwable t) { LOG.error("Caught throwable while re-enqueuing health check for {}, aborting", task.getTaskId(), t); exceptionNotifier.notify(String.format("Caught throwable while re-enqueuing health check (%s)", t.getMessage()), t, ImmutableMap.of("taskId", task.getTaskId().toString())); abort.abort(SingularityAbort.AbortReason.UNRECOVERABLE_ERROR, Optional.of(t)); } } private Optional<String> getHealthcheckUri(SingularityTask task) { if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) { return Optional.absent(); } HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get(); final String hostname = task.getOffer().getHostname(); Optional<Long> healthcheckPort = options.getPortNumber().or(task.getPortByIndex(options.getPortIndex().or(0))); if (!healthcheckPort.isPresent() || healthcheckPort.get() < 1L) { LOG.warn("Couldn't find a port for health check for task {}", task); return Optional.absent(); } String uri = task.getTaskRequest().getDeploy().getHealthcheck().get().getUri(); if (uri.startsWith("/")) { uri = uri.substring(1); } HealthcheckProtocol protocol = options.getProtocol().or(DEFAULT_HEALTH_CHECK_SCHEME); return Optional.of(String.format("%s://%s:%d/%s", protocol.getProtocol(), hostname, healthcheckPort.get(), uri)); } private void saveFailure(SingularityHealthcheckAsyncHandler handler, String message) { handler.saveResult(Optional.<Integer> absent(), Optional.<String> absent(), Optional.of(message), Optional.<Throwable>absent()); } private boolean shouldHealthcheck(final SingularityTask task, final Optional<SingularityRequestWithState> request, Optional<SingularityPendingDeploy> pendingDeploy) { if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) { return false; } if (!task.getTaskRequest().getRequest().isLongRunning() || !task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) { return false; } if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(false)) { return false; } if (pendingDeploy.isPresent() && pendingDeploy.get().getDeployMarker().getDeployId().equals(task.getTaskId().getDeployId()) && task.getTaskRequest().getDeploy().getSkipHealthchecksOnDeploy().or(false)) { return false; } if (request.isPresent() && request.get().getRequest().getSkipHealthchecks().or(false)) { return false; } Optional<SingularityTaskHealthcheckResult> lastHealthcheck = taskManager.getLastHealthcheck(task.getTaskId()); if (lastHealthcheck.isPresent() && !lastHealthcheck.get().isFailed()) { LOG.debug("Not submitting a new healthcheck for {} because it already passed a healthcheck", task.getTaskId()); return false; } return true; } private void asyncHealthcheck(final SingularityTask task) { final SingularityHealthcheckAsyncHandler handler = new SingularityHealthcheckAsyncHandler(exceptionNotifier, configuration, this, newTaskChecker, taskManager, task); final Optional<String> uri = getHealthcheckUri(task); if (!uri.isPresent()) { saveFailure(handler, "Invalid healthcheck uri or ports not present"); return; } final Integer timeoutSeconds = task.getTaskRequest().getDeploy().getHealthcheck().isPresent() ? task.getTaskRequest().getDeploy().getHealthcheck().get().getResponseTimeoutSeconds().or(configuration.getHealthcheckTimeoutSeconds()) : configuration.getHealthcheckTimeoutSeconds(); try { PerRequestConfig prc = new PerRequestConfig(); prc.setRequestTimeoutInMs((int) TimeUnit.SECONDS.toMillis(timeoutSeconds)); RequestBuilder builder = new RequestBuilder("GET"); builder.setFollowRedirects(true); builder.setUrl(uri.get()); builder.setPerRequestConfig(prc); LOG.trace("Issuing a healthcheck ({}) for task {} with timeout {}s", uri.get(), task.getTaskId(), timeoutSeconds); http.prepareRequest(builder.build()).execute(handler); } catch (Throwable t) { LOG.debug("Exception while preparing healthcheck ({}) for task ({})", uri, task.getTaskId(), t); exceptionNotifier.notify(String.format("Error preparing healthcheck (%s)", t.getMessage()), t, ImmutableMap.of("taskId", task.getTaskId().toString())); saveFailure(handler, String.format("Healthcheck failed due to exception: %s", t.getMessage())); } } }