/*-
 * -\-\-
 * Helios Services
 * --
 * Copyright (C) 2016 Spotify AB
 * --
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * -/-/-
 */

package com.spotify.helios.master.reaper;

import static com.google.common.base.Preconditions.checkArgument;

import com.google.common.annotations.VisibleForTesting;
import com.spotify.helios.common.Clock;
import com.spotify.helios.common.SystemClock;
import com.spotify.helios.common.descriptors.Deployment;
import com.spotify.helios.common.descriptors.Job;
import com.spotify.helios.common.descriptors.JobId;
import com.spotify.helios.common.descriptors.JobStatus;
import com.spotify.helios.common.descriptors.TaskStatusEvent;
import com.spotify.helios.master.MasterModel;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Removes old jobs that haven't been deployed for a while.
 * The logic for whether a job should be reaped depends on whether it's deployed, its last history
 * event, its creation date, and the specified number of retention days.
 *
 * <p>1. A job that's deployed should NOT BE reaped regardless of its history or creation date.
 * 2. A job not deployed, with history, and an event before the number of retention days should
 * BE reaped.
 * 3. A job not deployed, with history, and an event after the number of retention days should NOT
 * BE reaped. An example is a job created a long time ago but deployed recently.
 * 4. A job not deployed, without history, and without a creation date should BE reaped. Only really
 * old versions of Helios create jobs without dates.
 * 5. A job not deployed, without history, and with a creation date before the number of retention
 * days should BE reaped.
 * 6. A job not deployed, without history, and with a creation date after the number of retention
 * days should NOT BE reaped.
 *
 * <p>Note that the --disable-job-history flag in {@link com.spotify.helios.agent.AgentParser}
 * controls whether the Helios agent should write job history to the data store. If this is
 * disabled, scenarios two and three above will never match. In this case, a job created a long
 * time ago but deployed recently may be reaped once it's undeployed even if the user needs it
 * again in the future.
 *
 * <p>A job whose status cannot be found when its turn comes is skipped rather than treated as a
 * failure.
 */
public class OldJobReaper extends RateLimitedService<Job> {

  private static final double PERMITS_PER_SECOND = 0.2; // one permit every 5 seconds
  private static final Clock SYSTEM_CLOCK = new SystemClock();
  private static final int DELAY = 60 * 24; // 1 day in minutes
  private static final TimeUnit TIME_UNIT = TimeUnit.MINUTES;
  private static final DateTimeFormatter DATE_FORMATTER =
      DateTimeFormat.forPattern("YYYY-MM-dd HH:mm:ss");

  private static final Logger log = LoggerFactory.getLogger(OldJobReaper.class);

  private final MasterModel masterModel;
  private final long retentionDays;
  private final long retentionMillis;
  private final Clock clock;

  /**
   * Creates a reaper that uses the system clock, the default permit rate and a random initial
   * delay in the range {@code [0, DELAY)} minutes.
   *
   * @param masterModel   the model used to list, inspect and remove jobs
   * @param retentionDays number of days an unused job is kept; must be positive
   * @throws IllegalArgumentException if {@code retentionDays} is not positive
   */
  public OldJobReaper(final MasterModel masterModel, final long retentionDays) {
    this(masterModel, retentionDays, SYSTEM_CLOCK, PERMITS_PER_SECOND,
        new Random().nextInt(DELAY));
  }

  /**
   * Creates a reaper with an explicit clock, permit rate and initial delay.
   *
   * @param masterModel      the model used to list, inspect and remove jobs
   * @param retentionDays    number of days an unused job is kept; must be positive
   * @param clock            source of the current time used for all age calculations
   * @param permitsPerSecond passed through to {@link RateLimitedService}
   * @param initialDelay     passed through to {@link RateLimitedService}; presumably expressed in
   *                         {@code TIME_UNIT} (minutes) like {@code DELAY} - confirm against the
   *                         superclass
   * @throws IllegalArgumentException if {@code retentionDays} is not positive
   */
  @VisibleForTesting
  OldJobReaper(final MasterModel masterModel,
               final long retentionDays,
               final Clock clock,
               final double permitsPerSecond,
               final int initialDelay) {
    super(permitsPerSecond, initialDelay, DELAY, TIME_UNIT);
    this.masterModel = masterModel;
    checkArgument(retentionDays > 0, "retentionDays must be positive but was %s", retentionDays);
    this.retentionDays = retentionDays;
    this.retentionMillis = TimeUnit.DAYS.toMillis(retentionDays);
    this.clock = clock;
  }

  /**
   * Returns every job currently known to the master model.
   */
  @Override
  Iterable<Job> collectItems() {
    return masterModel.getJobs().values();
  }

  /**
   * Decides whether the given job is old enough to be reaped and, if so, removes it.
   *
   * <p>Failures are logged and never propagated, so one bad job does not affect the others.
   *
   * @param job the job to examine
   */
  @Override
  void processItem(final Job job) {
    final JobId jobId = job.getId();

    try {
      if (shouldReap(job)) {
        reapJob(job);
      }
    } catch (Exception e) {
      log.warn("Failed to determine if job '{}' should be reaped", jobId, e);
    }
  }

  /**
   * Applies the six retention rules described on the class to a single job.
   *
   * @param job the job to examine
   * @return {@code true} if the job should be removed
   * @throws Exception whatever the master model lookups throw; their signatures are not visible
   *                   here, so the broad type mirrors what the caller already catches
   */
  private boolean shouldReap(final Job job) throws Exception {
    final JobId jobId = job.getId();

    final JobStatus jobStatus = masterModel.getJobStatus(jobId);
    if (jobStatus == null) {
      // NOTE(review): assumes the model reports a missing job as null - confirm against
      // MasterModel. Items are collected up front and processed slowly, so a job may be
      // removed by someone else before its turn comes.
      log.info("NOT reaping job '{}' (no job status found, it may already have been removed)",
          jobId);
      return false;
    }

    final Map<String, Deployment> deployments = jobStatus.getDeployments();
    final List<TaskStatusEvent> events = masterModel.getJobHistory(jobId);

    if (!deployments.isEmpty()) {
      // A job that's deployed should NOT BE reaped regardless of its history or creation date
      return false;
    }

    return events.isEmpty() ? isCreationDateExpired(job) : isLastEventExpired(jobId, events);
  }

  /**
   * Retention check for an undeployed job without history: decides on the creation date alone.
   *
   * @param job the job to examine
   * @return {@code true} if the job has no creation date or was created longer ago than the
   *         retention time
   */
  private boolean isCreationDateExpired(final Job job) {
    final JobId jobId = job.getId();
    final Long created = job.getCreated();

    if (created == null) {
      log.info("Marked job '{}' for reaping (not deployed, no history, no creation date)", jobId);
      return true;
    }

    final long ageMillis = clock.now().getMillis() - created;
    final String createdDate = DATE_FORMATTER.print(created);

    if (ageMillis > retentionMillis) {
      log.info("Marked job '{}' for reaping (not deployed, no history, creation date "
               + "of {} before retention time of {} days)",
          jobId, createdDate, retentionDays);
      return true;
    }

    log.info("NOT reaping job '{}' (not deployed, no history, creation date of {} after "
             + "retention time of {} days)",
        jobId, createdDate, retentionDays);
    return false;
  }

  /**
   * Retention check for an undeployed job with history: decides on the last event's timestamp.
   *
   * @param jobId  id of the job, used for logging
   * @param events the job's history; must not be empty, and the last element is taken to be the
   *               most recent event
   * @return {@code true} if the last event is older than the retention time
   */
  private boolean isLastEventExpired(final JobId jobId, final List<TaskStatusEvent> events) {
    // Get the last event which is the most recent
    final TaskStatusEvent event = events.get(events.size() - 1);
    final String eventDate = DATE_FORMATTER.print(event.getTimestamp());

    // Calculate the amount of time in milliseconds that has elapsed since the last event
    final long unusedDurationMillis = clock.now().getMillis() - event.getTimestamp();

    // A job not deployed, with history, and last used too long ago should BE reaped
    // A job not deployed, with history, and last used recently should NOT BE reaped
    if (unusedDurationMillis > retentionMillis) {
      log.info("Marked job '{}' for reaping (not deployed, has history whose last event "
               + "on {} was before the retention time of {} days)",
          jobId, eventDate, retentionDays);
      return true;
    }

    log.info("NOT reaping job '{}' (not deployed, has history whose last event "
             + "on {} was after the retention time of {} days)",
        jobId, eventDate, retentionDays);
    return false;
  }

  /**
   * Removes the job from the master model, logging instead of propagating any failure.
   *
   * @param job the job to remove
   */
  private void reapJob(final Job job) {
    final JobId jobId = job.getId();
    try {
      log.info("reaping old job '{}'", jobId);
      masterModel.removeJob(jobId, job.getToken());
    } catch (Exception e) {
      log.warn("Failed to reap old job '{}'", jobId, e);
    }
  }
}