/*- * -\-\- * Helios Services * -- * Copyright (C) 2016 Spotify AB * -- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * -/-/- */ package com.spotify.helios.master.reaper; import com.google.common.annotations.VisibleForTesting; import com.spotify.helios.common.descriptors.Job; import com.spotify.helios.common.descriptors.JobId; import com.spotify.helios.master.MasterModel; import com.spotify.helios.servicescommon.coordination.Paths; import com.spotify.helios.servicescommon.coordination.ZooKeeperClient; import java.util.Collections; import java.util.List; import java.util.Random; import java.util.concurrent.TimeUnit; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException.NoNodeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Removes job histories whose corresponding jobs don't exist anymore. * There are two race conditions where jobs can be deleted but their histories are * left behind in ZooKeeper: * * <p>1. The master deletes the job (in {@link com.spotify.helios.master.ZooKeeperMasterModel} * and then deletes its history. During this deletion, the agent creates a znode. The master's * deletion operations fail. * * <p>2. The master deletes all relevant history znodes successfully. The agent still hasn't * undeployed its job and continues writing history to ZooKeeper. This will recreate deleted history * znodes via {@link com.spotify.helios.agent.TaskHistoryWriter}. * * <p>Solve both of these cases by scheduling an instance of this class. It runs once a day once * scheduled. */ public class JobHistoryReaper extends RateLimitedService<String> { private static final double PERMITS_PER_SECOND = 0.2; // one permit every 5 seconds private static final int DELAY = 60 * 24; // 1 day in minutes private static final TimeUnit TIME_UNIT = TimeUnit.MINUTES; private static final Logger log = LoggerFactory.getLogger(JobHistoryReaper.class); private final MasterModel masterModel; private final ZooKeeperClient client; public JobHistoryReaper(final MasterModel masterModel, final ZooKeeperClient client) { this(masterModel, client, PERMITS_PER_SECOND, new Random().nextInt(DELAY)); } @VisibleForTesting JobHistoryReaper(final MasterModel masterModel, final ZooKeeperClient client, final double permitsPerSecond, final int initialDelay) { super(permitsPerSecond, initialDelay, DELAY, TIME_UNIT); this.masterModel = masterModel; this.client = client; } @Override Iterable<String> collectItems() { final String path = Paths.historyJobs(); List<String> jobIds = Collections.emptyList(); try { jobIds = client.getChildren(path); } catch (KeeperException e) { log.warn("Failed to get children of znode {}", path, e); } return jobIds; } @Override void processItem(final String jobId) { log.info("Deciding whether to reap job history for job {}", jobId); final JobId id = JobId.fromString(jobId); final Job job = masterModel.getJob(id); if (job == null) { try { client.deleteRecursive(Paths.historyJob(id)); log.info("Reaped job history for job {}", jobId); } catch (NoNodeException ignored) { // Something deleted the history right before we got to it. Ignore and keep going. } catch (KeeperException e) { log.warn("error reaping job history for job {}", jobId, e); } } } }