/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.minion; import javax.annotation.Nullable; import java.io.File; import java.io.IOException; import java.lang.management.ManagementFactory; import java.net.InetAddress; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import com.addthis.basis.jvm.Shutdown; import com.addthis.basis.util.JitterClock; import com.addthis.basis.util.LessBytes; import com.addthis.basis.util.LessFiles; import com.addthis.basis.util.LessNumbers; import com.addthis.basis.util.Parameter; import com.addthis.basis.util.SimpleExec; import com.addthis.bark.ZkGroupMembership; import com.addthis.bark.ZkUtil; import com.addthis.codec.annotations.FieldConfig; import com.addthis.codec.codables.Codable; import com.addthis.codec.config.Configs; import com.addthis.codec.json.CodecJSON; import com.addthis.hydra.common.util.CloseTask; import com.addthis.hydra.job.JobTaskErrorCode; import com.addthis.hydra.job.mq.CommandTaskDelete; import com.addthis.hydra.job.mq.CommandTaskKick; import com.addthis.hydra.job.mq.CommandTaskNew; import com.addthis.hydra.job.mq.CommandTaskReplicate; import com.addthis.hydra.job.mq.CommandTaskRevert; import com.addthis.hydra.job.mq.CommandTaskStop; import com.addthis.hydra.job.mq.CommandTaskUpdateReplicas; import com.addthis.hydra.job.mq.CoreMessage; import com.addthis.hydra.job.mq.HostCapacity; import com.addthis.hydra.job.mq.HostMessage; import com.addthis.hydra.job.mq.HostState; import com.addthis.hydra.job.mq.JobKey; import com.addthis.hydra.job.mq.JobMessage; import com.addthis.hydra.job.mq.StatusTaskCantBegin; import com.addthis.hydra.job.mq.StatusTaskEnd; import com.addthis.hydra.mq.MessageConsumer; import com.addthis.hydra.mq.MessageListener; import com.addthis.hydra.mq.MessageProducer; import com.addthis.hydra.mq.RabbitMQUtil; import com.addthis.hydra.mq.RabbitMessageConsumer; import com.addthis.hydra.mq.RabbitMessageProducer; import com.addthis.hydra.mq.RabbitQueueingConsumer; import com.addthis.hydra.mq.ZKMessageProducer; import com.addthis.hydra.util.MetricsServletMaker; import com.addthis.hydra.util.MinionWriteableDiskCheck; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.fasterxml.jackson.annotation.JsonAutoDetect; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.rabbitmq.client.AMQP; import com.rabbitmq.client.AlreadyClosedException; import com.rabbitmq.client.Channel; import com.rabbitmq.client.Connection; import com.yammer.metrics.Metrics; import com.yammer.metrics.core.Counter; import com.yammer.metrics.core.Histogram; import com.yammer.metrics.core.Meter; import com.yammer.metrics.core.Timer; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.imps.CuratorFrameworkState; import org.apache.curator.framework.state.ConnectionState; import org.apache.zookeeper.KeeperException; import org.eclipse.jetty.io.UncheckedIOException; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.servlet.ServletHandler; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * TODO implement APIs for extended probing, sanity, clearing of job state */ @JsonAutoDetect(getterVisibility = JsonAutoDetect.Visibility.NONE, isGetterVisibility = JsonAutoDetect.Visibility.NONE, setterVisibility = JsonAutoDetect.Visibility.NONE) @JsonIgnoreProperties(value="stopped") public class Minion implements MessageListener<CoreMessage>, Codable, AutoCloseable { private static final Logger log = LoggerFactory.getLogger(Minion.class); private static final int webPort = Parameter.intValue("minion.web.port", 5051); private static final String group = System.getProperty("minion.group", "none"); private static final String localHost = System.getProperty("minion.localhost"); private static final String batchBrokerAddresses = Parameter.value("batch.brokerAddresses", "localhost:5672"); private static final String batchBrokerUsername = Parameter.value("batch.brokerUsername", "guest"); private static final String batchBrokerPassword = Parameter.value("batch.brokerPassword", "guest"); private static final int sendStatusRetries = Parameter.intValue("send.status.retries", 5); private static final int sendStatusRetryDelay = Parameter.intValue("send.status.delay", 5000); static final long hostMetricUpdaterInterval = Parameter.longValue("minion.host.metric.interval", 30 * 1000); static final String remoteConnectMethod = Parameter.value("minion.remote.connect.method", "ssh -o StrictHostKeyChecking=no -o TCPKeepAlive=yes -o ServerAliveInterval=30"); static final String rsyncCommand = Parameter.value("minion.rsync.command", "rsync"); private static final int maxActiveTasks = Parameter.intValue("minion.max.active.tasks", 3); static final int copyRetryLimit = Parameter.intValue("minion.copy.retry.limit", 3); static final int copyRetryDelaySeconds = Parameter.intValue("minion.copy.retry.delay", 10); /* If the following var is positive, it is passed as the bwlimit arg to rsync. If <= 0, it is ignored. */ static final int copyBandwidthLimit = Parameter.intValue("minion.copy.bwlimit", -1); static final ReentrantLock revertLock = new ReentrantLock(); static final ReentrantLock capacityLock = new ReentrantLock(); static final DateTimeFormatter timeFormat = DateTimeFormat.forPattern("yyMMdd-HHmmss"); static final String echoWithDate_cmd = "echo `date '+%y/%m/%d %H:%M:%S'` "; public static final String MINION_ZK_PATH = "/minion/"; public static final String defaultMinionType = Parameter.value("minion.type", "default"); public static final String batchJobQueueSuffix = ".batchJob"; public static final String batchControlQueueSuffix = ".batchControl"; public static final Meter tasksCompletedPerHour = Metrics.newMeter(Minion.class, "tasksCompletedPerHour", "tasksCompletedPerHour", TimeUnit.HOURS); public static void main(String[] args) throws Exception { Minion minion = Configs.newDefault(Minion.class); Runtime.getRuntime().addShutdownHook(new Thread(new CloseTask(minion), "Minion Shutdown Hook")); } @FieldConfig String uuid; @FieldConfig MinionTaskDeleter minionTaskDeleter; @FieldConfig List<CommandTaskKick> jobQueue = new ArrayList<>(10); @FieldConfig String minionTypes; final Set<String> activeTaskKeys; final AtomicBoolean shutdown = new AtomicBoolean(false); final ExecutorService messageTaskExecutorService = new ThreadPoolExecutor( 4, 4, 100L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>(), new ThreadFactoryBuilder().setDaemon(true).build()); // This next executor service only serves promote/demote requests, so that these will be performed quickly and not // wait on a lengthy revert / delete / etc. final ExecutorService promoteDemoteTaskExecutorService = new ThreadPoolExecutor( 4, 4, 100L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>(), new ThreadFactoryBuilder().setDaemon(true).build()); final Lock minionStateLock = new ReentrantLock(); // Historical metrics Timer fileStatsTimer; Counter sendStatusFailCount; Counter sendStatusFailAfterRetriesCount; Meter nonIdleIgnoredKicks; final int replicateCommandDelaySeconds = Parameter.intValue("replicate.cmd.delay.seconds", 0); final int backupCommandDelaySeconds = Parameter.intValue("backup.cmd.delay.seconds", 0); final File rootDir; final File stateFile; final File liveEverywhereMarkerFile; final String myHost; long startTime; String user; String path; TaskRunner runner; final ConcurrentMap<String, JobTask> tasks = new ConcurrentHashMap<>(); final Object jmsxmitlock = new Object(); final AtomicLong diskTotal = new AtomicLong(0); final AtomicLong diskFree = new AtomicLong(0); final Server jetty; final ServletHandler metricsHandler; final MinionHandler minionHandler = new MinionHandler(this); boolean diskReadOnly; MinionWriteableDiskCheck diskHealthCheck; int minionPid = -1; RabbitQueueingConsumer batchJobConsumer; private MessageConsumer<CoreMessage> batchControlConsumer; private MessageProducer<CoreMessage> queryControlProducer; private MessageProducer<CoreMessage> zkBatchControlProducer; private MessageProducer<CoreMessage> batchControlProducer; Channel channel; private CuratorFramework zkClient; private ZkGroupMembership minionGroupMembership; Histogram activeTaskHistogram; @VisibleForTesting public Minion(CuratorFramework zkClient) { this.zkClient = zkClient; uuid = UUID.randomUUID().toString(); // null placeholder for now rootDir = null; startTime = 0; stateFile = null; liveEverywhereMarkerFile = null; myHost = null; user = null; path = null; jetty = null; metricsHandler = null; diskReadOnly = false; minionPid = -1; activeTaskKeys = new HashSet<>(); } @JsonCreator private Minion(@JsonProperty("dataDir") File rootDir, @Nullable @JsonProperty("queueType") String queueType) throws Exception { this.rootDir = rootDir; startTime = System.currentTimeMillis(); stateFile = new File(LessFiles.initDirectory(rootDir), "minion.state"); liveEverywhereMarkerFile = new File(rootDir, "liveeverywhere.marker"); if (localHost != null) { myHost = localHost; } else { myHost = InetAddress.getLocalHost().getHostAddress(); } user = new SimpleExec("whoami").join().stdoutString().trim(); path = rootDir.getAbsolutePath(); diskTotal.set(rootDir.getTotalSpace()); diskFree.set(rootDir.getFreeSpace()); diskReadOnly = false; minionTaskDeleter = new MinionTaskDeleter(); if (stateFile.exists()) { CodecJSON.decodeString(this, LessBytes.toString(LessFiles.read(stateFile))); } else { uuid = UUID.randomUUID().toString(); } File minionTypesFile = new File(rootDir, "minion.types"); minionTypes = minionTypesFile.exists() ? new String(LessFiles.read(minionTypesFile)).replaceAll("\n", "") : defaultMinionType; activeTaskKeys = new HashSet<>(); jetty = new Server(webPort); jetty.setHandler(minionHandler); jetty.start(); waitForJetty(); sendStatusFailCount = Metrics.newCounter(Minion.class, "sendStatusFail-" + getJettyPort() + "-JMXONLY"); sendStatusFailAfterRetriesCount = Metrics.newCounter(Minion.class, "sendStatusFailAfterRetries-" + getJettyPort() + "-JMXONLY"); nonIdleIgnoredKicks = Metrics.newMeter(Minion.class, "nonIdleIgnoredKicks", "ignored-kick", TimeUnit.MINUTES); fileStatsTimer = Metrics.newTimer(Minion.class, "JobTask-byte-size-timer"); metricsHandler = MetricsServletMaker.makeHandler(); activeTaskHistogram = Metrics.newHistogram(Minion.class, "activeTasks"); new HostMetricUpdater(this); try { joinGroup(); connectToMQ(queueType); updateJobsMeta(rootDir); if (liveEverywhereMarkerFile.createNewFile()) { log.info("cutover to live-everywhere tasks"); } writeState(); if (!Strings.isNullOrEmpty(queueType)) { runner = new TaskRunner(this); runner.start(); } diskHealthCheck = new MinionWriteableDiskCheck(this); diskHealthCheck.startHealthCheckThread(); sendHostStatus(); log.info("[init] up on {}:{} as {} in {}", myHost, getJettyPort(), user, path); String processName = ManagementFactory.getRuntimeMXBean().getName(); minionPid = Integer.valueOf(processName.substring(0, processName.indexOf("@"))); log.info("[minion.start] pid for minion process is: {}", minionPid); minionTaskDeleter.startDeletionThread(); } catch (Exception ex) { log.error("Exception during startup", ex); } } public File getRootDir() { return rootDir; } public void setDiskReadOnly(boolean disk_read_only) { diskReadOnly = disk_read_only; } private int getJettyPort() { return jetty.getConnectors()[0].getLocalPort(); } private void waitForJetty() throws Exception { long wait = JitterClock.globalTime(); for (int i = 0; getJettyPort() <= 0 && i < 20; i++) { Thread.sleep(100); } wait = JitterClock.globalTime() - wait; if (wait > 1000 || getJettyPort() <= 0) { log.warn("[init] jetty took > {}ms to start. on port {}", wait, getJettyPort()); } } private void connectToMQ(@Nullable String queueType) throws IOException, InterruptedException { zkBatchControlProducer = new ZKMessageProducer(getZkClient()); if ("rabbit".equals(queueType)) { log.info("[init] connecting to rabbit message queue"); connectToRabbitMQ(); } else if (Strings.isNullOrEmpty(queueType)) { log.info("[init] skipping message queue"); } else { throw new IllegalArgumentException("queueType (" + queueType + ") must be either a valid message queue type or null"); } } private synchronized boolean connectToRabbitMQ() { ImmutableList<String> routingKeys = ImmutableList.of(uuid, HostMessage.ALL_HOSTS); ImmutableList<String> closeUnbindKeys = ImmutableList.of(HostMessage.ALL_HOSTS); try { batchControlProducer = RabbitMessageProducer.constructAndOpen("CSBatchControl", batchBrokerAddresses, batchBrokerUsername, batchBrokerPassword, null); queryControlProducer = RabbitMessageProducer.constructAndOpen("CSBatchQuery", batchBrokerAddresses, batchBrokerUsername, batchBrokerPassword, null); Connection connection = RabbitMQUtil.createConnection(batchBrokerAddresses, batchBrokerUsername, batchBrokerPassword); channel = connection.createChannel(); channel.exchangeDeclare("CSBatchJob", "direct"); AMQP.Queue.DeclareOk result = channel.queueDeclare(uuid + batchJobQueueSuffix, true, false, false, null); String queueName = result.getQueue(); channel.queueBind(queueName, "CSBatchJob", uuid); channel.queueBind(queueName, "CSBatchJob", HostMessage.ALL_HOSTS); batchJobConsumer = new RabbitQueueingConsumer(channel); channel.basicConsume(queueName, false, batchJobConsumer); batchControlConsumer = new RabbitMessageConsumer<CoreMessage>(channel, "CSBatchControl", uuid + batchControlQueueSuffix, Minion.this, routingKeys, closeUnbindKeys, CoreMessage.class); return true; } catch (IOException e) { log.error("Error connecting to rabbitmq at {}", batchBrokerAddresses, e); return false; } } static void shutdown() { Shutdown.exit(1); } void disconnectFromMQ() { try { if (batchControlConsumer != null) { batchControlConsumer.close(); } } catch (Exception ex) { log.warn("Error trying to close batchControlConsumer: ", ex); try { if (channel != null) { channel.close(); } } catch (Exception ex2) { log.warn("Error trying to close channel: ", ex2); } } try { if (queryControlProducer != null) { queryControlProducer.close(); } } catch (Exception ex) { log.warn("Error trying to close queryControlProducer: ", ex); } try { if (batchControlProducer != null) { batchControlProducer.close(); } } catch (AlreadyClosedException ace) { log.warn("Attempt was made to close batchControlProducer more than once: ", ace); } catch (Exception ex) { log.warn("Error trying to close batchControlProducer: ", ex); } try { if (zkBatchControlProducer != null) { zkBatchControlProducer.close(); } } catch (Exception ex) { log.warn("Error trying to close zkBatchControlProducer: ", ex); } } @VisibleForTesting public void insertJobKickMessage(CommandTaskKick kick) { minionStateLock.lock(); try { for (int i = 0; i < jobQueue.size(); i++) { CommandTaskKick inQ = jobQueue.get(i); if (kick.getPriority() > inQ.getPriority()) { kick.setSubmitTime(JitterClock.globalTime()); jobQueue.add(i, kick); return; } } jobQueue.add(kick); } finally { minionStateLock.unlock(); } writeState(); } void kickNextJob() throws Exception { minionStateLock.lock(); try { if (jobQueue.isEmpty()) { return; } // Iterate over the queue, looking for a job that can run using the current resources for (CommandTaskKick nextKick : jobQueue) { capacityLock.lock(); try { boolean lackCap = activeTaskKeys.size() >= maxActiveTasks; if (lackCap) { sendStatusMessage(new StatusTaskCantBegin(getUUID(), nextKick.getJobUuid(), nextKick.getNodeID(), nextKick.getPriority())); jobQueue.remove(nextKick); break; } else { // remove this kick from the queue jobQueue.remove(nextKick); JobTask task = tasks.get(nextKick.key()); if (task == null) { task = createNewTask(nextKick.getJobUuid(), nextKick.getNodeID()); } task.setAutoRetry(nextKick.getAutoRetry()); try { task.exec(nextKick, true); } catch (ExecStateException ex) { log.warn("[kick] failed to kick non-idle task {}", task.getName(), ex); // It should be okay to simply ignore non-idle kicks, since the actual task state has already // been sent back to spawn. // These ignored kicks are only expected to happen due to a race condition during failing // minions. The below metric can be removed once we confirm that ignored kicks are not // happening when unexpected. nonIdleIgnoredKicks.mark(); } catch (Exception ex) { log.warn("[kick] exception while trying to kick {}", task.getName(), ex); task.sendEndStatus(JobTaskErrorCode.EXIT_SCRIPT_EXEC_ERROR); } writeState(); return; } } finally { capacityLock.unlock(); } } } finally { minionStateLock.unlock(); } } List<JobTask> getMatchingJobs(JobMessage msg) { LinkedList<JobTask> match = new LinkedList<>(); JobKey msgKey = msg.getJobKey(); if (msgKey.getNodeNumber() == null) { for (Entry<String, JobTask> e : tasks.entrySet()) { String key = e.getKey(); if (key.startsWith(msgKey.getJobUuid())) { match.add(e.getValue()); } } } else { JobTask job = tasks.get(msgKey.toString()); if (job != null) { match.add(job); } } return match; } @Override public void onMessage(CoreMessage message) { try { handleMessage(message); } catch (Exception ex) { log.warn("", ex); } } private void handleMessage(CoreMessage message) throws Exception { if(message instanceof HostState) { log.debug("[host.status] request for {}", uuid); sendHostStatus(); } else if(message instanceof CommandTaskStop) { messageTaskExecutorService.execute(new CommandTaskStopRunner(Minion.this, message)); } else if(message instanceof CommandTaskRevert) { messageTaskExecutorService.execute(new CommandTaskRevertRunner(Minion.this, message)); } else if(message instanceof CommandTaskDelete) { messageTaskExecutorService.execute(new CommandTaskDeleteRunner(Minion.this, message)); } else if(message instanceof CommandTaskReplicate) { messageTaskExecutorService.execute(new CommandTaskReplicateRunner(Minion.this, message)); } else if(message instanceof CommandTaskNew) { messageTaskExecutorService.execute(new CommandCreateNewTask(Minion.this, message)); } else if(message instanceof CommandTaskUpdateReplicas) { promoteDemoteTaskExecutorService.execute(new CommandTaskUpdateReplicasRunner(Minion.this, message)); } else { log.warn("[mq.core] unhandled type = {}", message.getClass().toString()); } } /** * import/update jobs from a given root */ int updateJobsMeta(File jobsRoot) throws IOException { int loaded = 0; File[] list = jobsRoot.isDirectory() ? jobsRoot.listFiles() : null; if (list == null || list.length == 0) { return 0; } for (File jobRoot : list) { loaded += updateJobMeta(jobRoot); } log.info("[import] {} tasks from directory '{}'", loaded, jobsRoot); return loaded; } /** * import/update job tasks given job root */ private int updateJobMeta(File jobRoot) throws IOException { if (!(jobRoot.isDirectory() && jobRoot.exists())) { return 0; } int loaded = 0; String jobID = jobRoot.getName(); for (File taskRoot : jobRoot.listFiles()) { loaded += updateTaskMeta(jobID, taskRoot) ? 1 : 0; } return loaded; } /** * update a single task from a task root dir */ private boolean updateTaskMeta(String jobID, File taskRoot) throws IOException { if (!(taskRoot.isDirectory() && taskRoot.exists())) { return false; } Integer taskID = LessNumbers.parseInt(10, taskRoot.getName(), -1); if (taskID < 0) { log.warn("[task.update] invalid task root {}", taskRoot); return false; } JobKey key = new JobKey(jobID, taskID); JobTask task = tasks.get(key.toString()); if (task == null) { task = new JobTask(Minion.this); tasks.put(key.toString(), task); } if (task.restoreTaskState(taskRoot)) { log.info("[import.task] {} as {}", key, task.isRunning() ? "running" : "idle"); return true; } else { log.warn("[import.task] {} failed", key); return false; } } void writeState() { minionStateLock.lock(); try { LessFiles.write(stateFile, LessBytes.toBytes(CodecJSON.encodeString(this)), false); } catch (IOException io) { log.warn("Error writing minion state to disk: ", io); /* assume disk failure: set diskReadOnly=true and exit */ diskHealthCheck.onFailure(); } finally { minionStateLock.unlock(); } } private HostState createHostState() { long time = System.currentTimeMillis(); HostState status = new HostState(uuid); status.setHost(myHost); status.setPort(getJettyPort()); status.setGroup(group); status.setTime(time); status.setUser(user); status.setPath(path); status.setDiskReadOnly(diskReadOnly); status.setUptime(time - startTime); capacityLock.lock(); try { int availSlots = maxActiveTasks - activeTaskKeys.size(); status.setAvailableTaskSlots(Math.max(0, availSlots)); } finally { capacityLock.unlock(); } status.setUsed(new HostCapacity(0, 0, 0, diskTotal.get() - diskFree.get())); status.setMax(new HostCapacity(0, 0, 0, diskTotal.get())); LinkedList<JobKey> running = new LinkedList<>(); LinkedList<JobKey> replicating = new LinkedList<>(); LinkedList<JobKey> backingUp = new LinkedList<>(); LinkedList<JobKey> stoppedTasks = new LinkedList<>(); LinkedList<JobKey> incompleteReplicas = new LinkedList<>(); for (JobTask job : tasks.values()) { try { status.addJob(job.getJobKey().getJobUuid()); if (job.isRunning()) { running.add(job.getJobKey()); } else if (job.isReplicating() && job.isProcessRunning(job.replicatePid)) { replicating.add(job.getJobKey()); } else if (job.isBackingUp()) { backingUp.add(job.getJobKey()); } else if (job.getLiveDir().exists()) { if (job.isComplete()) { stoppedTasks.add(job.getJobKey()); } else { incompleteReplicas.add(job.getJobKey()); } } } catch (Exception ex) { log.warn("Failed to detect status of job {}; omitting from host state", job, ex); } } status.setRunning(running.toArray(new JobKey[running.size()])); status.setReplicating(replicating.toArray(new JobKey[replicating.size()])); status.setBackingup(backingUp.toArray(new JobKey[backingUp.size()])); status.setStopped(stoppedTasks.toArray(new JobKey[stoppedTasks.size()])); status.setIncompleteReplicas(incompleteReplicas.toArray(new JobKey[incompleteReplicas.size()])); LinkedList<JobKey> queued = new LinkedList<>(); minionStateLock.lock(); try { for (CommandTaskKick kick : jobQueue) { queued.add(kick.getJobKey()); } } finally { minionStateLock.unlock(); } status.setQueued(queued.toArray(new JobKey[queued.size()])); status.setMeanActiveTasks(activeTaskHistogram.mean() / (maxActiveTasks > 0 ? maxActiveTasks : 1)); status.setMaxTaskSlots(maxActiveTasks); status.setMinionTypes(minionTypes); status.setUpdated(); return status; } public void sendHostStatus() { updateHostConfig(createHostState()); } public String getUUID() { return uuid; } // Not Thread Safe! private CuratorFramework getZkClient() { if (zkClient == null) { zkClient = ZkUtil.makeStandardClient(); } zkClient.getConnectionStateListenable().addListener((client, newState) -> { if (newState == ConnectionState.RECONNECTED) { joinGroup(); } }); return zkClient; } @VisibleForTesting protected void closeZkClient() { if (zkClient != null) { zkClient.close(); } } protected void joinGroup() { minionGroupMembership = new ZkGroupMembership(getZkClient(), true); String upPath = MINION_ZK_PATH + "up"; try { if (zkClient.checkExists().forPath(upPath) == null) { zkClient.create().creatingParentsIfNeeded().forPath(upPath, null); } } catch (KeeperException.NodeExistsException e) { // someone beat us to it } catch (Exception e) { log.error("Exception joining group", e); } log.info("joining group: {}", upPath); minionGroupMembership.addToGroup(upPath, getUUID(), shutdown); } void sendControlMessage(HostMessage msg) { try { synchronized (jmsxmitlock) { if (batchControlProducer != null) { batchControlProducer.sendMessage(msg, msg.getHostUuid()); } } } catch (Exception ex) { log.error("[mq.ctrl.send] fail <INITIATING JVM SHUTDOWN>", ex); shutdown(); } } void sendStatusMessage(HostMessage msg) { try { synchronized (jmsxmitlock) { if (batchControlProducer != null) { batchControlProducer.sendMessage(msg, "SPAWN"); } } } catch (IOException ex) { throw new UncheckedIOException(ex); } } private void updateHostConfig(HostMessage msg) { boolean sent = false; for (int i = 0; i < sendStatusRetries; i++) { synchronized (jmsxmitlock) { try { if (shutdown.get()) { return; // Interrupt any existing status updates; we'll send one during the shutdown event anyway } if (zkBatchControlProducer != null) { // TODO: move to /minion/state/ or some other dir zkBatchControlProducer.sendMessage(msg, MINION_ZK_PATH + uuid); } sent = true; break; } catch (Exception ex) { log.warn("[mq.ctrl.send] exception", ex); if (i < sendStatusRetries - 1) { log.warn("[mq.ctrl.send] fail on try {}; retrying", i); } sendStatusFailCount.inc(); } } try { Thread.sleep(sendStatusRetryDelay); } catch (InterruptedException ie) { log.warn("[mq.ctrl.send] interrupted during retry delay"); } } if (!sent) { sendStatusFailAfterRetriesCount.inc(); log.error("[mq.ctrl.send] fail after retrying <INITIATING JVM SHUTDOWN>"); shutdown(); } } void removeJobFromQueue(JobKey key, boolean sendToSpawn) { minionStateLock.lock(); try { for (Iterator<CommandTaskKick> iter = jobQueue.iterator(); iter.hasNext(); ) { CommandTaskKick kick = iter.next(); if (kick.getJobKey().matches(key)) { log.info("[task.stop] removing from queue {} kick={} key={}", kick.getJobKey(), kick, key); if (sendToSpawn) { try { sendStatusMessage(new StatusTaskEnd(uuid, kick.getJobUuid(), kick.getNodeID(), 0, 0, 0)); } catch (Exception ex) { log.warn("---> send fail {} {} {}", uuid, key, kick, ex); } } iter.remove(); } } } finally { minionStateLock.unlock(); } } JobTask createNewTask(String jobID, int node) throws ExecException { JobTask task = new JobTask(Minion.this); task.id = jobID; task.node = node; task.taskRoot = new File(rootDir, task.id + "/" + task.node); log.info("[task.new] restore {}/{} root={}", task.id, task.node, task.taskRoot); tasks.put(task.getJobKey().toString(), task); task.initializeFileVariables(); writeState(); return task; } /** * Delete a series of files. * * @param files Files to delete * @return False only if some file existed and could not be deleted */ boolean deleteFiles(File... files) { for (File file : files) { if (file != null && file.exists()) { if (ProcessUtils.shell(MacUtils.rmcmd + " -rf " + file.getAbsolutePath(), rootDir) != 0) { return false; } } } return true; } public boolean getShutdown() { return shutdown.get(); } @Override public void close() throws Exception { if (!shutdown.getAndSet(true)) { log.info("[minion] stopping"); jetty.stop(); if (runner != null) { runner.stopTaskRunner(); } try { Thread.sleep(1000); } catch (InterruptedException ex) { log.warn("Minion interrupted while sleeping (for mystery reasons) during shutdown: ", ex); } disconnectFromMQ(); MoreExecutors.shutdownAndAwaitTermination(messageTaskExecutorService, 120, TimeUnit.SECONDS); MoreExecutors.shutdownAndAwaitTermination(promoteDemoteTaskExecutorService, 120, TimeUnit.SECONDS); minionTaskDeleter.stopDeletionThread(); if ((zkClient != null) && (zkClient.getState() == CuratorFrameworkState.STARTED)) { minionGroupMembership.removeFromGroup("/minion/up", getUUID()); zkClient.close(); } writeState(); } } }