/**
 * Copyright 2011 Intuit Inc. All Rights Reserved
 */
package com.intuit.tank.vmManager;

/*
 * #%L
 * VmManager
 * %%
 * Copyright (C) 2011 - 2015 Intuit Inc.
 * %%
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 * #L%
 */

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.intuit.tank.api.cloud.VMTracker;
import com.intuit.tank.api.model.v1.cloud.CloudVmStatus;
import com.intuit.tank.api.model.v1.cloud.CloudVmStatusContainer;
import com.intuit.tank.api.model.v1.cloud.VMStatus;
import com.intuit.tank.api.model.v1.cloud.ValidationStatus;
import com.intuit.tank.dao.VMImageDao;
import com.intuit.tank.project.VMInstance;
import com.intuit.tank.vm.api.enumerated.JobLifecycleEvent;
import com.intuit.tank.vm.api.enumerated.JobStatus;
import com.intuit.tank.vm.api.enumerated.VMImageType;
import com.intuit.tank.vm.event.JobEvent;
import com.intuit.tank.vm.settings.TankConfig;
import com.intuit.tank.vm.settings.VmManagerConfig;
import com.intuit.tank.vm.vmManager.VMInformation;
import com.intuit.tank.vm.vmManager.VMInstanceRequest;
import com.intuit.tank.vmManager.environment.amazon.AmazonInstance;

/**
 * AgentWatchdog
 * <p>
 * Runnable that watches the agent instances launched for a job. It works in two phases:
 * <ol>
 * <li>wait until every instance is described as "running", relaunching the stragglers once
 * {@code maxWaitForStart} has elapsed;</li>
 * <li>wait until every instance has reported a status to the {@link VMTracker}, rebooting the
 * silent ones once {@code maxWaitForResponse} has elapsed.</li>
 * </ol>
 * The loop ends when all agents have reported, when the tracker says the job is no longer
 * running, when the restart/reboot budget ({@code maxRestarts}) is exhausted, or when an
 * exception is thrown inside the loop.
 *
 * @author dangleton
 *
 */
public class AgentWatchdog implements Runnable {

    private static final Logger LOG = LogManager.getLogger(AgentWatchdog.class);

    private long sleepTime;
    private long maxWaitForStart;
    private long maxWaitForResponse;
    private int maxRestarts;

    private VMTracker vmTracker;
    private VMInstanceRequest instanceRequest;
    private List<VMInformation> vmInfo;
    private boolean stopped;
    // true while the watchdog is still in the "wait for instances to be running" phase
    private boolean checkForStart = true;
    // start of the current wait window; reset on every phase change, relaunch and reboot
    private long startTime;
    private int restartCount;
    private int rebootCount;
    private AmazonInstance amazonInstance;

    /**
     * Creates a watchdog for the given request and reads its timing limits from the
     * {@link VmManagerConfig}.
     *
     * @param instanceRequest
     *            the request the agents were launched with; supplies the job id and region
     * @param vmInfo
     *            the instances to watch; this list is modified in place when instances are relaunched
     * @param vmTracker
     *            tracker used to query job/agent status and to publish job events
     */
    public AgentWatchdog(VMInstanceRequest instanceRequest, List<VMInformation> vmInfo, VMTracker vmTracker) {
        this.instanceRequest = instanceRequest;
        this.vmInfo = vmInfo;
        this.vmTracker = vmTracker;
        this.startTime = System.currentTimeMillis();
        this.amazonInstance = new AmazonInstance(null, instanceRequest.getRegion());

        VmManagerConfig vmManagerConfig = new TankConfig().getVmManagerConfig();
        // The numeric arguments are presumably the fallbacks used when the setting is absent
        // (5 min, 3 min, 2 restarts, 30 s) -- TODO confirm against VmManagerConfig.
        this.maxWaitForResponse = vmManagerConfig.getMaxAgentReportMills(1000 * 60 * 5);
        this.maxWaitForStart = vmManagerConfig.getMaxAgentStartMills(1000 * 60 * 3);
        this.maxRestarts = vmManagerConfig.getMaxRestarts(2);
        this.sleepTime = vmManagerConfig.getWatchdogSleepTime(30 * 1000);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String toString() {
        return new ToStringBuilder(this).append("sleepTime", sleepTime).append("maxWaitForStart", maxWaitForStart)
                .append("maxWaitForResponse", maxWaitForResponse)
                .append("maxRestarts", maxRestarts).toString();
    }

    /**
     * Runs the watch loop until all agents have reported, the job stops, or the restart/reboot
     * budget is exhausted. Exceptions thrown inside the loop are logged and end the watchdog; they
     * are not propagated to the caller.
     */
    @Override
    public void run() {
        LOG.info("Starting WatchDog: " + this.toString());
        try {
            List<VMInformation> instances = new ArrayList<VMInformation>(vmInfo);
            while (rebootCount <= maxRestarts && restartCount <= maxRestarts && !stopped) {
                if (!vmTracker.isRunning(instanceRequest.getJobId())) {
                    break;
                }
                if (checkForStart) {
                    LOG.info("Checking for " + instances.size() + " running agents...");
                    removeRunningInstances(instances);
                    if (!instances.isEmpty()) {
                        if (shouldRelaunchInstances()) {
                            relaunch(instances);
                        } else {
                            LOG.info("Waiting for " + instances.size() + " agents to start: "
                                    + getInstanceIdList(instances));
                        }
                        Thread.sleep(sleepTime);
                        continue;
                    } else {
                        LOG.info("All Agents Started.");
                        vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), "All Agents Started.",
                                JobLifecycleEvent.AGENT_STARTED));
                        checkForStart = false;
                        // the report-back wait window starts now
                        startTime = System.currentTimeMillis();
                    }
                }
                // all instances are now started
                instances = new ArrayList<VMInformation>(vmInfo);
                String jobId = instanceRequest.getJobId();
                // check to see if all agents have reported back
                LOG.info("Checking for " + instances.size() + " reporting agents...");
                removeReportingInstances(jobId, instances);
                if (!instances.isEmpty()) {
                    if (shouldRebootInstances()) {
                        reboot(instances);
                    } else {
                        LOG.info("Waiting for " + instances.size() + " agents to report: "
                                + getInstanceIdList(instances));
                    }
                    Thread.sleep(sleepTime);
                    continue;
                } else {
                    LOG.info("All Agents Reported back.");
                    vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(),
                            "All Agents Reported Back and are ready to start load.",
                            JobLifecycleEvent.AGENT_REPORTED));
                    stopped = true;
                }
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so whoever owns this thread can still observe the
            // interruption after the watchdog exits.
            Thread.currentThread().interrupt();
            LOG.error("Error in Watchdog: " + e.toString(), e);
        } catch (Exception e) {
            LOG.error("Error in Watchdog: " + e.toString(), e);
        }
        LOG.info("Exiting Watchdog " + this.toString());
    }

    /**
     * Reboots the given instances if the reboot budget allows it, otherwise marks the watchdog as
     * stopped and aborts via {@link #killJob()}.
     *
     * @param instances
     *            the instances that started but have not reported status
     */
    private void reboot(List<VMInformation> instances) {
        rebootCount++;
        if (rebootCount <= maxRestarts) {
            String msg = "Have "
                    + instances.size()
                    + " agents that started but failed to report status correctly. rebooting "
                    + getInstanceIdList(instances);
            // NOTE(review): reboot publishes AGENT_RESTARTED while relaunch publishes AGENT_REBOOTED;
            // these look swapped -- confirm against JobLifecycleEvent consumers before changing.
            vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), msg, JobLifecycleEvent.AGENT_RESTARTED));
            LOG.info(msg);
            startTime = System.currentTimeMillis();
            amazonInstance.reboot(instances);
            // go back to the "wait for running" phase for the rebooted instances
            checkForStart = true;
        } else {
            stopped = true;
            String msg = "Have "
                    + instances.size()
                    + " agents that failed to report correctly and have exceeded the maximum number of restarts. Killing job.";
            vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), msg, JobLifecycleEvent.JOB_KILLED));
            LOG.info(msg);
            killJob();
        }
    }

    /**
     * Joins the given instances into a comma separated string for log and event messages.
     *
     * @param instances
     *            the instances to list
     * @return the elements of the list joined with ", "
     */
    private String getInstanceIdList(List<VMInformation> instances) {
        return StringUtils.join(instances, ", ");
    }

    /**
     * Removes from {@code instances} every instance whose tracked status shows it has reported:
     * either its VM status is running, or its job status is neither Unknown nor Starting.
     *
     * @param jobId
     *            the job whose status container is consulted
     * @param instances
     *            the list to prune; modified in place
     * @throws RuntimeException
     *             if the job has no status container or the container already has an end time
     */
    private void removeReportingInstances(String jobId, List<VMInformation> instances) {
        CloudVmStatusContainer vmStatusForJob = vmTracker.getVmStatusForJob(jobId);
        if (vmStatusForJob != null && vmStatusForJob.getEndTime() == null) {
            for (CloudVmStatus status : vmStatusForJob.getStatuses()) {
                if (status.getVmStatus() == VMStatus.running
                        || (status.getJobStatus() != JobStatus.Unknown
                        && status.getJobStatus() != JobStatus.Starting)) {
                    removeInstance(status.getInstanceId(), instances);
                }
            }
        } else {
            stopped = true;
            throw new RuntimeException("Job appears to have been stopped. Exiting...");
        }
    }

    /**
     * Kills the given instances and launches replacements if the restart budget allows it,
     * otherwise marks the watchdog as stopped and aborts via {@link #killJob()}.
     * <p>
     * On relaunch, {@code instances} is cleared and refilled with the newly created instances, and
     * the watched {@code vmInfo} list is updated accordingly.
     *
     * @param instances
     *            the instances that failed to start; modified in place
     */
    private void relaunch(List<VMInformation> instances) {
        restartCount++;
        if (restartCount <= maxRestarts) {
            startTime = System.currentTimeMillis();
            String msg = "Have " + instances.size() + " agents that failed to start correctly. Restarting "
                    + getInstanceIdList(instances);
            // NOTE(review): relaunch publishes AGENT_REBOOTED while reboot publishes AGENT_RESTARTED;
            // these look swapped -- confirm against JobLifecycleEvent consumers before changing.
            vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), msg, JobLifecycleEvent.AGENT_REBOOTED));
            LOG.info(msg);
            // relaunch instances and remove old ones from vmTracker
            // kill them first just to be sure
            amazonInstance.killInstances(instances);
            VMImageDao dao = new VMImageDao();
            for (VMInformation info : instances) {
                vmInfo.remove(info);
                vmTracker.setStatus(createTerminatedVmStatus(info));
                VMInstance image = dao.getImageByInstanceId(info.getInstanceId());
                if (image != null) {
                    image.setStatus(VMStatus.terminated.name());
                    dao.saveOrUpdate(image);
                }
            }
            // request exactly as many replacements as were killed
            instanceRequest.setNumberOfInstances(instances.size());
            List<VMInformation> newVms = new AmazonInstance(instanceRequest,
                    instanceRequest.getRegion()).create();
            instances.clear();
            for (VMInformation newInfo : newVms) {
                vmInfo.add(newInfo);
                instances.add(newInfo);
                vmTracker.setStatus(createCloudStatus(instanceRequest, newInfo));
                LOG.info("Added image (" + newInfo.getInstanceId() + ") to VMImage table");
                try {
                    new VMImageDao().addImageFromInfo(instanceRequest.getJobId(), newInfo,
                            instanceRequest.getRegion());
                } catch (Exception e) {
                    // best effort: a persistence failure must not abort the relaunch,
                    // but keep the stack trace so the failure can be diagnosed
                    LOG.warn("Error persisting VM Image: " + e, e);
                }
            }
        } else {
            stopped = true;
            String msg = "Have "
                    + instances.size()
                    + " agents that failed to start correctly and have exceeded the maximum number of restarts. Killing job.";
            vmTracker.publishEvent(new JobEvent(instanceRequest.getJobId(), msg, JobLifecycleEvent.JOB_ABORTED));
            LOG.info(msg);
            killJob();
        }
    }

    /**
     * Aborts the watchdog by throwing; the exception is caught and logged in {@link #run()}.
     * NOTE(review): despite the name, nothing is killed here -- confirm whether the published
     * JOB_KILLED / JOB_ABORTED events are what actually stop the job.
     *
     * @throws RuntimeException
     *             always
     */
    private void killJob() {
        throw new RuntimeException("Killing jobs and exiting");
    }

    /**
     * Builds the initial tracker status (job status Starting, VM status pending) for a newly
     * launched agent instance.
     *
     * @param req
     *            the request the instance was launched with
     * @param info
     *            the newly launched instance
     * @return the status to register with the tracker
     */
    private CloudVmStatus createCloudStatus(VMInstanceRequest req, VMInformation info) {
        CloudVmStatus ret = new CloudVmStatus(info.getInstanceId(), req.getJobId(),
                req.getInstanceDescription() != null ? req.getInstanceDescription().getSecurityGroup()
                        : "unknown",
                JobStatus.Starting,
                VMImageType.AGENT, req.getRegion(),
                VMStatus.pending, new ValidationStatus(), 0, 0, null, null);
        return ret;
    }

    /**
     * Builds the tracker status (job status Stopped, VM status terminated) for an instance that
     * has been killed prior to relaunch. Also logs the instance and the current request.
     *
     * @param info
     *            the killed instance
     * @return the status to register with the tracker
     */
    private CloudVmStatus createTerminatedVmStatus(VMInformation info) {
        LOG.info(info);
        LOG.info(instanceRequest);
        CloudVmStatus ret = new CloudVmStatus(info.getInstanceId(), instanceRequest.getJobId(), "unknown",
                JobStatus.Stopped,
                VMImageType.AGENT, instanceRequest.getRegion(),
                VMStatus.terminated, new ValidationStatus(), 0, 0, null, null);
        return ret;
    }

    /**
     * @return true if more than {@code maxWaitForStart} milliseconds have passed since
     *         {@code startTime}
     */
    private boolean shouldRelaunchInstances() {
        return startTime + maxWaitForStart < System.currentTimeMillis();
    }

    /**
     * @return true if more than {@code maxWaitForResponse} milliseconds have passed since
     *         {@code startTime}
     */
    private boolean shouldRebootInstances() {
        return startTime + maxWaitForResponse < System.currentTimeMillis();
    }

    /**
     * Removes from {@code instances} every instance that is described as being in the "running"
     * state.
     *
     * @param instances
     *            the list to prune; modified in place
     * @throws RuntimeException
     *             if the start wait has elapsed and the job has no status container or the
     *             container already has an end time
     */
    private void removeRunningInstances(List<VMInformation> instances) {
        List<String> instanceIds = new ArrayList<String>();
        CloudVmStatusContainer vmStatusForJob = vmTracker.getVmStatusForJob(instanceRequest.getJobId());
        // only treat a missing/ended status container as "job stopped" once the start wait has elapsed
        if (shouldRelaunchInstances() && (vmStatusForJob == null || vmStatusForJob.getEndTime() != null)) {
            stopped = true;
            throw new RuntimeException("Job appears to have been stopped. Exiting...");
        }

        for (VMInformation info : instances) {
            instanceIds.add(info.getInstanceId());
        }
        List<VMInformation> foundInstances = amazonInstance.describeInstances(instanceIds
                .toArray(new String[instanceIds.size()]));
        for (VMInformation info : foundInstances) {
            if ("running".equalsIgnoreCase(info.getState())) {
                removeInstance(info.getInstanceId(), instances);
            }
        }
    }

    /**
     * Removes every entry with the given instance id from the list.
     *
     * @param foundInstanceId
     *            the id of the instance to remove
     * @param instances
     *            the list to prune; modified in place
     */
    private void removeInstance(String foundInstanceId, List<VMInformation> instances) {
        for (int i = instances.size(); --i >= 0;) {// count down loop so no concurrent modification
            if (foundInstanceId.equals(instances.get(i).getInstanceId())) {
                instances.remove(i);
            }
        }
    }
}