/**
* Copyright (c) 2015 Genome Research Ltd.
*
* Author: Cancer Genome Project cgpit@sanger.ac.uk
*
* This file is part of WwDocker.
*
* WwDocker is free software: you can redistribute it and/or modify it under
* the terms of the GNU Affero General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option) any
* later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
* details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* 1. The usage of a range of years within a copyright statement contained within
* this distribution should be interpreted as being equivalent to a list of years
* including the first and last year specified and all consecutive years between
* them. For example, a copyright statement that reads 'Copyright (c) 2005, 2007-
* 2009, 2011-2012' should be interpreted as being identical to a statement that
* reads 'Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012' and a copyright
* statement that reads "Copyright (c) 2005-2012' should be interpreted as being
* identical to a statement that reads 'Copyright (c) 2005, 2006, 2007, 2008,
* 2009, 2010, 2011, 2012'."
*/
package uk.ac.sanger.cgp.wwdocker.daemon;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeoutException;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import uk.ac.sanger.cgp.wwdocker.Config;
import uk.ac.sanger.cgp.wwdocker.actions.Local;
import uk.ac.sanger.cgp.wwdocker.callable.Docker;
import uk.ac.sanger.cgp.wwdocker.actions.Utils;
import uk.ac.sanger.cgp.wwdocker.beans.WorkerState;
import uk.ac.sanger.cgp.wwdocker.beans.WorkerResources;
import uk.ac.sanger.cgp.wwdocker.beans.WorkflowIni;
import uk.ac.sanger.cgp.wwdocker.enums.HostStatus;
import uk.ac.sanger.cgp.wwdocker.factories.WorkflowFactory;
import uk.ac.sanger.cgp.wwdocker.interfaces.Daemon;
import uk.ac.sanger.cgp.wwdocker.interfaces.Workflow;
import uk.ac.sanger.cgp.wwdocker.messages.Messaging;
/**
*
* @author kr2
*/
public class WorkerDaemon implements Daemon {
private static final Logger logger = LogManager.getLogger();
private static PropertiesConfiguration config;
private static Messaging messaging;
private static Docker dockerThread = null;
private static ExecutorService executor = null;
private static FutureTask<Integer> futureTask = null;
public WorkerDaemon(PropertiesConfiguration config, Messaging rmq) {
WorkerDaemon.config = config;
WorkerDaemon.messaging = rmq;
}
@Override
public void run(String mode) throws IOException, InterruptedException, TimeoutException, ConfigurationException {
WorkerResources hr = new WorkerResources();
logger.debug(Utils.objectToJson(hr));
Thread shutdownThread = null;
String qPrefix = config.getString("qPrefix");
File thisConfig = new File("/opt/wwdocker/"+ qPrefix + ".remote.cfg");
File thisJar = Utils.thisJarFile();
// build a local WorkerState
WorkerState thisState = new WorkerState(thisJar, thisConfig);
thisState.setStatus(HostStatus.CLEAN);
String hostName = thisState.getResource().getHostName();
// Remove from all queues as I'll set my state again now
messaging.removeFromStateQueue(qPrefix.concat(".").concat("BROKEN"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("CLEAN"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("DONE"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERROR"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOGS"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RECEIVE"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
// I'm running so send a message to the CLEAN queue
messaging.sendMessage(qPrefix.concat(".CLEAN"), thisState);
boolean firstCleanIter = true;
String myQueue = qPrefix.concat(".").concat(hostName);
int counter = 30;
Workflow workflowImp = new WorkflowFactory().getWorkflow(config);
int failedRmqGet = 0;
while (true) {
Thread.sleep(500); // don't eat cpu
//Only control messages will be sent directly to the host now
WorkerState recievedState = null;
try {
recievedState = (WorkerState) messaging.getWorkerState(myQueue, 10);
failedRmqGet = 0;
} catch( IOException e ) {
failedRmqGet++;
if(failedRmqGet == 10) {
logger.fatal("Failed to communicate with RMQ server 10 times, aborting.", e);
System.exit(1);
}
logger.warn("Failed to communicate with RMQ server, allowable for 10 iterations only.", e);
}
thisState.getResource().init();
if(recievedState != null) {
if(!recievedState.equals(thisState) && thisState.getStatus().equals(HostStatus.CLEAN)) {
messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()), hostName);
logger.fatal("Host refresh required, shutting down...");
System.exit(0);
}
if(recievedState.getChangeStatusTo() != null) {
if(recievedState.getChangeStatusTo().equals(HostStatus.KILL)) {
messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName); // this is never changed unless a host dies/killed
if(thisState.getStatus().equals(HostStatus.ERROR)) {
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOGS"), hostName);
}
if(!thisState.getStatus().equals(HostStatus.CLEAN)) {
if(shutdownThread == null) {
messaging.sendMessage(qPrefix.concat(".").concat("PEND"), Utils.objectToJson(thisState.getWorkflowIni()));
}
}
logger.fatal("FORCED SHUTDOWN...");
if(dockerThread != null) {
Local.execCommand("docker ps | tail -n +2 | cut -d ' ' -f 1 | xargs docker kill", Config.getEnvs(config), true);
futureTask.cancel(true);
executor.shutdownNow();
}
System.exit(0);
}
else if(recievedState.getChangeStatusTo().equals(HostStatus.CHECKIN)) {
logger.info(recievedState.toString());
messaging.sendMessage(recievedState.getReplyToQueue(), thisState);
}
else if(recievedState.getChangeStatusTo().equals(HostStatus.RUNNING)) {
// this is only sent if we want to retry the execution of an errored workflow
throw new RuntimeException("Restart attempted, I don't know how yet");
}
}
}
// then we do the actual work
if(thisState.getStatus().equals(HostStatus.CLEAN)) {
// clean up any other queues that may have legacy entries, boolean to prevent rapid query rates
if(firstCleanIter) {
messaging.removeFromStateQueue(qPrefix.concat(".").concat("DONE"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERROR"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOGS"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RECEIVE"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("BROKEN"), hostName);
firstCleanIter = false;
}
//We pull data from the wwd_PEND queue
WorkflowIni workIni = (WorkflowIni) messaging.getMessageObject(qPrefix.concat(".").concat("PEND"), WorkflowIni.class, 10);
if(workIni == null) {
continue;
}
logger.debug(thisState.toString());
thisState.setWorkflowIni(workIni);
shutdownThread = attachWorkIniShutdownHook(thisState.getWorkflowIni(), messaging, qPrefix, hostName);
workflowImp.cleanDockerPath(config); // clean up the workarea
dockerThread = new Docker(workIni, config);
futureTask = new FutureTask<>(dockerThread);
executor = Executors.newSingleThreadExecutor();
executor.execute(futureTask);
// this section saves having to check you've got it right
messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()), hostName);
thisState.setStatus(HostStatus.RUNNING);
messaging.sendMessage(qPrefix.concat(".").concat(thisState.getStatus().name()), thisState);
}
else if(thisState.getStatus().equals(HostStatus.RUNNING)) {
if(futureTask.isDone()) {
try {
int dockerExitCode = futureTask.get();
logger.info("Exit code: "+ dockerExitCode);
if(dockerExitCode == 0) {
thisState.setStatus(HostStatus.DONE);
messaging.sendMessage(qPrefix.concat(".").concat("UPLOADED"), thisState.getWorkflowIni());
}
else {
if(dockerThread.getLogArchive() != null) {
messaging.sendFile(qPrefix.concat(".").concat("ERRORLOGS"), hostName, dockerThread.getLogArchive());
}
thisState.setStatus(HostStatus.ERROR);
}
Runtime.getRuntime().removeShutdownHook(shutdownThread);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
messaging.sendMessage(qPrefix.concat(".").concat(thisState.getStatus().name()), thisState);
shutdownThread = null;
executor.shutdown();
dockerThread = null;
executor = null;
futureTask = null;
} catch (InterruptedException | ExecutionException | IOException e) {
logger.warn(e.getMessage(), e);
thisState.setStatus(HostStatus.ERROR);
}
}
}
else if(thisState.getStatus().equals(HostStatus.DONE)) {
/* if we need to handle working without GNOS access on images
then we need to change the logic here to wait for a
state change pushed from the control code */
messaging.removeFromStateQueue(qPrefix.concat(".").concat(thisState.getStatus().name()), hostName);
thisState.setStatus(HostStatus.CLEAN);
firstCleanIter = true;
thisState.setWorkflowIni(null);
messaging.sendMessage(qPrefix.concat(".").concat(thisState.getStatus().name()), thisState);
}
else if(thisState.getStatus().equals(HostStatus.ERROR)) {
if(counter == 60) {
logger.debug("I'm set to error, waiting for directions...");
counter = 0;
}
counter++;
Thread.sleep(500); // sleep at top too
}
else {
throw new RuntimeException("Don't know what to do yet");
}
}
}
private Thread attachWorkIniShutdownHook(WorkflowIni ini, Messaging messaging, String qPrefix, String hostName) {
Thread sdt = new Thread() {
@Override
public void run(){
try {
// We need to know which INI's may have been lost and which hosts they were on when things go odd.
// This way we can track which hosts may have issues unrelated to workflow state.
messaging.sendMessage(qPrefix.concat(".").concat("UNCLEAN"), Utils.objectToJson(ini));
// really not running if this has been executed.
messaging.removeFromStateQueue(qPrefix.concat(".").concat("BROKEN"), hostName); // broken means failed to provision
messaging.removeFromStateQueue(qPrefix.concat(".").concat("CLEAN"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERROR"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("ERRORLOG"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RECEIVE"), hostName);
messaging.removeFromStateQueue(qPrefix.concat(".").concat("RUNNING"), hostName);
}
catch(IOException | InterruptedException | TimeoutException e) {
throw new RuntimeException("Error while executing shutdownHook", e);
}
}
};
Runtime.getRuntime().addShutdownHook(sdt);
return sdt;
}
}