package cz.cuni.mff.d3s.been.hostruntime;
import static cz.cuni.mff.d3s.been.cluster.Names.ACTION_QUEUE_NAME;
import static cz.cuni.mff.d3s.been.core.TaskPropertyNames.*;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import cz.cuni.mff.d3s.been.bpk.*;
import cz.cuni.mff.d3s.been.cluster.Names;
import cz.cuni.mff.d3s.been.cluster.Service;
import cz.cuni.mff.d3s.been.cluster.ServiceException;
import cz.cuni.mff.d3s.been.cluster.context.ClusterContext;
import cz.cuni.mff.d3s.been.cluster.context.Tasks;
import cz.cuni.mff.d3s.been.core.protocol.command.CommandEntry;
import cz.cuni.mff.d3s.been.core.protocol.command.CommandEntryState;
import cz.cuni.mff.d3s.been.core.protocol.messages.BaseMessage;
import cz.cuni.mff.d3s.been.core.protocol.messages.DeleteTaskWrkDirMessage;
import cz.cuni.mff.d3s.been.core.protocol.messages.KillTaskMessage;
import cz.cuni.mff.d3s.been.core.protocol.messages.RunTaskMessage;
import cz.cuni.mff.d3s.been.core.ri.RuntimeInfo;
import cz.cuni.mff.d3s.been.core.task.TaskDescriptor;
import cz.cuni.mff.d3s.been.core.task.TaskEntry;
import cz.cuni.mff.d3s.been.core.task.TaskProperty;
import cz.cuni.mff.d3s.been.hostruntime.task.*;
import cz.cuni.mff.d3s.been.hostruntime.tasklogs.TaskLogHandler;
import cz.cuni.mff.d3s.been.mq.IMessageReceiver;
import cz.cuni.mff.d3s.been.mq.IMessageSender;
import cz.cuni.mff.d3s.been.mq.MessageQueues;
import cz.cuni.mff.d3s.been.mq.MessagingException;
import cz.cuni.mff.d3s.been.socketworks.MessageDispatcher;
import cz.cuni.mff.d3s.been.socketworks.NamedSockets;
import cz.cuni.mff.d3s.been.swrepoclient.SwRepoClientFactory;
import cz.cuni.mff.d3s.been.util.ZipUtil;
/**
* Manages all Host Runtime task processes.
* <p/>
* All good names taken, so 'Process' is used.
*
* @author Martin Sixta
* @author donarus
*/
final class ProcessManager implements Service {
/**
* Logger
*/
private static final Logger log = LoggerFactory.getLogger(ProcessManager.class);
private static final String STD_ERR_REDIRECT_FILENAME = "stderr.log";
private static final String STD_OUT_REDIRECT_FILENAME = "stdout.log";
/**
* Host Runtime info
*/
private RuntimeInfo hostInfo;
/**
* Connection to the cluster.
*/
private ClusterContext clusterContext;
/**
* Manages software resources.
*/
private SoftwareResolver softwareResolver;
/**
* Shortcut to task cluster context.
*/
private Tasks clusterTasks;
/**
* Thread dispatching task action messages.
*/
TaskActionThread taskActionThread;
/**
* Context of the Host Runtime
*/
private ProcessManagerContext tasks;
private final MessageDispatcher messageDispatcher;
/**
* Creates new instance.
* <p/>
* Call {@link #start()} to fire it up, {@link #stop()} to get rid of it.
*
* @param clusterContext
* connection to the cluster
* @param swRepoClientFactory
* connection to the Software Repository
* @param hostInfo
* Information about the current Host Runtime
*/
ProcessManager(ClusterContext clusterContext, SwRepoClientFactory swRepoClientFactory, RuntimeInfo hostInfo) {
this.clusterContext = clusterContext;
this.hostInfo = hostInfo;
this.softwareResolver = new SoftwareResolver(clusterContext.getServices(), swRepoClientFactory);
this.clusterTasks = clusterContext.getTasks();
this.tasks = new ProcessManagerContext(clusterContext, hostInfo);
this.messageDispatcher = MessageDispatcher.create("localhost");
}
/**
* Starts processing messages and tasks.
*/
@Override
public void start() throws ServiceException {
startTaskActionThread();
startMessageDispatcher();
}
/**
* Starts the {@link TaskActionThread}
*/
private void startTaskActionThread() throws ServiceException {
taskActionThread = new TaskActionThread();
taskActionThread.start();
}
/**
* Starts the {@link MessageDispatcher}
*/
private void startMessageDispatcher() throws ServiceException {
messageDispatcher.addReceiveHandler(NamedSockets.TASK_LOG_0MQ.getName(), TaskLogHandler.create(clusterContext));
messageDispatcher.addReceiveHandler(
NamedSockets.TASK_RESULT_PERSIST_0MQ.getName(),
ResultHandler.create(clusterContext));
messageDispatcher.addRespondingHandler(
NamedSockets.TASK_CHECKPOINT_0MQ.getName(),
CheckpointHandlerFactory.create(clusterContext));
messageDispatcher.addRespondingHandler(
NamedSockets.TASK_RESULT_QUERY_0MQ.getName(),
PersistenceQueryHandlerFactory.create(clusterContext));
messageDispatcher.start();
}
/**
* Stops processing, kills all remaining running processes
*/
@Override
public void stop() {
stopMessageDispatcher();
stopTaskActionThread();
// Kill all remaining running clusterTasks
tasks.killRunningTasks();
}
/**
* Stops the {@link MessageDispatcher}
*/
private void stopMessageDispatcher() {
log.debug("Stopping message dispatcher...");
messageDispatcher.stop();
log.debug("Message dispatcher stopped.");
}
/**
* Stops the {@link TaskActionThread}
*/
private void stopTaskActionThread() {
log.debug("Stopping task action thread");
try {
taskActionThread.poison();
taskActionThread.join();
} catch (InterruptedException e) {
e.printStackTrace();
}
log.debug("Task action thread stopped");
}
/**
* Handles RunTaskMessage.
* <p/>
* Tries to run a task.
*
* @param message
* message carrying the information
*/
void onRunTask(RunTaskMessage message) {
TaskEntry taskHandle = loadTask(message.taskId);
if (taskHandle == null) {
log.warn("No such task to run: {}", message.taskId);
} else {
runTask(taskHandle);
}
}
/**
* Handles KillTaskMessage.
* <p/>
* Tries to kill a task.
*
* @param message
* message carrying the information
*/
synchronized void onKillTask(KillTaskMessage message) {
TaskEntry taskEntry = loadTask(message.taskId);
if (taskEntry == null) {
log.warn("No such task to kill: {}", message.taskId);
} else {
tasks.killTask(taskEntry.getId());
}
}
/**
* Returns cluster-wide identifier of this Host Runtime.
*
* @return node identifier
*/
public String getNodeId() {
return hostInfo.getId();
}
private void runTask(TaskEntry taskEntry) {
String id = taskEntry.getId();
TaskHandle taskHandle = new TaskHandle(taskEntry, clusterContext);
try {
tasks.tryAcceptTask(taskHandle);
} catch (Exception e) {
taskHandle.reSubmit("Cannot accept the task on %s. Reason: %s", getNodeId(), e.getMessage());
log.info("Cannot run task {}", taskHandle.getTaskId());
return;
}
File taskDir = createTaskDir(taskEntry);
tasks.updateTaskDirs();
try {
TaskProcess process = createTaskProcess(taskEntry, taskDir);
tasks.addTask(id, process);
if (process.isDebugListeningMode()) {
taskHandle.setDebug(process.getDebugPort(), process.isSuspended());
}
taskHandle.setRunning(process);
int exitValue = process.start();
taskHandle.setFinished(exitValue);
try {
process.close();
FileUtils.deleteDirectory(taskDir);
tasks.updateTaskDirs();
} catch (IOException e) {
String msg = String.format(
"Task directory '%s' for task '%s' has not been deleted due to underlying exception.",
taskDir,
id);
log.warn(msg, e);
}
} catch (TaskException e) {
String msg = String.format("Task '%s' has been aborted due to underlying exception.", id);
log.error(msg, e);
taskHandle.setAborted(msg, e.getExitValue());
} catch (Exception e) {
String msg = String.format("Task '%s' has been aborted due to underlying exception.", id);
log.error(msg, e);
taskHandle.setAborted(msg);
} finally {
tasks.removeTask(taskHandle);
}
}
/**
* Creates a new task processes.
* <p/>
* TODO: Refactoring might be useful. Fortunately the mess is concentrated
* only in this function
*
* @param taskEntry
* entry associated with the new process
* @param taskDirectory
* root directory of the task
* @return task process representation
* @throws IOException
* @throws BpkConfigurationException
* @throws TaskException
*/
private synchronized
TaskProcess
createTaskProcess(TaskEntry taskEntry, File taskDirectory) throws IOException, BpkConfigurationException, TaskException {
TaskDescriptor taskDescriptor = taskEntry.getTaskDescriptor();
Bpk bpk = getBpk(taskDescriptor);
ZipUtil.unzipToDir(bpk.getInputStream(), taskDirectory);
// obtain bpk configuration
Path taskWrkDir = taskDirectory.toPath();
// obtain runtime information
BpkRuntime runtime = getBpkRuntime(taskDirectory);
// create process for the task
CmdLineBuilder cmdLineBuilder = CmdLineBuilderFactory.create(runtime, taskDescriptor, taskDirectory);
// create dependency downloader
DependencyDownloader dependencyDownloader = DependencyDownloaderFactory.create(runtime);
// create streams to redirect stdout and stderr to
OutputStream stdOutFileOutputStream = new FileOutputStream(new File(taskDirectory, STD_OUT_REDIRECT_FILENAME));
OutputStream stdErrFileOutputStream = new FileOutputStream(new File(taskDirectory, STD_ERR_REDIRECT_FILENAME));
// let the compiler optimize this out
String taskId = taskEntry.getId();
String contextId = taskEntry.getTaskContextId();
String benchmarkId = taskEntry.getBenchmarkId();
TaskStdInOutHandler stdOutHandler = new TaskStdInOutHandler(taskId, contextId, benchmarkId, "stdout", stdOutFileOutputStream);
TaskStdInOutHandler stdErrHandler = new TaskStdInOutHandler(taskId, contextId, benchmarkId, "stderr", stdErrFileOutputStream);
// create environment properties
Map<String, String> environment = createEnvironmentProperties(taskEntry);
TaskProcess taskProcess = new TaskProcess(cmdLineBuilder, taskWrkDir, environment, stdOutHandler, stdErrHandler, dependencyDownloader);
long timeout = determineTimeout(taskDescriptor);
taskProcess.setTimeout(timeout);
return taskProcess;
}
private Bpk getBpk(TaskDescriptor taskDescriptor) throws TaskException {
BpkIdentifier bpkIdentifier = BpkIdentifierCreator.createBpkIdentifier(taskDescriptor);
return softwareResolver.getBpk(bpkIdentifier);
}
private BpkRuntime getBpkRuntime(File workingDirectory) throws BpkConfigurationException {
// obtain bpk configuration
Path configPath = workingDirectory.toPath().resolve(BpkNames.CONFIG_FILE);
BpkConfiguration bpkConfiguration = BpkConfigUtils.fromXml(configPath);
return bpkConfiguration.getRuntime();
}
private long determineTimeout(TaskDescriptor td) {
return td.isSetFailurePolicy() ? td.getFailurePolicy().getTimeoutRun() : TaskProcess.NO_TIMEOUT;
}
private Map<String, String> createEnvironmentProperties(TaskEntry taskEntry) {
Map<String, String> properties = new TreeMap<String, String>(System.getenv());
properties.putAll(messageDispatcher.getBindings());
// Task specific properties
properties.put(LOGGER, System.getProperty(LOGGER));
properties.put(TASK_ID, taskEntry.getId());
properties.put(CONTEXT_ID, taskEntry.getTaskContextId());
properties.put(BENCHMARK_ID, taskEntry.getBenchmarkId());
// add properties specified in the TaskDescriptor
TaskDescriptor td = taskEntry.getTaskDescriptor();
if (td.isSetProperties() && td.getProperties().isSetProperty()) {
for (TaskProperty property : td.getProperties().getProperty()) {
String value = property.getValue();
if (value == null) {
value = ""; // must not be null, issue #146
}
properties.put(property.getName(), value);
}
}
return properties;
}
private File createTaskDir(TaskEntry taskEntry) {
String taskDirName = taskEntry.getTaskDescriptor().getName() + "_" + taskEntry.getId();
File taskDir = new File(hostInfo.getTasksWorkingDirectory(), taskDirName);
// TODO check return value
taskDir.mkdirs();
return taskDir;
}
private TaskEntry loadTask(String taskId) {
return clusterTasks.getTask(taskId);
}
/**
* Thread listening for task action messages. Dispatches messages to its
* handlers.
* <p/>
* The thread is an inner class for easy access to the ProcessManager.
*/
private class TaskActionThread extends Thread {
final MessageQueues queues;
private final Logger log = LoggerFactory.getLogger(TaskActionThread.class);
// TODO Consider use of ExecutorService for task handling
TaskActionThread() {
this.queues = MessageQueues.getInstance();
}
@Override
public void run() {
IMessageReceiver<BaseMessage> receiver;
try {
receiver = queues.getReceiver(ACTION_QUEUE_NAME);
} catch (MessagingException e) {
String msg = String.format("Cannot start %s", TaskActionThread.class);
log.error(msg, e);
return;
}
while (!Thread.interrupted()) {
try {
final BaseMessage msg = receiver.receive();
if (msg instanceof RunTaskMessage) {
// spawn a new thread for the task, it might take a while
new Thread() {
@Override
public void run() {
onRunTask((RunTaskMessage) msg);
}
}.start();
} else if (msg instanceof KillTaskMessage) {
onKillTask((KillTaskMessage) msg);
} else if (msg instanceof PoisonMessage) {
break;
} else if (msg instanceof MonitoringSampleMessage) {
tasks.updateMonitoringSample(((MonitoringSampleMessage) msg).getSample());
} else if (msg instanceof DeleteTaskWrkDirMessage) {
onDeleteTaskWrkDir((DeleteTaskWrkDirMessage) msg);
} else {
log.warn("Host Runtime does not know how to handle message of type {}", msg.getClass());
}
} catch (MessagingException e) {
log.error("Error receiving a message", e);
} catch (Exception e) {
log.error("Unknown error", e);
break;
}
}
log.info("Processing of Task Action Messages stopped");
}
public void poison() {
IMessageSender<BaseMessage> sender = null;
try {
sender = queues.createSender(ACTION_QUEUE_NAME);
PoisonMessage msg = new PoisonMessage("0");
sender.send(msg);
} catch (MessagingException e) {
log.error("Cannot poison Task Action queue", e);
} finally {
if (sender != null) {
sender.close();
}
}
}
}
private void onDeleteTaskWrkDir(DeleteTaskWrkDirMessage msg) {
String description = String.format("DELETE TASK WORKING DIRECTORY '%s'", msg.taskWrkDirName);
Map<Long, CommandEntry> commandEntries = clusterContext.getMap(Names.BEEN_MAP_COMMAND_ENTRIES);
String runtimeId = hostInfo.getId();
commandEntries.put(
msg.operationId,
new CommandEntry(runtimeId, description, CommandEntryState.PENDING, msg.operationId));
CommandEntry commandEntry;
try {
Path tasksRoot = Paths.get(hostInfo.getTasksWorkingDirectory());
Path dirToDelete = Paths.get(msg.taskWrkDirName).toAbsolutePath(); // must use absolute path!
if (dirToDelete.startsWith(tasksRoot)) {
FileUtils.deleteDirectory(dirToDelete.toFile());
tasks.updateTaskDirs();
commandEntry = new CommandEntry(runtimeId, description, CommandEntryState.FINISHED, msg.operationId);
} else {
String errorMsg = String.format(
"Cannot delete task working directory '%s' because it is not a task directory",
msg.taskWrkDirName);
log.error(errorMsg);
commandEntry = new CommandEntry(runtimeId, errorMsg, CommandEntryState.FAILED, msg.operationId);
}
} catch (IOException | InvalidPathException e) {
String errorMsg = String.format("Cannot delete task working directory '%s'", msg.taskWrkDirName);
log.error(errorMsg, e);
commandEntry = new CommandEntry(runtimeId, errorMsg, CommandEntryState.FAILED, msg.operationId);
}
commandEntries.put(msg.operationId, commandEntry);
}
/**
* Poison message for the task action thread
*/
private static class PoisonMessage extends BaseMessage {
public PoisonMessage(String receiverId) {
super(receiverId);
}
}
}