/*
 * ProActive Parallel Suite(TM):
 * The Open Source library for parallel and distributed
 * Workflows & Scheduling, Orchestration, Cloud Automation
 * and Big Data Analysis on Enterprise Grids & Clouds.
 *
 * Copyright (c) 2007 - 2017 ActiveEon
 * Contact: contact@activeeon.com
 *
 * This library is free software: you can redistribute it and/or
 * modify it under the terms of the GNU Affero General Public License
 * as published by the Free Software Foundation: version 3 of
 * the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * If needed, contact us to obtain a release under GPL Version 2 or 3
 * or a different license than the AGPL.
 */
package org.ow2.proactive.scheduler.task;

import java.io.File;
import java.io.Serializable;
import java.security.KeyPair;
import java.security.KeyPairGenerator;
import java.security.NoSuchAlgorithmException;
import java.security.PublicKey;
import java.security.SecureRandom;
import java.util.Collections;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import org.apache.log4j.Logger;
import org.objectweb.proactive.Body;
import org.objectweb.proactive.InitActive;
import org.objectweb.proactive.annotation.ImmediateService;
import org.objectweb.proactive.api.PAActiveObject;
import org.objectweb.proactive.core.util.ProActiveInet;
import org.objectweb.proactive.extensions.annotation.ActiveObject;
import org.objectweb.proactive.extensions.dataspaces.exceptions.FileSystemException;
import org.objectweb.proactive.extensions.dataspaces.vfs.selector.FileSelector;
import org.ow2.proactive.resourcemanager.nodesource.dataspace.DataSpaceNodeConfigurationAgent;
import org.ow2.proactive.scheduler.common.TaskTerminateNotification;
import org.ow2.proactive.scheduler.common.exception.SchedulerException;
import org.ow2.proactive.scheduler.common.exception.WalltimeExceededException;
import org.ow2.proactive.scheduler.common.task.TaskId;
import org.ow2.proactive.scheduler.common.task.TaskResult;
import org.ow2.proactive.scheduler.common.task.dataspaces.OutputAccessMode;
import org.ow2.proactive.scheduler.common.task.dataspaces.OutputSelector;
import org.ow2.proactive.scheduler.common.util.TaskLoggerRelativePathGenerator;
import org.ow2.proactive.scheduler.common.util.VariableSubstitutor;
import org.ow2.proactive.scheduler.common.util.logforwarder.AppenderProvider;
import org.ow2.proactive.scheduler.task.containers.ExecutableContainer;
import org.ow2.proactive.scheduler.task.context.NodeDataSpacesURIs;
import org.ow2.proactive.scheduler.task.context.TaskContext;
import org.ow2.proactive.scheduler.task.context.TaskContextVariableExtractor;
import org.ow2.proactive.scheduler.task.data.TaskDataspaces;
import org.ow2.proactive.scheduler.task.executors.TaskExecutor;
import org.ow2.proactive.scheduler.task.utils.Decrypter;
import org.ow2.proactive.scheduler.task.utils.WallTimer;
import org.ow2.proactive.scheduler.task.utils.task.termination.CleanupTimeoutGetter;
import org.ow2.proactive.scheduler.task.utils.task.termination.CleanupTimeoutGetterDoubleValue;
import org.ow2.proactive.scheduler.task.utils.task.termination.TaskKiller;

import com.google.common.base.Stopwatch;


/**
 * The node side of task execution:
 *  - communicates with the Scheduler via ProActive
 *  - deals with data transfers
 *  - deals with task killing and walltime
 *  - sends result back to the Scheduler
 */
@ActiveObject
public class TaskLauncher implements InitActive {

    private static final Logger logger = Logger.getLogger(TaskLauncher.class);

    /** Extracts the variables used to resolve file selectors and the fork working directory. */
    private final TaskContextVariableExtractor taskContextVariableExtractor = new TaskContextVariableExtractor();

    /** Creates the task dataspaces and the task executor; only set by the two-argument constructor. */
    private TaskLauncherFactory factory;

    /** Identifier of the task handled by this launcher, read from the initializer in {@link #initActivity(Body)}. */
    private TaskId taskId;

    private TaskLauncherInitializer initializer;

    private TaskLogger taskLogger;

    /** Bound to the thread that ran {@link #initActivity(Body)}, or replaced in {@link #doTask} for run-as-user tasks. */
    private TaskKiller taskKiller;

    /** Remains {@code null} until {@link #generatePublicKey()} has been called. */
    private Decrypter decrypter;

    private ProgressFileReader progressFileReader;

    /** JVM shutdown hook that calls {@link #kill()}; registered only for the duration of {@link #doTask}. */
    private Thread nodeShutdownHook;

    /**
     * Needed for ProActive but should never be used manually to create an instance of the object.
     */
    public TaskLauncher() {
        // Needed for ProActive but should never be used manually to create an instance of the object.
    }

    /**
     * Creates a launcher for the task described by the initializer.
     *
     * @param initializer task settings (task id, walltime, ping settings, file selectors, ...)
     * @param factory     factory used by {@link #doTask} to create the dataspaces and the executor
     */
    public TaskLauncher(TaskLauncherInitializer initializer, TaskLauncherFactory factory) {
        this(initializer);
        this.factory = factory;
    }

    /**
     * Creates a launcher without a factory.
     *
     * @param initializer task settings (task id, walltime, ping settings, file selectors, ...)
     */
    public TaskLauncher(TaskLauncherInitializer initializer) {
        this.initializer = initializer;
    }

    /**
     * Sets up the per-task helpers: task id, logger, progress file reader, task killer
     * (bound to the current thread) and the shutdown hook that kills the task.
     * The hook is only created here; it is registered later by {@link #doTask}.
     *
     * @param body the active object body (unused)
     */
    @Override
    public void initActivity(Body body) {
        this.taskId = initializer.getTaskId();
        this.taskLogger = new TaskLogger(taskId, getHostname());
        this.progressFileReader = new ProgressFileReader();
        this.taskKiller = new TaskKiller(Thread.currentThread(), new CleanupTimeoutGetter());
        nodeShutdownHook = new Thread(new Runnable() {
            @Override
            public void run() {
                kill();
            }
        });
    }

    /**
     * Method used to wait until the TaskLauncher is activated (i.e. the initActivity method has been run).
     *
     * @return dummy boolean value
     */
    public boolean isActivated() {
        return true;
    }

    /**
     * Runs the task: prepares the dataspaces, copies input files, executes the task,
     * copies output files and logs, and notifies the scheduler of the result.
     * <p>
     * Whatever the outcome, the progress reader, the task logger and the dataspaces are closed,
     * the cache space cleaning is unlocked, the shutdown hook is removed and the active object
     * is terminated.
     * <p>
     * No result is sent when the task was killed manually. When the walltime was reached,
     * a result wrapping a {@link WalltimeExceededException} is sent instead of the task result.
     *
     * @param executableContainer   the executable to run, with its credentials and run-as-user flag
     * @param previousTasksResults  results handed over to the task context
     * @param terminateNotification callback used to report the result to the scheduler
     */
    public void doTask(ExecutableContainer executableContainer, TaskResult[] previousTasksResults,
            TaskTerminateNotification terminateNotification) {

        logger.info("Task started " + taskId.getJobId().getReadableName() + " : " + taskId.getReadableName());

        this.taskKiller = this.replaceTaskKillerWithDoubleTimeoutValueIfRunAsMe(executableContainer.isRunAsUser());

        WallTimer wallTimer = new WallTimer(initializer.getWalltime(), taskKiller);

        // measures the execution time reported in failure and walltime results
        Stopwatch taskStopwatchForFailures = Stopwatch.createUnstarted();

        TaskResultImpl taskResult;

        TaskDataspaces dataspaces = null;

        try {
            addShutdownHook();
            // lock the cache space cleaning mechanism
            DataSpaceNodeConfigurationAgent.lockCacheSpaceCleaning();
            dataspaces = factory.createTaskDataspaces(taskId,
                                                      initializer.getNamingService(),
                                                      executableContainer.isRunAsUser());

            File taskLogFile = taskLogger.createFileAppender(dataspaces.getScratchFolder());

            progressFileReader.start(dataspaces.getScratchFolder(), taskId);

            TaskContext context = new TaskContext(executableContainer,
                                                  initializer,
                                                  previousTasksResults,
                                                  new NodeDataSpacesURIs(dataspaces.getScratchURI(),
                                                                         dataspaces.getCacheURI(),
                                                                         dataspaces.getInputURI(),
                                                                         dataspaces.getOutputURI(),
                                                                         dataspaces.getUserURI(),
                                                                         dataspaces.getGlobalURI()),
                                                  progressFileReader.getProgressFile().toString(),
                                                  getHostname(),
                                                  decrypter);

            File workingDir = getTaskWorkingDir(context, dataspaces);

            logger.info("Task working dir: " + workingDir);
            logger.info("Cache space: " + context.getNodeDataSpaceURIs().getCacheURI());
            logger.info("Input space: " + context.getNodeDataSpaceURIs().getInputURI());
            logger.info("Output space: " + context.getNodeDataSpaceURIs().getOutputURI());
            logger.info("User space: " + context.getNodeDataSpaceURIs().getUserURI());
            logger.info("Global space: " + context.getNodeDataSpaceURIs().getGlobalURI());
            logger.info("Scheduler rest url: " + context.getSchedulerRestUrl());

            wallTimer.start();

            dataspaces.copyInputDataToScratch(initializer.getFilteredInputFiles(fileSelectorsFilters(context))); // should handle interrupt

            if (decrypter != null) {
                decrypter.setCredentials(executableContainer.getCredentials());
            }

            TaskExecutor taskExecutor = factory.createTaskExecutor(workingDir);

            taskStopwatchForFailures.start();
            taskResult = taskExecutor.execute(context, taskLogger.getOutputSink(), taskLogger.getErrorSink());
            taskStopwatchForFailures.stop();

            // NOTE(review): both early returns below skip wallTimer.stop(), unlike the catch branch - confirm
            // whether the timer needs to be stopped on these paths as well.
            switch (taskKiller.getStatus()) {
                case WALLTIME_REACHED:
                    taskResult = getWalltimedTaskResult(taskStopwatchForFailures);
                    sendResultToScheduler(terminateNotification, taskResult);
                    return;
                case KILLED_MANUALLY:
                    // killed by Scheduler, no need to send results back
                    return;
                default:
                    // task was neither killed nor timed out: carry on with the output transfer
                    break;
            }

            dataspaces.copyScratchDataToOutput(initializer.getFilteredOutputFiles(fileSelectorsFilters(context,
                                                                                                       taskResult)));

            wallTimer.stop();

            copyTaskLogsToUserSpace(taskLogFile, dataspaces);
            taskResult.setLogs(taskLogger.getLogs());

            sendResultToScheduler(terminateNotification, taskResult);
        } catch (Throwable taskFailure) {
            wallTimer.stop();

            switch (taskKiller.getStatus()) {
                case WALLTIME_REACHED:
                    taskResult = getWalltimedTaskResult(taskStopwatchForFailures);
                    sendResultToScheduler(terminateNotification, taskResult);
                    break;
                case KILLED_MANUALLY:
                    // killed by Scheduler, no need to send results back
                    return;
                default:
                    logger.info("Failed to execute task", taskFailure);
                    // the stack trace goes to the task's error sink so that it is part of the task logs
                    taskFailure.printStackTrace(taskLogger.getErrorSink());
                    taskResult = new TaskResultImpl(taskId,
                                                    taskFailure,
                                                    taskLogger.getLogs(),
                                                    taskStopwatchForFailures.elapsed(TimeUnit.MILLISECONDS));
                    sendResultToScheduler(terminateNotification, taskResult);
            }
        } finally {
            try {
                progressFileReader.stop();
                taskLogger.close();

                if (dataspaces != null) {
                    dataspaces.close();
                }
                // unlocks the cache space cleaning thread
                DataSpaceNodeConfigurationAgent.unlockCacheSpaceCleaning();
                removeShutdownHook();
            } finally {
                // always terminate the active object, even if the cleanup above failed
                terminate();
            }
        }
    }

    /**
     * Chooses the task killer to use for the task.
     *
     * @param isRunAsUser whether the task runs under the user's own account
     * @return a new killer bound to the current thread and using {@link CleanupTimeoutGetterDoubleValue}
     *         when {@code isRunAsUser} is true, the current killer otherwise
     */
    private TaskKiller replaceTaskKillerWithDoubleTimeoutValueIfRunAsMe(boolean isRunAsUser) {
        if (isRunAsUser) {
            return new TaskKiller(Thread.currentThread(), new CleanupTimeoutGetterDoubleValue());
        } else {
            return this.taskKiller;
        }
    }

    /**
     * Registers the kill-on-shutdown hook; does nothing if the JVM is already shutting down.
     */
    private void addShutdownHook() {
        try {
            Runtime.getRuntime().addShutdownHook(nodeShutdownHook);
        } catch (IllegalStateException ignored) {
            // ignore
        }
    }

    /**
     * Unregisters the kill-on-shutdown hook; does nothing if the JVM is already shutting down.
     */
    private void removeShutdownHook() {
        try {
            Runtime.getRuntime().removeShutdownHook(nodeShutdownHook);
        } catch (IllegalStateException ignored) {
            // ignored
        }
    }

    /**
     * Builds the result reported when the walltime was reached.
     *
     * @param taskStopwatchForFailures stopwatch providing the elapsed execution time
     * @return a result wrapping a {@link WalltimeExceededException}
     */
    private TaskResultImpl getWalltimedTaskResult(Stopwatch taskStopwatchForFailures) {
        String message = "Walltime of " + initializer.getWalltime() + " ms reached on task " +
                         taskId.getReadableName();

        return getTaskResult(taskStopwatchForFailures, new WalltimeExceededException(message));
    }

    /**
     * Prints the exception message to the task's error sink and wraps the exception in a result.
     *
     * @param taskStopwatchForFailures stopwatch providing the elapsed execution time
     * @param exception                the failure to report
     * @return a failed result carrying the exception, the task logs and the elapsed time in milliseconds
     */
    private TaskResultImpl getTaskResult(Stopwatch taskStopwatchForFailures, SchedulerException exception) {
        taskLogger.getErrorSink().println(exception.getMessage());

        return new TaskResultImpl(taskId,
                                  exception,
                                  taskLogger.getLogs(),
                                  taskStopwatchForFailures.elapsed(TimeUnit.MILLISECONDS));
    }

    /**
     * Extracts the variables used to filter the output file selectors, including those of the task result.
     */
    private Map<String, Serializable> fileSelectorsFilters(TaskContext taskContext, TaskResult taskResult)
            throws Exception {
        return taskContextVariableExtractor.extractVariables(taskContext, taskResult, true);
    }

    /**
     * Extracts the variables used to filter the input file selectors.
     */
    private Map<String, Serializable> fileSelectorsFilters(TaskContext taskContext) throws Exception {
        return taskContextVariableExtractor.extractVariables(taskContext, true);
    }

    /**
     * Copies the task log file to the user space when the initializer asks for precious logs.
     * A {@link FileSystemException} during the copy is logged as a warning and not propagated.
     *
     * @param taskLogFile the log file created in the scratch folder
     * @param dataspaces  the dataspaces of the task
     */
    private void copyTaskLogsToUserSpace(File taskLogFile, TaskDataspaces dataspaces) {
        if (initializer.isPreciousLogs()) {
            try {
                FileSelector taskLogFileSelector = new FileSelector(taskLogFile.getName());
                taskLogFileSelector.setIncludes(new TaskLoggerRelativePathGenerator(taskId).getRelativePath());
                dataspaces.copyScratchDataToOutput(Collections.singletonList(new OutputSelector(taskLogFileSelector,
                                                                                                OutputAccessMode.TransferToUserSpace)));
            } catch (FileSystemException e) {
                logger.warn("Cannot copy logs of task to user data spaces", e);
            }
        }
    }

    /**
     * Resolves the directory the task is executed in.
     *
     * @param taskContext the task context, providing the optional fork environment
     * @param dataspaces  the dataspaces of the task
     * @return the working directory of the fork environment, after variable substitution, when one is
     *         configured; the scratch folder otherwise
     * @throws Exception if the variables cannot be extracted or substituted
     */
    private File getTaskWorkingDir(TaskContext taskContext, TaskDataspaces dataspaces) throws Exception {
        File workingDir = dataspaces.getScratchFolder();

        if (taskContext.getInitializer().getForkEnvironment() != null) {
            String workingDirPath = taskContext.getInitializer().getForkEnvironment().getWorkingDir();

            if (workingDirPath != null) {
                workingDirPath = VariableSubstitutor.filterAndUpdate(workingDirPath,
                                                                     taskContextVariableExtractor.extractVariables(taskContext,
                                                                                                                   true));
                workingDir = new File(workingDirPath);
            }
        }

        return workingDir;
    }

    /**
     * Notifies the scheduler that the task is finished, retrying on failure.
     * <p>
     * Nothing is sent when the JVM is shutting down. Otherwise the notification is attempted up to
     * {@code initializer.getPingAttempts()} times, sleeping {@code initializer.getPingPeriod()} seconds
     * between two attempts. When every attempt fails, an error is logged and the method returns normally.
     *
     * @param terminateNotification callback used to report the result
     * @param taskResult            the result to report
     */
    private void sendResultToScheduler(TaskTerminateNotification terminateNotification, TaskResultImpl taskResult) {
        if (isNodeShuttingDown()) {
            return;
        }

        int pingAttempts = initializer.getPingAttempts();
        int pingPeriodMs = initializer.getPingPeriod() * 1000;

        for (int i = 0; i < pingAttempts; i++) {
            try {
                terminateNotification.terminate(taskId, taskResult);
                logger.debug("Successfully notified task termination " + taskId);
                return;
            } catch (Throwable t) {
                logger.warn("Cannot notify task termination " + taskId + ", will try again in " + pingPeriodMs +
                            " ms", t);

                // no need to wait after the last attempt
                if (i != pingAttempts - 1) {
                    try {
                        Thread.sleep(pingPeriodMs);
                    } catch (InterruptedException e) {
                        // NOTE(review): the interrupt status is not restored and the retry loop goes on;
                        // presumably deliberate so that the result still gets delivered - confirm.
                        logger.error("Interrupted while waiting to notify task termination", e);
                    }
                }
            }
        }

        logger.error("Cannot notify task termination " + taskId + " after " + pingAttempts +
                     " attempts, terminating task launcher now");
    }

    /**
     * Tells whether the JVM is shutting down, by probing the shutdown hook registry:
     * registering a hook throws {@link IllegalStateException} once the shutdown has started.
     * The probe hook is removed right after a successful registration so that repeated calls
     * do not pile up hooks in the JVM.
     *
     * @return true if the JVM is in the process of shutting down
     */
    private boolean isNodeShuttingDown() {
        Thread probeHook = new Thread();
        try {
            Runtime.getRuntime().addShutdownHook(probeHook);
            // removal also throws IllegalStateException if the shutdown started in between
            Runtime.getRuntime().removeShutdownHook(probeHook);
        } catch (IllegalStateException e) {
            return true;
        }
        return false;
    }

    /**
     * Activates the forwarding of the task logs to the given sink.
     *
     * @param logSink provider of the appender receiving the logs
     */
    @ImmediateService
    public void activateLogs(AppenderProvider logSink) {
        taskLogger.resetLogContextForImmediateService();
        taskLogger.activateLogs(logSink);
    }

    /**
     * Hands the logs stored so far to the given sink.
     *
     * @param logSink provider of the appender receiving the logs
     */
    @ImmediateService
    public void getStoredLogs(AppenderProvider logSink) {
        taskLogger.resetLogContextForImmediateService();
        taskLogger.getStoredLogs(logSink);
    }

    /**
     * Generates a new RSA key pair, keeps the private key in a {@link Decrypter} that is later handed
     * to the task context, and returns the public key.
     *
     * @return the public key of the generated pair
     * @throws NoSuchAlgorithmException if no RSA key pair generator is available
     */
    public PublicKey generatePublicKey() throws NoSuchAlgorithmException {
        KeyPairGenerator keyGen;
        keyGen = KeyPairGenerator.getInstance("RSA");
        // NOTE(review): 1024-bit RSA is below current key size recommendations; the size is left
        // unchanged here because the code consuming the public key is not visible - consider 2048+.
        keyGen.initialize(1024, new SecureRandom());
        KeyPair keyPair = keyGen.generateKeyPair();
        decrypter = new Decrypter(keyPair.getPrivate());
        return keyPair.getPublic();
    }

    /**
     * Kills the task, marking it as killed manually.
     * Also invoked by the JVM shutdown hook registered while a task is running.
     */
    @ImmediateService
    public void kill() {
        taskLogger.resetLogContextForImmediateService();
        logger.info("Kill received for task");
        taskKiller.kill(TaskKiller.Status.KILLED_MANUALLY);
    }

    /**
     * Terminates the active object when called from within one.
     * Any exception raised by the termination is logged and not propagated.
     */
    private void terminate() {
        try {
            if (PAActiveObject.isInActiveObject()) {
                PAActiveObject.terminateActiveObject(false);
            }
        } catch (Exception e) {
            logger.info("Exception when terminating task launcher active object", e);
        }
        logger.info("Task terminated");
    }

    /**
     * @return the task progress as reported by the progress file reader
     */
    @ImmediateService
    public int getProgress() {
        return progressFileReader.getProgress();
    }

    /**
     * @return the host name of the local address reported by {@link ProActiveInet}
     */
    private static String getHostname() {
        return ProActiveInet.getInstance().getInetAddress().getHostName();
    }

}