/*
* ProActive Parallel Suite(TM):
* The Open Source library for parallel and distributed
* Workflows & Scheduling, Orchestration, Cloud Automation
* and Big Data Analysis on Enterprise Grids & Clouds.
*
* Copyright (c) 2007 - 2017 ActiveEon
* Contact: contact@activeeon.com
*
* This library is free software: you can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License
* as published by the Free Software Foundation: version 3 of
* the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* If needed, contact us to obtain a release under GPL Version 2 or 3
* or a different license than the AGPL.
*/
package org.ow2.proactive.scheduler.core;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
import org.ow2.proactive.scheduler.common.NotificationData;
import org.ow2.proactive.scheduler.common.SchedulerEvent;
import org.ow2.proactive.scheduler.common.exception.TaskAbortedException;
import org.ow2.proactive.scheduler.common.exception.TaskPreemptedException;
import org.ow2.proactive.scheduler.common.exception.TaskRestartedException;
import org.ow2.proactive.scheduler.common.exception.UnknownJobException;
import org.ow2.proactive.scheduler.common.exception.UnknownTaskException;
import org.ow2.proactive.scheduler.common.job.JobId;
import org.ow2.proactive.scheduler.common.job.JobInfo;
import org.ow2.proactive.scheduler.common.job.JobPriority;
import org.ow2.proactive.scheduler.common.job.JobStatus;
import org.ow2.proactive.scheduler.common.task.RestartMode;
import org.ow2.proactive.scheduler.common.task.SimpleTaskLogs;
import org.ow2.proactive.scheduler.common.task.TaskId;
import org.ow2.proactive.scheduler.common.task.TaskInfo;
import org.ow2.proactive.scheduler.common.task.TaskState;
import org.ow2.proactive.scheduler.common.task.TaskStatus;
import org.ow2.proactive.scheduler.core.db.SchedulerDBManager;
import org.ow2.proactive.scheduler.core.helpers.StartAtUpdater;
import org.ow2.proactive.scheduler.core.helpers.TaskResultCreator;
import org.ow2.proactive.scheduler.descriptor.EligibleTaskDescriptor;
import org.ow2.proactive.scheduler.descriptor.JobDescriptor;
import org.ow2.proactive.scheduler.job.ChangedTasksInfo;
import org.ow2.proactive.scheduler.job.ClientJobState;
import org.ow2.proactive.scheduler.job.InternalJob;
import org.ow2.proactive.scheduler.job.JobInfoImpl;
import org.ow2.proactive.scheduler.task.TaskInfoImpl;
import org.ow2.proactive.scheduler.task.TaskLauncher;
import org.ow2.proactive.scheduler.task.TaskResultImpl;
import org.ow2.proactive.scheduler.task.internal.InternalTask;
import org.ow2.proactive.scheduler.util.JobLogger;
import org.ow2.proactive.scheduler.util.TaskLogger;
import org.ow2.proactive.utils.TaskIdWrapper;
class LiveJobs {
private static final Logger logger = Logger.getLogger(SchedulingService.class);
private static final JobLogger jlogger = JobLogger.getInstance();
private static final TaskLogger tlogger = TaskLogger.getInstance();
private static final TaskResultCreator taskResultCreator = TaskResultCreator.getInstance();
private static class JobData {
final InternalJob job;
final ReentrantLock jobLock = new ReentrantLock();
private JobData(InternalJob job) {
this.job = job;
}
void unlock() {
jobLock.unlock();
}
}
private final SchedulerDBManager dbManager;
private final SchedulerStateUpdate listener;
private final Map<JobId, JobData> jobs = new ConcurrentHashMap<>();
private final ConcurrentHashMap<TaskIdWrapper, RunningTaskData> runningTasksData = new ConcurrentHashMap<>();
private final OnErrorPolicyInterpreter onErrorPolicyInterpreter = new OnErrorPolicyInterpreter();
private final StartAtUpdater startAtUpdater = new StartAtUpdater();
LiveJobs(SchedulerDBManager dbManager, SchedulerStateUpdate listener) {
this.dbManager = dbManager;
this.listener = listener;
}
Collection<RunningTaskData> getRunningTasks() {
return runningTasksData.values();
}
boolean canPingTask(RunningTaskData taskData) {
return runningTasksData.get(TaskIdWrapper.wrap(taskData.getTask().getId())) == taskData;
}
void jobRecovered(InternalJob job) {
jobs.put(job.getId(), new JobData(job));
}
void unpauseAll() {
for (JobId jobId : jobs.keySet()) {
JobData jobData = lockJob(jobId);
if (jobData != null) {
try {
InternalJob job = jobData.job;
if (job.getStatus() == JobStatus.PAUSED) {
job.setUnPause();
dbManager.updateJobAndTasksState(job);
updateJobInSchedulerState(job, SchedulerEvent.JOB_RESUMED);
}
} finally {
jobData.unlock();
}
}
}
}
List<RunningTaskData> getRunningTasks(JobId jobId) {
List<RunningTaskData> result = new ArrayList<>();
for (RunningTaskData taskData : runningTasksData.values()) {
if (taskData.getTask().getJobId().equals(jobId)) {
result.add(taskData);
}
}
return result;
}
RunningTaskData getRunningTask(TaskId taskId) {
for (RunningTaskData taskData : runningTasksData.values()) {
if (taskData.getTask().getId().equals(taskId)) {
return taskData;
}
}
return null;
}
boolean hasJobOwnedByUser(String user) {
for (JobData jobData : jobs.values()) {
if (jobData.job.getOwner().equals(user)) {
return true;
}
}
return false;
}
void changeJobPriority(JobId jobId, JobPriority priority) {
JobData jobData = lockJob(jobId);
if (jobData == null) {
return;
}
try {
jobData.job.setPriority(priority);
dbManager.changeJobPriority(jobId, priority);
listener.jobStateUpdated(jobData.job.getOwner(),
new NotificationData<JobInfo>(SchedulerEvent.JOB_CHANGE_PRIORITY,
new JobInfoImpl((JobInfoImpl) jobData.job.getJobInfo())));
listener.jobUpdatedFullData(jobData.job);
} finally {
jobData.unlock();
}
}
public Boolean restartAllInErrorTasks(JobId jobId) {
JobData jobData = lockJob(jobId);
if (jobData == null) {
return false;
}
try {
InternalJob job = jobData.job;
for (TaskState taskState : job.getTasks()) {
try {
restartInErrorTask(jobId, taskState.getName());
} catch (UnknownTaskException e) {
logger.error("", e);
jlogger.error(jobId, "", e);
tlogger.error(taskState.getId(), "", e);
}
}
setJobStatusToInErrorIfNotPaused(job);
dbManager.updateJobAndTasksState(job);
updateJobInSchedulerState(job, SchedulerEvent.JOB_RESTARTED_FROM_ERROR);
return Boolean.TRUE;
} finally {
jobData.unlock();
}
}
boolean resumeJob(JobId jobId) {
JobData jobData = lockJob(jobId);
if (jobData == null) {
return false;
}
try {
InternalJob job = jobData.job;
Set<TaskId> updatedTasks = job.setUnPause();
if (updatedTasks.size() > 0) {
jlogger.info(jobId, "has just been resumed.");
dbManager.updateJobAndTasksState(job);
updateTasksInSchedulerState(job, updatedTasks);
}
//update tasks events list and send it to front-end
updateJobInSchedulerState(job, SchedulerEvent.JOB_RESUMED);
return updatedTasks.size() > 0;
} finally {
jobData.unlock();
}
}
boolean pauseJob(JobId jobId) {
JobData jobData = lockJob(jobId);
if (jobData == null) {
return false;
}
try {
InternalJob job = jobData.job;
Set<TaskId> updatedTasks = job.setPaused();
if (updatedTasks.size() > 0) {
jlogger.info(jobId, "has just been paused.");
dbManager.updateJobAndTasksState(job);
updateTasksInSchedulerState(job, updatedTasks);
}
//update tasks events list and send it to front-end
updateJobInSchedulerState(job, SchedulerEvent.JOB_PAUSED);
return updatedTasks.size() > 0;
} finally {
jobData.unlock();
}
}
boolean updateStartAt(JobId jobId, String startAt) {
JobData jobData = lockJob(jobId);
if (jobData == null) {
return false;
}
try {
return startAtUpdater.updateStartAt(jobData.job, startAt, dbManager);
} finally {
jobData.unlock();
}
}
void jobSubmitted(InternalJob job) {
job.prepareTasks();
job.submitAction();
dbManager.newJobSubmitted(job);
ClientJobState clientJobState = new ClientJobState(job);
jobs.put(job.getId(), new JobData(job));
listener.jobSubmitted(clientJobState);
}
Map<JobId, JobDescriptor> lockJobsToSchedule() {
TreeSet<JobPriority> prioritiesScheduled = new TreeSet<>();
TreeSet<JobPriority> prioritiesNotScheduled = new TreeSet<>();
Map<JobId, JobDescriptor> result = new HashMap<>();
for (Map.Entry<JobId, JobData> entry : jobs.entrySet()) {
JobData value = entry.getValue();
if (value.jobLock.tryLock()) {
InternalJob job = entry.getValue().job;
result.put(job.getId(), job.getJobDescriptor());
prioritiesScheduled.add(job.getPriority());
if (unlockIfConflict(prioritiesScheduled, prioritiesNotScheduled, result))
return new HashMap<>(0);
} else {
prioritiesNotScheduled.add(value.job.getPriority());
if (unlockIfConflict(prioritiesScheduled, prioritiesNotScheduled, result))
return new HashMap<>(0);
}
}
return result;
}
private boolean unlockIfConflict(TreeSet<JobPriority> prioritiesScheduled,
TreeSet<JobPriority> prioritiesNotScheduled, Map<JobId, JobDescriptor> result) {
if (priorityConflict(prioritiesScheduled, prioritiesNotScheduled)) {
unlockJobsToSchedule(result.values());
return true;
}
return false;
}
/**
* This method checks if there is a conflict priority between the jobs selected to be scheduled and those not selected.
* There is a conflict if any job scheduled has a strictly lower priority than any unscheduled job
*
* @param prioritiesScheduled
* @param prioritiesNotScheduled
* @return
*/
public boolean priorityConflict(TreeSet<JobPriority> prioritiesScheduled,
TreeSet<JobPriority> prioritiesNotScheduled) {
for (JobPriority jp : prioritiesNotScheduled) {
if (!prioritiesScheduled.headSet(jp).isEmpty()) {
return true;
}
}
return false;
}
void unlockJobsToSchedule(Collection<JobDescriptor> jobDescriptors) {
for (JobDescriptor desc : jobDescriptors) {
JobData jobData = checkJobAccess(desc.getJobId());
jobData.unlock();
}
}
void restartWaitingTask(TaskId taskId) {
JobData jobData = lockJob(taskId.getJobId());
if (jobData == null) {
return;
}
try {
InternalTask task = jobData.job.getTask(taskId);
if (!task.getStatus().isTaskAlive()) {
tlogger.warn(taskId, "task to be restarted isn't alive " + task.getStatus());
return;
}
jobData.job.reStartTask(task);
} catch (UnknownTaskException e) {
logger.error("Unexpected exception", e);
} finally {
jobData.unlock();
}
}
private void restartTaskOnNodeFailure(InternalTask task, JobData jobData, TerminationData terminationData) {
final String errorMsg = "An error has occurred due to a node failure and the maximum amount of retries property has been reached.";
task.setProgress(0);
task.decreaseNumberOfExecutionOnFailureLeft();
tlogger.info(task.getId(), "number of retry on failure left " + task.getNumberOfExecutionOnFailureLeft());
InternalJob job = jobData.job;
if (task.getNumberOfExecutionOnFailureLeft() > 0) {
task.setStatus(TaskStatus.WAITING_ON_FAILURE);
job.newWaitingTask();
listener.taskStateUpdated(job.getOwner(),
new NotificationData<TaskInfo>(SchedulerEvent.TASK_WAITING_FOR_RESTART,
new TaskInfoImpl((TaskInfoImpl) task.getTaskInfo())));
job.reStartTask(task);
dbManager.taskRestarted(job, task, null);
tlogger.info(task.getId(), " is waiting for restart");
} else {
job.incrementNumberOfFailedTasksBy(1);
endJob(jobData, terminationData, task, null, errorMsg, JobStatus.FAILED);
}
}
TerminationData restartTaskOnNodeFailure(InternalTask task) {
JobData jobData = lockJob(task.getJobId());
if (jobData == null) {
return emptyResult(task.getId());
}
try {
TaskId taskId = task.getId();
if (task.getStatus() != TaskStatus.RUNNING) {
return emptyResult(taskId);
}
RunningTaskData taskData = runningTasksData.remove(TaskIdWrapper.wrap(taskId));
if (taskData == null) {
throw new IllegalStateException("Task " + task.getId() + " is not running.");
}
TerminationData result = TerminationData.newTerminationData();
result.addTaskData(jobData.job, taskData, false, null);
restartTaskOnNodeFailure(task, jobData, result);
return result;
} finally {
jobData.unlock();
}
}
private void restartTaskOnError(JobData jobData, InternalTask task, TaskStatus status, TaskResultImpl result,
long waitTime, TerminationData terminationData) {
InternalJob job = jobData.job;
tlogger.info(task.getId(), "node Exclusion : restart mode is '" + task.getRestartTaskOnError() + "'");
if (task.getRestartTaskOnError().equals(RestartMode.ELSEWHERE)) {
task.setNodeExclusion(task.getExecuterInformation().getNodes());
}
task.setStatus(status);
job.newWaitingTask();
dbManager.updateAfterTaskFinished(job, task, result);
listener.taskStateUpdated(job.getOwner(),
new NotificationData<TaskInfo>(SchedulerEvent.TASK_WAITING_FOR_RESTART,
new TaskInfoImpl((TaskInfoImpl) task.getTaskInfo())));
terminationData.addRestartData(task.getId(), waitTime);
logger.info("END restartTaskOnError");
}
TerminationData simulateJobStart(List<EligibleTaskDescriptor> tasksToSchedule, String errorMsg) {
TerminationData terminationData = TerminationData.newTerminationData();
for (EligibleTaskDescriptor eltd : tasksToSchedule) {
JobId jobId = eltd.getJobId();
if (!terminationData.jobTerminated(jobId)) {
JobData jobData = lockJob(jobId);
if (jobData != null) {
try {
if (jobData.job.getStartTime() < 0) {
jobData.job.start();
updateJobInSchedulerState(jobData.job, SchedulerEvent.JOB_PENDING_TO_RUNNING);
jlogger.info(jobId, "started");
}
endJob(jobData, terminationData, eltd.getInternal(), null, errorMsg, JobStatus.CANCELED);
} finally {
jobData.unlock();
}
}
}
}
return terminationData;
}
void taskStarted(InternalJob job, InternalTask task, TaskLauncher launcher) {
checkJobAccess(job.getId());
if (runningTasksData.containsKey(TaskIdWrapper.wrap(task.getId()))) {
throw new IllegalStateException("Task is already started");
}
tlogger.info(task.getId(), "task started " + task.getId());
runningTasksData.put(TaskIdWrapper.wrap(task.getId()),
new RunningTaskData(task, job.getOwner(), job.getCredentials(), launcher));
boolean firstTaskStarted;
if (job.getStartTime() < 0) {
// if it is the first task of this job
job.start();
updateJobInSchedulerState(job, SchedulerEvent.JOB_PENDING_TO_RUNNING);
jlogger.info(job.getId(), "started");
firstTaskStarted = true;
} else {
firstTaskStarted = false;
}
// set the different informations on task
job.startTask(task);
dbManager.jobTaskStarted(job, task, firstTaskStarted);
listener.taskStateUpdated(job.getOwner(),
new NotificationData<TaskInfo>(SchedulerEvent.TASK_PENDING_TO_RUNNING,
new TaskInfoImpl((TaskInfoImpl) task.getTaskInfo())));
//fill previous task progress with 0, means task has started
task.setProgress(0);
}
private TerminationData emptyResult(TaskId taskId) {
RunningTaskData taskData = runningTasksData.remove(TaskIdWrapper.wrap(taskId));
if (taskData != null) {
throw new IllegalStateException("Task is marked as running: " + taskId);
}
return TerminationData.EMPTY;
}
private TerminationData emptyData(JobId jobId) {
for (TaskIdWrapper taskId : runningTasksData.keySet()) {
if (taskId.getTaskId().getJobId().equals(jobId)) {
throw new IllegalStateException("Unexpected task data: " + taskId);
}
}
return TerminationData.EMPTY;
}
public TerminationData taskTerminatedWithResult(TaskId taskId, TaskResultImpl result) {
JobData jobData = lockJob(taskId.getJobId());
if (jobData == null) {
return emptyResult(taskId);
}
try {
InternalTask task;
try {
task = jobData.job.getTask(taskId);
} catch (UnknownTaskException e) {
logger.error("Unexpected exception", e);
return emptyResult(taskId);
}
if (task.getStatus() != TaskStatus.RUNNING) {
tlogger.info(taskId, "task isn't running anymore");
return emptyResult(taskId);
}
TaskIdWrapper taskIdWrapper = TaskIdWrapper.wrap(taskId);
RunningTaskData taskData = runningTasksData.remove(taskIdWrapper);
if (taskData == null) {
throw new IllegalStateException("No information for: " + taskId);
}
TerminationData terminationData = createAndFillTerminationData(result, taskData, jobData.job, true);
boolean errorOccurred = result.hadException();
if (errorOccurred) {
tlogger.error(taskId, "error", result.getException());
}
tlogger.info(taskId, "finished with" + (errorOccurred ? "" : "out") + " errors");
if (errorOccurred) {
tlogger.info(taskId, "task has terminated with an error ");
task.decreaseNumberOfExecutionLeft();
boolean requiresPauseJobOnError = onErrorPolicyInterpreter.requiresPauseJobOnError(task);
int numberOfExecutionLeft = task.getNumberOfExecutionLeft();
if (numberOfExecutionLeft <= 0 && onErrorPolicyInterpreter.requiresCancelJobOnError(task)) {
tlogger.info(taskId, "no retry left and task is tagged with cancel job on error");
jobData.job.increaseNumberOfFaultyTasks(taskId);
endJob(jobData,
terminationData,
task,
result,
"An error occurred in your task and the maximum number of executions has been reached. " +
"You also ask to cancel the job in such a situation!",
JobStatus.CANCELED);
jlogger.info(taskId.getJobId(), "job has been canceled");
return terminationData;
} else if (numberOfExecutionLeft > 0) {
tlogger.info(taskId, "number of execution left is " + numberOfExecutionLeft);
if (onErrorPolicyInterpreter.requiresPauseTaskOnError(task) || requiresPauseJobOnError) {
long waitTime = jobData.job.getNextWaitingTime(task.getMaxNumberOfExecution() -
numberOfExecutionLeft);
restartTaskOnError(jobData,
task,
TaskStatus.WAITING_ON_ERROR,
result,
waitTime,
terminationData);
tlogger.info(taskId, "new restart is scheduled");
return terminationData;
} else {
jobData.job.increaseNumberOfFaultyTasks(taskId);
long waitTime = jobData.job.getNextWaitingTime(task.getMaxNumberOfExecution() -
numberOfExecutionLeft);
restartTaskOnError(jobData,
task,
TaskStatus.WAITING_ON_ERROR,
result,
waitTime,
terminationData);
tlogger.info(taskId, "new restart is scheduled");
return terminationData;
}
} else if (numberOfExecutionLeft <= 0) {
if (!onErrorPolicyInterpreter.requiresPauseTaskOnError(task) &&
!onErrorPolicyInterpreter.requiresPauseJobOnError(task) &&
!onErrorPolicyInterpreter.requiresCancelJobOnError(task)) {
jobData.job.increaseNumberOfFaultyTasks(taskId);
} else if (onErrorPolicyInterpreter.requiresPauseTaskOnError(task)) {
suspendTaskOnError(jobData, task, result.getTaskDuration());
tlogger.info(taskId,
"Task always contains errors after automatic restart, so it stays in In_Error state");
return terminationData;
} else if (requiresPauseJobOnError) {
suspendTaskOnError(jobData, task, result.getTaskDuration());
pauseJob(task.getJobId());
logger.info("Task always contains errors after automatic restart, so Job is always paused on error");
return terminationData;
}
if (requiresPauseJobOnError) {
pauseJob(task.getJobId());
}
}
}
terminateTask(jobData, task, errorOccurred, result, terminationData);
return terminationData;
} finally {
jobData.unlock();
}
}
private TerminationData createAndFillTerminationData(TaskResultImpl result, RunningTaskData taskData,
InternalJob job, boolean normalTermination) {
TerminationData terminationData = TerminationData.newTerminationData();
terminationData.addTaskData(job, taskData, normalTermination, result);
return terminationData;
}
private void suspendTaskOnError(JobData jobData, InternalTask task, long taskDuration) {
InternalJob job = jobData.job;
job.setInErrorTime(System.currentTimeMillis());
job.setTaskPausedOnError(task);
setJobStatusToInErrorIfNotPaused(job);
job.incrementNumberOfInErrorTasksBy(1);
task.setInErrorTime(task.getStartTime() + taskDuration);
dbManager.updateJobAndTasksState(job);
updateTaskPausedOnerrorState(job, task.getId());
updateJobInSchedulerState(job, SchedulerEvent.JOB_IN_ERROR);
}
private void setJobStatusToInErrorIfNotPaused(InternalJob job) {
if (!job.getStatus().equals(JobStatus.PAUSED)) {
job.setStatus(JobStatus.IN_ERROR);
}
}
TerminationData finishInErrorTask(JobId jobId, String taskName) throws UnknownTaskException, UnknownJobException {
JobData jobData = lockJob(jobId);
if (jobData == null) {
throw new UnknownJobException(jobId);
}
InternalJob job = jobData.job;
try {
InternalTask task = job.getTask(taskName);
if (task == null) {
throw new UnknownTaskException(taskName);
}
TaskId taskId = task.getId();
if (task.getStatus() != TaskStatus.IN_ERROR) {
tlogger.info(task.getId(), "Task must be in state IN_ERROR: " + task.getStatus());
return emptyResult(task.getId());
}
TaskResultImpl taskResult = taskResultCreator.getTaskResult(dbManager, job, task);
RunningTaskData data = new RunningTaskData(task,
job.getOwner(),
job.getCredentials(),
task.getExecuterInformation().getLauncher());
TerminationData terminationData = TerminationData.newTerminationData();
terminationData.addTaskData(job, data, false, taskResult);
tlogger.debug(taskId, "result added to job " + job.getId());
//to be done before terminating the task, once terminated it is not running anymore..
ChangedTasksInfo changesInfo = job.finishInErrorTask(taskId, taskResult, listener);
boolean jobFinished = job.isFinished();
//update job info if it is terminated
if (jobFinished) {
//terminating job
job.terminate();
jlogger.debug(job.getId(), "terminated");
jobs.remove(job.getId());
terminationData.addJobToTerminate(job.getId());
}
//Update database
if (taskResult.getAction() != null) {
dbManager.updateAfterWorkflowTaskFinished(job, changesInfo, taskResult);
} else {
dbManager.updateAfterTaskFinished(job, task, taskResult);
}
//send event
listener.taskStateUpdated(job.getOwner(),
new NotificationData<TaskInfo>(SchedulerEvent.TASK_IN_ERROR_TO_FINISHED,
new TaskInfoImpl((TaskInfoImpl) task.getTaskInfo())));
//if this job is finished (every task have finished)
jlogger.info(job.getId(),
"finished tasks " + job.getNumberOfFinishedTasks() + ", total tasks " +
job.getTotalNumberOfTasks() + ", finished " + jobFinished);
if (jobFinished) {
//send event to client
listener.jobStateUpdated(job.getOwner(),
new NotificationData<JobInfo>(SchedulerEvent.JOB_RUNNING_TO_FINISHED,
new JobInfoImpl((JobInfoImpl) job.getJobInfo())));
listener.jobUpdatedFullData(job);
}
return terminationData;
} finally {
jobData.unlock();
}
}
void restartInErrorTask(JobId jobId, String taskName) throws UnknownTaskException {
JobData jobData = lockJob(jobId);
try {
InternalTask task = jobData.job.getTask(taskName);
tlogger.info(task.getId(), "restarting in-error task " + task.getId());
jobData.job.restartInErrorTask(task);
dbManager.updateJobAndTasksState(jobData.job);
updateJobInSchedulerState(jobData.job, SchedulerEvent.JOB_RESTARTED_FROM_ERROR);
} finally {
jobData.unlock();
}
}
TerminationData restartTask(JobId jobId, String taskName, int restartDelay)
throws UnknownJobException, UnknownTaskException {
JobData jobData = lockJob(jobId);
if (jobData == null) {
throw new UnknownJobException(jobId);
}
try {
InternalTask task = jobData.job.getTask(taskName);
tlogger.info(task.getId(), "restarting task " + task.getId());
if (!task.getStatus().isTaskAlive()) {
tlogger.warn(task.getId(), "task isn't alive: " + task.getStatus());
return emptyResult(task.getId());
}
TaskIdWrapper taskIdWrapper = TaskIdWrapper.wrap(task.getId());
RunningTaskData taskData = runningTasksData.remove(taskIdWrapper);
if (taskData == null) {
throw new IllegalStateException("Task " + task.getId() + " is not running.");
}
TaskResultImpl taskResult = taskResultCreator.getTaskResult(dbManager,
jobData.job,
task,
new TaskRestartedException("Aborted by user"),
new SimpleTaskLogs("", "Aborted by user"));
TerminationData terminationData = createAndFillTerminationData(taskResult, taskData, jobData.job, false);
task.decreaseNumberOfExecutionLeft();
if (task.getNumberOfExecutionLeft() <= 0 && onErrorPolicyInterpreter.requiresCancelJobOnError(task)) {
endJob(jobData,
terminationData,
task,
taskResult,
"An error occurred in your task and the maximum number of executions has been reached. " +
"You also ask to cancel the job in such a situation !",
JobStatus.CANCELED);
return terminationData;
} else if (task.getNumberOfExecutionLeft() > 0) {
long waitTime = restartDelay * 1000l;
restartTaskOnError(jobData, task, TaskStatus.WAITING_ON_ERROR, taskResult, waitTime, terminationData);
return terminationData;
}
terminateTask(jobData, task, true, taskResult, terminationData);
return terminationData;
} finally {
jobData.unlock();
}
}
TerminationData preemptTask(JobId jobId, String taskName, int restartDelay)
throws UnknownJobException, UnknownTaskException {
JobData jobData = lockJob(jobId);
if (jobData == null) {
throw new UnknownJobException(jobId);
}
try {
InternalTask task = jobData.job.getTask(taskName);
tlogger.info(task.getId(), "preempting task " + task.getId());
if (!task.getStatus().isTaskAlive()) {
tlogger.info(task.getId(), "task isn't alive: " + task.getStatus());
return emptyResult(task.getId());
}
RunningTaskData taskData = runningTasksData.remove(TaskIdWrapper.wrap(task.getId()));
if (taskData == null) {
throw new IllegalStateException("Task " + task.getId() + " is not running.");
}
TaskResultImpl taskResult = taskResultCreator.getTaskResult(dbManager,
jobData.job,
task,
new TaskPreemptedException("Preempted by admin"),
new SimpleTaskLogs("", "Preempted by admin"));
TerminationData terminationData = createAndFillTerminationData(taskResult, taskData, jobData.job, false);
long waitTime = restartDelay * 1000L;
restartTaskOnError(jobData, task, TaskStatus.PENDING, taskResult, waitTime, terminationData);
return terminationData;
} finally {
jobData.unlock();
}
}
TerminationData killTask(JobId jobId, String taskName) throws UnknownJobException, UnknownTaskException {
JobData jobData = lockJob(jobId);
if (jobData == null) {
throw new UnknownJobException(jobId);
}
try {
InternalTask task = jobData.job.getTask(taskName);
tlogger.info(task.getId(), "killing task " + task.getId());
if (!task.getStatus().isTaskAlive()) {
tlogger.warn(task.getId(), "task isn't alive: " + task.getStatus());
return emptyResult(task.getId());
}
RunningTaskData taskData = runningTasksData.remove(TaskIdWrapper.wrap(task.getId()));
if (taskData == null) {
// the task is not in running state
taskData = new RunningTaskData(task, jobData.job.getOwner(), jobData.job.getCredentials(), null);
}
TaskResultImpl taskResult = taskResultCreator.getTaskResult(dbManager,
jobData.job,
task,
new TaskAbortedException("The task has been manually killed."),
new SimpleTaskLogs("",
"The task has been manually killed."));
TerminationData terminationData = createAndFillTerminationData(taskResult, taskData, jobData.job, false);
if (onErrorPolicyInterpreter.requiresCancelJobOnError(task)) {
endJob(jobData,
terminationData,
task,
taskResult,
"The task has been manually killed. " + "You also ask to cancel the job in such a situation!",
JobStatus.CANCELED);
} else {
terminateTask(jobData, task, true, taskResult, terminationData);
}
return terminationData;
} finally {
jobData.unlock();
}
}
private void terminateTask(JobData jobData, InternalTask task, boolean errorOccurred, TaskResultImpl result,
TerminationData terminationData) {
InternalJob job = jobData.job;
TaskId taskId = task.getId();
tlogger.debug(taskId, "result added to job " + job.getId());
//to be done before terminating the task, once terminated it is not running anymore..
job.getRunningTaskDescriptor(taskId);
ChangedTasksInfo changesInfo = job.terminateTask(errorOccurred, taskId, listener, result.getAction(), result);
boolean jobFinished = job.isFinished();
//update job info if it is terminated
if (jobFinished) {
//terminating job
job.terminate();
jlogger.debug(job.getId(), "terminated");
terminationData.addJobToTerminate(job.getId());
}
//Update database
if (result.getAction() != null) {
dbManager.updateAfterWorkflowTaskFinished(job, changesInfo, result);
} else {
dbManager.updateAfterTaskFinished(job, task, result);
}
//send event
listener.taskStateUpdated(job.getOwner(),
new NotificationData<TaskInfo>(SchedulerEvent.TASK_RUNNING_TO_FINISHED,
new TaskInfoImpl((TaskInfoImpl) task.getTaskInfo())));
//if this job is finished (every task have finished)
jlogger.info(job.getId(),
"finished tasks " + job.getNumberOfFinishedTasks() + ", total tasks " +
job.getTotalNumberOfTasks() + ", finished " + jobFinished);
if (jobFinished) {
//send event to client
listener.jobStateUpdated(job.getOwner(),
new NotificationData<JobInfo>(SchedulerEvent.JOB_RUNNING_TO_FINISHED,
new JobInfoImpl((JobInfoImpl) job.getJobInfo())));
listener.jobUpdatedFullData(job);
}
}
private TerminationData terminateJob(JobId jobId, JobStatus jobStatus) {
JobData jobData = lockJob(jobId);
if (jobData == null) {
return emptyData(jobId);
}
try {
TerminationData terminationData = TerminationData.newTerminationData();
endJob(jobData, terminationData, null, null, "", jobStatus);
return terminationData;
} finally {
jobData.unlock();
}
}
public TerminationData killJob(JobId jobId) {
jlogger.info(jobId, "killing job");
return terminateJob(jobId, JobStatus.KILLED);
}
public TerminationData removeJob(JobId jobId) {
return terminateJob(jobId, JobStatus.FINISHED);
}
private void endJob(JobData jobData, TerminationData terminationData, InternalTask task, TaskResultImpl taskResult,
String errorMsg, JobStatus jobStatus) {
JobId jobId = jobData.job.getId();
jobs.remove(jobId);
terminationData.addJobToTerminate(jobId);
InternalJob job = jobData.job;
SchedulerEvent event;
if (job.getStatus() == JobStatus.PENDING) {
event = SchedulerEvent.JOB_PENDING_TO_FINISHED;
} else {
event = SchedulerEvent.JOB_RUNNING_TO_FINISHED;
}
if (task != null) {
jlogger.info(job.getId(), "ending request caused by task " + task.getId());
} else {
jlogger.info(job.getId(), "ending request");
}
for (Iterator<RunningTaskData> i = runningTasksData.values().iterator(); i.hasNext();) {
RunningTaskData taskData = i.next();
if (taskData.getTask().getJobId().equals(jobId)) {
i.remove();
//remove previous read progress
taskData.getTask().setProgress(0);
terminationData.addTaskData(job, taskData, false, taskResult);
}
}
//if job has been killed
if (jobStatus == JobStatus.KILLED) {
Set<TaskId> tasksToUpdate = job.failed(null, jobStatus);
dbManager.updateAfterJobKilled(job, tasksToUpdate);
updateTasksInSchedulerState(job, tasksToUpdate);
} else {
// don't tamper the original job status if it's already in a finished state (failed/canceled)
if (jobStatus != JobStatus.FINISHED) {
Set<TaskId> tasksToUpdate = job.failed(task.getId(), jobStatus);
//store the exception into jobResult / To prevent from empty task result (when job canceled), create one
boolean noResult = (jobStatus == JobStatus.CANCELED && taskResult == null);
if (jobStatus == JobStatus.FAILED || noResult) {
taskResult = new TaskResultImpl(task.getId(),
new Exception(errorMsg),
new SimpleTaskLogs("", errorMsg),
-1);
}
dbManager.updateAfterJobFailed(job, task, taskResult, tasksToUpdate);
updateTasksInSchedulerState(job, tasksToUpdate);
}
}
//update job and tasks events list and send it to front-end
updateJobInSchedulerState(job, event);
jlogger.info(job.getId(), "finished (" + jobStatus + ")");
}
private void updateTasksInSchedulerState(InternalJob job, Set<TaskId> tasksToUpdate) {
for (TaskId tid : tasksToUpdate) {
try {
InternalTask t = job.getTask(tid);
TaskInfo ti = new TaskInfoImpl((TaskInfoImpl) t.getTaskInfo());
listener.taskStateUpdated(job.getOwner(),
new NotificationData<>(SchedulerEvent.TASK_RUNNING_TO_FINISHED, ti));
} catch (UnknownTaskException e) {
logger.error(e);
}
}
}
private JobData lockJob(JobId jobId) {
JobData jobData = jobs.get(jobId);
if (jobData == null) {
jlogger.info(jobId, "does not exist");
return null;
}
jobData.jobLock.lock();
if (jobs.containsKey(jobId)) {
return jobData;
} else {
jobData.unlock();
return null;
}
}
private JobData checkJobAccess(JobId jobId) {
JobData jobData = jobs.get(jobId);
if (jobData == null) {
throw new IllegalArgumentException("Unknown job: " + jobId);
}
if (!jobData.jobLock.isHeldByCurrentThread()) {
throw new IllegalThreadStateException("Thread doesn't hold lock for job " + jobId);
} else {
return jobData;
}
}
private void updateJobInSchedulerState(InternalJob currentJob, SchedulerEvent eventType) {
try {
listener.jobStateUpdated(currentJob.getOwner(),
new NotificationData<JobInfo>(eventType,
new JobInfoImpl((JobInfoImpl) currentJob.getJobInfo())));
listener.jobUpdatedFullData(currentJob);
} catch (Throwable t) {
//Just to prevent update method error
}
}
private void updateTaskPausedOnerrorState(InternalJob job, TaskId taskToUpdate) {
try {
InternalTask t = job.getTask(taskToUpdate);
TaskInfo ti = new TaskInfoImpl((TaskInfoImpl) t.getTaskInfo());
listener.taskStateUpdated(job.getOwner(), new NotificationData<>(SchedulerEvent.TASK_IN_ERROR, ti));
} catch (UnknownTaskException e) {
logger.error(e);
}
}
}