/*
* ProActive Parallel Suite(TM):
* The Open Source library for parallel and distributed
* Workflows & Scheduling, Orchestration, Cloud Automation
* and Big Data Analysis on Enterprise Grids & Clouds.
*
* Copyright (c) 2007 - 2017 ActiveEon
* Contact: contact@activeeon.com
*
* This library is free software: you can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License
* as published by the Free Software Foundation: version 3 of
* the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* If needed, contact us to obtain a release under GPL Version 2 or 3
* or a different license than the AGPL.
*/
package org.ow2.proactive.scheduler.core.db;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import org.apache.log4j.Logger;
import org.ow2.proactive.scheduler.common.job.JobStatus;
import org.ow2.proactive.scheduler.common.task.TaskStatus;
import org.ow2.proactive.scheduler.job.InternalJob;
import org.ow2.proactive.scheduler.task.internal.InternalTask;
import org.ow2.proactive.scheduler.util.JobLogger;
/**
 * Rebuilds the lists of pending, running and finished jobs from the jobs stored
 * in the database.
 * <p>
 * Not-finished jobs are loaded through the {@link SchedulerDBManager}, sorted into
 * pending and running jobs, and every task found in {@code RUNNING} status is put
 * back to {@code PENDING}. A job whose recovery fails is marked as canceled and
 * reported as finished instead of running.
 */
public class SchedulerStateRecoverHelper {

    private static final Logger logger = Logger.getLogger(SchedulerStateRecoverHelper.class);

    private static final JobLogger jobLogger = JobLogger.getInstance();

    private final SchedulerDBManager dbManager;

    /**
     * Creates a helper working on the given database manager.
     *
     * @param dbManager the manager used to load the stored jobs and to persist the
     *                  state of the jobs that are canceled because their recovery failed.
     */
    public SchedulerStateRecoverHelper(SchedulerDBManager dbManager) {
        this.dbManager = dbManager;
    }

    /**
     * Loads the jobs from the database and rebuilds the pending, running and finished job lists.
     *
     * @param loadJobPeriod passed as is to {@code SchedulerDBManager.loadFinishedJobs};
     *                      presumably the period of time for which finished jobs are loaded
     *                      (to be confirmed against the database manager).
     * @return the recovered state, made of the pending, running and finished jobs.
     * @throws IllegalStateException if a not-finished job has a status other than
     *                               PENDING, STALLED, RUNNING, IN_ERROR or PAUSED.
     */
    public RecoveredSchedulerState recover(long loadJobPeriod) {
        List<InternalJob> notFinishedJobs = dbManager.loadNotFinishedJobs(true);

        Vector<InternalJob> pendingJobs = new Vector<>();
        Vector<InternalJob> runningJobs = new Vector<>();

        for (InternalJob job : notFinishedJobs) {
            // the returned descriptor is not used here; presumably the call forces the
            // job descriptor to be built before going further (to be confirmed).
            job.getJobDescriptor();

            if (wasAlreadyStarted(job)) {
                runningJobs.add(job);
                runningTasksToPending(job.getITasks());
            } else {
                pendingJobs.add(job);
            }
        }

        Vector<InternalJob> finishedJobs = new Vector<>();

        // an explicit iterator is required because failing jobs are removed while iterating
        for (Iterator<InternalJob> iterator = runningJobs.iterator(); iterator.hasNext();) {
            InternalJob job = iterator.next();
            try {
                List<InternalTask> tasksList = copyAndSort(job.getITasks());

                //simulate the running execution to recreate the tree.
                for (InternalTask task : tasksList) {
                    job.recoverTask(task.getId());
                }

                if ((job.getStatus() == JobStatus.RUNNING) || (job.getStatus() == JobStatus.PAUSED)) {
                    //set the status to stalled because the scheduler start in stopped mode.
                    if (job.getStatus() == JobStatus.RUNNING) {
                        job.setStatus(JobStatus.STALLED);
                    }

                    restorePausedState(job);

                    //update the count of pending and running task.
                    job.setNumberOfPendingTasks(job.getNumberOfPendingTasks() + job.getNumberOfRunningTasks());
                    job.setNumberOfRunningTasks(0);
                }
            } catch (Exception e) {
                logger.error("Failed to recover job " + job.getId() + " " + job.getName() +
                             " job might be in an inconsistent state", e);
                jobLogger.error(job.getId(), "Failed to recover job, job might be in an inconsistent state", e);
                // partially cancel job (not tasks) and move it to finished jobs to avoid running it
                iterator.remove();
                job.setStatus(JobStatus.CANCELED);
                finishedJobs.add(job);
                dbManager.updateJobAndTasksState(job);
            }
        }

        for (InternalJob job : pendingJobs) {
            restorePausedState(job);
        }

        finishedJobs.addAll(dbManager.loadFinishedJobs(false, loadJobPeriod));

        return new RecoveredSchedulerState(pendingJobs, runningJobs, finishedJobs);
    }

    /**
     * Tells whether a not-finished job must be recovered as a running job or as a pending one.
     * <p>
     * STALLED, RUNNING and IN_ERROR jobs are recovered as running jobs, PENDING jobs as
     * pending ones. A PAUSED job is recovered as pending only when it has no pending,
     * no running and no finished task according to its counters.
     *
     * @param job the job loaded from the database.
     * @return true if the job goes to the running jobs, false if it goes to the pending jobs.
     * @throws IllegalStateException if the status of the job is not one of those listed above.
     */
    private boolean wasAlreadyStarted(InternalJob job) {
        switch (job.getStatus()) {
            case PENDING:
                return false;
            case STALLED:
            case RUNNING:
            case IN_ERROR:
                return true;
            case PAUSED:
                return (job.getNumberOfPendingTasks() + job.getNumberOfRunningTasks() +
                        job.getNumberOfFinishedTasks()) != 0;
            default:
                throw new IllegalStateException("Unexpected job status: " + job.getStatus());
        }
    }

    /**
     * If the given job is PAUSED, sets its status to STALLED and then pauses it again
     * through {@code setPaused()}, which sets the task to pause inside the job.
     * Jobs in any other status are left untouched.
     *
     * @param job the job to update.
     */
    private void restorePausedState(InternalJob job) {
        if (job.getStatus() == JobStatus.PAUSED) {
            job.setStatus(JobStatus.STALLED);
            job.setPaused();
        }
    }

    /**
     * Puts every task that is in RUNNING status back to PENDING status.
     *
     * @param tasks the tasks to update in place.
     */
    private void runningTasksToPending(List<InternalTask> tasks) {
        for (InternalTask task : tasks) {
            if (task.getStatus() == TaskStatus.RUNNING) {
                task.setStatus(TaskStatus.PENDING);
            }
        }
    }

    /**
     * Collects the tasks of the given list that are in a terminated status
     * (ABORTED, FAILED, FINISHED, FAULTY or SKIPPED) and returns them sorted
     * topologically, parents before children.
     * As no task could be running after recover, this method also moves the tasks
     * of the given list from RUNNING status to PENDING one.
     *
     * @param tasks the list of internal tasks to filter; it is not modified, but the
     *              status of its RUNNING tasks is.
     * @return the sorted list of the terminated tasks.
     */
    protected List<InternalTask> copyAndSort(List<InternalTask> tasks) {
        ArrayList<InternalTask> tasksList = new ArrayList<>();

        //copy the list with only the finished task.
        for (InternalTask task : tasks) {
            switch (task.getStatus()) {
                case ABORTED:
                case FAILED:
                case FINISHED:
                case FAULTY:
                case SKIPPED:
                    tasksList.add(task);
                    break;
                default:
                    // tasks in any other status are not part of the result
                    break;
            }
            //if task was running, put it in pending status
            if (task.getStatus() == TaskStatus.RUNNING) {
                task.setStatus(TaskStatus.PENDING);
            }
        }

        //sort parents before children
        return TopologicalTaskSorter.sortInternalTasks(tasksList);
    }
}