/*
* ProActive Parallel Suite(TM):
* The Open Source library for parallel and distributed
* Workflows & Scheduling, Orchestration, Cloud Automation
* and Big Data Analysis on Enterprise Grids & Clouds.
*
* Copyright (c) 2007 - 2017 ActiveEon
* Contact: contact@activeeon.com
*
* This library is free software: you can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License
* as published by the Free Software Foundation: version 3 of
* the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* If needed, contact us to obtain a release under GPL Version 2 or 3
* or a different license than the AGPL.
*/
package org.ow2.proactive.scheduler.core;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import org.apache.log4j.Logger;
import org.objectweb.proactive.core.node.Node;
import org.ow2.proactive.scheduler.common.NotificationData;
import org.ow2.proactive.scheduler.common.SchedulerEvent;
import org.ow2.proactive.scheduler.common.SchedulerStatus;
import org.ow2.proactive.scheduler.common.exception.InternalException;
import org.ow2.proactive.scheduler.common.exception.UnknownJobException;
import org.ow2.proactive.scheduler.common.exception.UnknownTaskException;
import org.ow2.proactive.scheduler.common.job.JobId;
import org.ow2.proactive.scheduler.common.job.JobInfo;
import org.ow2.proactive.scheduler.common.job.JobPriority;
import org.ow2.proactive.scheduler.common.task.TaskId;
import org.ow2.proactive.scheduler.common.task.TaskInfo;
import org.ow2.proactive.scheduler.common.task.TaskResult;
import org.ow2.proactive.scheduler.common.util.logforwarder.AppenderProvider;
import org.ow2.proactive.scheduler.core.db.RecoveredSchedulerState;
import org.ow2.proactive.scheduler.core.properties.PASchedulerProperties;
import org.ow2.proactive.scheduler.descriptor.EligibleTaskDescriptor;
import org.ow2.proactive.scheduler.descriptor.JobDescriptor;
import org.ow2.proactive.scheduler.job.InternalJob;
import org.ow2.proactive.scheduler.job.JobInfoImpl;
import org.ow2.proactive.scheduler.policy.Policy;
import org.ow2.proactive.scheduler.task.TaskInfoImpl;
import org.ow2.proactive.scheduler.task.TaskLauncher;
import org.ow2.proactive.scheduler.task.TaskResultImpl;
import org.ow2.proactive.scheduler.task.internal.InternalTask;
import org.ow2.proactive.scheduler.util.JobLogger;
import org.ow2.proactive.scheduler.util.ServerJobAndTaskLogs;
import org.ow2.proactive.scheduler.util.TaskLogger;
import org.ow2.proactive.utils.NodeSet;
import it.sauronsoftware.cron4j.Scheduler;
public class SchedulingService {
/** Logger for scheduler-level messages. */
static final Logger logger = Logger.getLogger(SchedulingService.class);

/** Logger for task-scoped messages. */
static final TaskLogger tlogger = TaskLogger.getInstance();

/** Logger for job-scoped messages. */
static final JobLogger jlogger = JobLogger.getInstance();

/**
 * Automatic job removal delay, i.e. the configured value multiplied by 1000.
 * The multiplication is done in {@code long} arithmetic ({@code 1000L}) so that large
 * configured values do not overflow {@code int} before being widened.
 */
static final long SCHEDULER_AUTO_REMOVED_JOB_DELAY = PASchedulerProperties.SCHEDULER_AUTOMATIC_REMOVED_JOB_DELAY.getValueAsInt() *
                                                     1000L;

/**
 * Job removal delay, i.e. the configured value multiplied by 1000.
 * Computed in {@code long} arithmetic for the same overflow reason as above.
 */
static final long SCHEDULER_REMOVED_JOB_DELAY = PASchedulerProperties.SCHEDULER_REMOVED_JOB_DELAY.getValueAsInt() *
                                                1000L;

private final SchedulingInfrastructure infrastructure;

private final LiveJobs jobs;

private final SchedulerStateUpdate listener;

private final ListenJobLogsSupport listenJobLogsSupport;

/** Current scheduler status; volatile because it is read and written without locking. */
volatile SchedulerStatus status = SchedulerStatus.STOPPED;

private volatile Policy policy;

private final SchedulingThread schedulingThread;

private Thread pinger;

/** Cron scheduler driving {@link HousekeepingRunner}; only set by {@link #startHouseKeeping()}. */
private Scheduler houseKeepingScheduler;

/**
 * Url used to store the last url of the RM (used to try to reconnect to the rm when it is down)
 */
private URI lastRmUrl;
/**
 * Creates the scheduling service, restores any recovered state, loads the policy and
 * starts the scheduling thread, the node ping thread and (optionally) housekeeping.
 *
 * @param infrastructure   provider of the DB manager, RM proxies and thread pools used by this service
 * @param listener         receiver of scheduler/job/task state notifications
 * @param recoveredState   state to restore, or {@code null} when there is nothing to recover
 * @param policyClassName  fully qualified class name of the {@link Policy} to instantiate reflectively
 * @param schedulingMethod scheduling method to run, or {@code null} to use {@link SchedulingMethodImpl}
 * @throws Exception if the policy class cannot be loaded or instantiated, or if the
 *                   policy configuration cannot be loaded
 */
public SchedulingService(SchedulingInfrastructure infrastructure, SchedulerStateUpdate listener,
RecoveredSchedulerState recoveredState, String policyClassName, SchedulingMethod schedulingMethod)
throws Exception {
this.infrastructure = infrastructure;
this.listener = listener;
this.jobs = new LiveJobs(infrastructure.getDBManager(), listener);
this.listenJobLogsSupport = ListenJobLogsSupport.newInstance(infrastructure.getDBManager(), jobs);
// Recovery runs before the policy is loaded and before any thread is started.
if (recoveredState != null) {
recover(recoveredState);
}
this.policy = (Policy) Class.forName(policyClassName).newInstance();
if (!this.policy.reloadConfig()) {
throw new RuntimeException("Scheduling policy cannot be started, see log file for details.");
}
logger.debug("Instantiated policy : " + policyClassName);
// Remember the RM url so that a reconnection can be attempted later.
lastRmUrl = infrastructure.getRMProxiesManager().getRmUrl();
if (schedulingMethod == null) {
schedulingMethod = new SchedulingMethodImpl(this);
}
// Switch the status to STARTED before the background threads are launched.
start();
schedulingThread = new SchedulingThread(schedulingMethod, this);
schedulingThread.start();
pinger = new NodePingThread(this);
pinger.start();
// Housekeeping is only enabled when an automatic removal delay is configured.
if (PASchedulerProperties.SCHEDULER_AUTOMATIC_REMOVED_JOB_DELAY.getValueAsInt() > 0) {
startHouseKeeping();
}
}
/**
 * Creates and starts the cron scheduler that periodically runs a {@link HousekeepingRunner}.
 * The cron expression is read from the configuration when set, otherwise
 * {@code "* * * * *"} is used.
 */
public void startHouseKeeping() {
    houseKeepingScheduler = new Scheduler();
    final String cronExpr;
    if (PASchedulerProperties.SCHEDULER_AUTOMATIC_REMOVED_JOB_CRON_EXPR.isSet()) {
        cronExpr = PASchedulerProperties.SCHEDULER_AUTOMATIC_REMOVED_JOB_CRON_EXPR.getValueAsString();
    } else {
        cronExpr = "* * * * *";
    }
    final HousekeepingRunner runner = new HousekeepingRunner();
    houseKeepingScheduler.schedule(cronExpr, runner);
    houseKeepingScheduler.start();
}
/**
 * @return the scheduling policy currently in use
 */
public Policy getPolicy() {
return policy;
}
/**
 * @return the live jobs container managed by this service
 */
public LiveJobs getJobs() {
return jobs;
}
/**
 * @return the listener notified of scheduler state updates
 */
public SchedulerStateUpdate getListener() {
return listener;
}
/**
 * @return whether the current scheduler status reports itself as submittable
 */
public boolean isSubmitPossible() {
return status.isSubmittable();
}
/**
 * Switches the scheduler to {@link SchedulerStatus#STARTED} when the current status is startable.
 *
 * @return {@code true} if the status was changed, {@code false} if starting is not allowed now
 */
public boolean start() {
    if (status.isStartable()) {
        status = SchedulerStatus.STARTED;
        logger.info("Scheduler has just been started !");
        listener.schedulerStateUpdated(SchedulerEvent.STARTED);
        return true;
    }
    return false;
}
/**
 * Switches the scheduler to {@link SchedulerStatus#STOPPED} when the current status is stoppable.
 *
 * @return {@code true} if the status was changed, {@code false} if stopping is not allowed now
 */
public boolean stop() {
    if (status.isStoppable()) {
        status = SchedulerStatus.STOPPED;
        logger.info("Scheduler has just been stopped, no tasks will be launched until start.");
        listener.schedulerStateUpdated(SchedulerEvent.STOPPED);
        return true;
    }
    return false;
}
/**
 * Switches the scheduler to {@link SchedulerStatus#PAUSED} when the current status is pausable.
 *
 * @return {@code true} if the status was changed, {@code false} if pausing is not allowed now
 */
public boolean pause() {
    if (status.isPausable()) {
        status = SchedulerStatus.PAUSED;
        logger.info("Scheduler has just been paused !");
        listener.schedulerStateUpdated(SchedulerEvent.PAUSED);
        return true;
    }
    return false;
}
/**
 * Switches the scheduler to {@link SchedulerStatus#FROZEN} when the current status is freezable.
 *
 * @return {@code true} if the status was changed, {@code false} if freezing is not allowed now
 */
public boolean freeze() {
    if (status.isFreezable()) {
        status = SchedulerStatus.FROZEN;
        logger.info("Scheduler has just been frozen !");
        listener.schedulerStateUpdated(SchedulerEvent.FROZEN);
        return true;
    }
    return false;
}
/**
 * Switches the scheduler back to {@link SchedulerStatus#STARTED} when the current status is
 * resumable, then wakes up the scheduling thread.
 *
 * @return {@code true} if the status was changed, {@code false} if resuming is not allowed now
 */
public boolean resume() {
    if (status.isResumable()) {
        status = SchedulerStatus.STARTED;
        logger.info("Scheduler has just been resumed !");
        listener.schedulerStateUpdated(SchedulerEvent.RESUMED);
        wakeUpSchedulingThread();
        return true;
    }
    return false;
}
/**
 * Initiates a shutdown: the status becomes {@link SchedulerStatus#SHUTTING_DOWN}, all jobs are
 * unpaused, and a recurring check is scheduled which emits {@link SchedulerEvent#SHUTDOWN}
 * once no running task remains.
 *
 * @return {@code false} if the scheduler is already down, {@code true} otherwise
 */
public boolean shutdown() {
    if (status.isDown()) {
        return false;
    }

    status = SchedulerStatus.SHUTTING_DOWN;
    logger.info("Scheduler is shutting down, this may take time to finish every jobs!");
    listener.schedulerStateUpdated(SchedulerEvent.SHUTTING_DOWN);

    logger.info("Unpause all running and pending jobs!");
    jobs.unpauseAll();

    // Re-schedules itself with the same delay until the running task list is empty.
    final Runnable shutdownWhenIdle = new Runnable() {
        public void run() {
            if (!jobs.getRunningTasks().isEmpty()) {
                infrastructure.schedule(this, 5000);
                return;
            }
            listener.schedulerStateUpdated(SchedulerEvent.SHUTDOWN);
        }
    };
    infrastructure.schedule(shutdownWhenIdle, 5000);
    return true;
}
/**
 * Kills the scheduler: marks it {@link SchedulerStatus#KILLED}, interrupts the ping and
 * scheduling threads, kills every running task launcher and releases its nodes, stops
 * housekeeping, then shuts down log listening and the infrastructure.
 * <p>
 * Launcher termination, node release and housekeeping stop are best effort: a failure is
 * logged and the kill sequence continues.
 *
 * @return {@code false} if the scheduler was already killed, {@code true} otherwise
 */
public boolean kill() {
    if (status.isKilled()) {
        return false;
    }
    status = SchedulerStatus.KILLED;
    pinger.interrupt();
    schedulingThread.interrupt();
    logger.info("Killing all running task processes...");
    for (RunningTaskData taskData : jobs.getRunningTasks()) {
        NodeSet nodes = taskData.getTask().getExecuterInformation().getNodes();
        try {
            taskData.getLauncher().kill();
        } catch (Throwable t) {
            logger.error("Failed to terminate launcher", t);
        }
        try {
            infrastructure.getRMProxiesManager()
                          .getUserRMProxy(taskData.getUser(), taskData.getCredentials())
                          .releaseNodes(nodes, taskData.getTask().getCleaningScript());
        } catch (Throwable t) {
            logger.error("Failed to release nodes", t);
        }
    }
    // The housekeeping cron scheduler is optional (see the constructor); stop it before the
    // infrastructure goes away so it does not keep firing against shut down services.
    if (houseKeepingScheduler != null) {
        try {
            if (houseKeepingScheduler.isStarted()) {
                houseKeepingScheduler.stop();
            }
        } catch (Throwable t) {
            logger.error("Failed to stop housekeeping scheduler", t);
        }
    }
    listenJobLogsSupport.shutdown();
    infrastructure.shutdown();
    listener.schedulerStateUpdated(SchedulerEvent.KILLED);
    return true;
}
/**
 * @return the helper used to listen to job logs
 */
public ListenJobLogsSupport getListenJobLogsSupport() {
return listenJobLogsSupport;
}
/**
 * Asks the current policy to reload its configuration, unless the scheduler is shutting down.
 *
 * @return the result of {@link Policy#reloadConfig()}, or {@code false} when the scheduler
 *         is shutting down
 */
public boolean reloadPolicyConfiguration() {
    if (!status.isShuttingDown()) {
        return policy.reloadConfig();
    }
    logger.warn("Policy configuration can only be reloaded when Scheduler is up, current state : " + status);
    return false;
}
/**
 * Replaces the current policy with a new instance of the given class.
 * The new policy is only installed if its configuration loads successfully.
 *
 * @param newPolicyClassName fully qualified name of the policy class to instantiate
 * @return {@code true} if the policy was changed; {@code false} if the scheduler is shutting
 *         down or the new policy configuration could not be loaded
 * @throws InternalException if the class cannot be found, accessed or instantiated
 */
public boolean changePolicy(String newPolicyClassName) {
    if (status.isShuttingDown()) {
        logger.warn("Policy can only be changed when Scheduler is up, current state : " + status);
        return false;
    }

    final Policy candidate;
    try {
        //TODO class loading ? (for now, class must be in scheduler classpath or addons)
        candidate = (Policy) Class.forName(newPolicyClassName).newInstance();
    } catch (InstantiationException e) {
        logger.error("", e);
        throw new InternalException("Exception occurs while instanciating the policy !", e);
    } catch (IllegalAccessException e) {
        logger.error("", e);
        throw new InternalException("Exception occurs while accessing the policy !", e);
    } catch (ClassNotFoundException e) {
        logger.error("", e);
        throw new InternalException("Exception occurs while loading the policy class !", e);
    }

    if (!candidate.reloadConfig()) {
        return false;
    }

    // configuration loaded: the candidate becomes the active policy
    policy = candidate;
    listener.schedulerStateUpdated(SchedulerEvent.POLICY_CHANGED);
    logger.info("Policy changed ! new policy name : " + newPolicyClassName);
    return true;
}
/**
 * Links the scheduler to the resource manager at the given url.
 * <p>
 * On success the url is remembered as the last known RM url, so that a later automatic
 * reconnection targets the resource manager that was linked most recently rather than the
 * one known at construction time. If the scheduler was {@link SchedulerStatus#UNLINKED},
 * it is switched back to {@link SchedulerStatus#STARTED}.
 *
 * @param rmURL url of the resource manager to link; surrounding whitespace is ignored
 * @return {@code true} when the resource manager has been linked
 * @throws InternalException if the url is invalid or the rebind fails
 */
public boolean linkResourceManager(String rmURL) {
    try {
        //re-link the RM
        URI newRmUri = new URI(rmURL.trim());
        getInfrastructure().getRMProxiesManager().rebindRMProxiesManager(newRmUri);
        // only remember the url once the rebind has succeeded
        lastRmUrl = newRmUri;
        logger.info("New resource manager has been linked to the scheduler");
        if (status == SchedulerStatus.UNLINKED) {
            logger.info("Resume to continue the scheduling.");
            listener.schedulerStateUpdated(SchedulerEvent.RM_UP);
            //restart the scheduler
            status = SchedulerStatus.STARTED;
            listener.schedulerStateUpdated(SchedulerEvent.STARTED);
        }
        return true;
    } catch (Exception e) {
        throw new InternalException("Error while connecting the new Resource Manager !", e);
    }
}
/**
 * @return the infrastructure this service was built with
 */
public SchedulingInfrastructure getInfrastructure() {
return infrastructure;
}
/**
 * Delegates to {@link LiveJobs#lockJobsToSchedule()}.
 * Should be called only by scheduling method impl when job scheduling starts.
 *
 * @return the job descriptors returned by the live jobs container, keyed by job id
 */
public Map<JobId, JobDescriptor> lockJobsToSchedule() {
return jobs.lockJobsToSchedule();
}
/**
 * Delegates to the live jobs container to unlock the given job descriptors.
 * Should be called only by scheduling method impl after job scheduling finished.
 *
 * @param jobDescriptors descriptors to unlock
 */
public void unlockJobsToSchedule(Collection<JobDescriptor> jobDescriptors) {
jobs.unlockJobsToSchedule(jobDescriptors);
}
/**
 * Forwards a task start to the live jobs container.
 * Should be called only by scheduling method impl while it holds job lock.
 *
 * @param job      job owning the task
 * @param task     task that has been started
 * @param launcher launcher used for the task
 */
public void taskStarted(InternalJob job, InternalTask task, TaskLauncher launcher) {
jobs.taskStarted(job, task, launcher);
}
/**
 * Asynchronously simulates the start of the given tasks and handles the resulting termination
 * data on the internal operations thread pool.
 * Should be called only by scheduling method impl while it holds job lock.
 *
 * @param tasksToSchedule tasks whose start is simulated
 * @param errorMsg        error message passed to the live jobs container
 */
public void simulateJobStartAndCancelIt(final List<EligibleTaskDescriptor> tasksToSchedule, final String errorMsg) {
    final Runnable simulation = new Runnable() {
        public void run() {
            TerminationData data = jobs.simulateJobStart(tasksToSchedule, errorMsg);
            try {
                data.handleTermination(SchedulingService.this);
            } catch (Exception failure) {
                logger.error("Exception occurred, fail to get variables into the cleaning script: ", failure);
            }
        }
    };
    infrastructure.getInternalOperationsThreadPool().submit(simulation);
}
/**
 * Submits the job through a {@code SubmitHandler} on the client operations thread pool and
 * waits for the submission to complete.
 *
 * @param job job to submit
 */
public void submitJob(InternalJob job) {
    try {
        final SubmitHandler handler = new SubmitHandler(this, job);
        infrastructure.getClientOperationsThreadPool().submit(handler).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Pauses the given job on the client operations thread pool and waits for the result.
 *
 * @param jobId id of the job to pause
 * @return the result reported by the live jobs container, or {@code false} when the
 *         scheduler is shutting down
 */
public boolean pauseJob(final JobId jobId) {
    final Callable<Boolean> pauseOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            return jobs.pauseJob(jobId);
        }
    };
    try {
        if (status.isShuttingDown()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(pauseOperation).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Updates the start-at value of the given job on the client operations thread pool and waits
 * for the result.
 *
 * @param jobId   id of the job to update
 * @param startAt new start-at value, passed as is to the live jobs container
 * @return the result reported by the live jobs container, or {@code false} when the
 *         scheduler is shutting down
 */
public boolean changeStartAt(final JobId jobId, final String startAt) {
    final Callable<Boolean> updateOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            return jobs.updateStartAt(jobId, startAt);
        }
    };
    try {
        if (status.isShuttingDown()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(updateOperation).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Restarts all in-error tasks of the given job on the client operations thread pool, wakes up
 * the scheduling thread and waits for the result.
 *
 * @param jobId id of the job whose in-error tasks are restarted
 * @return the result reported by the live jobs container
 */
public boolean restartAllInErrorTasks(final JobId jobId) {
    final Callable<Boolean> restartOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            Boolean restarted = jobs.restartAllInErrorTasks(jobId);
            wakeUpSchedulingThread();
            return restarted;
        }
    };
    try {
        return infrastructure.getClientOperationsThreadPool().submit(restartOperation).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Resumes the given job on the client operations thread pool, wakes up the scheduling thread
 * and waits for the result.
 *
 * @param jobId id of the job to resume
 * @return the result reported by the live jobs container
 */
public boolean resumeJob(final JobId jobId) {
    final Callable<Boolean> resumeOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            Boolean resumed = jobs.resumeJob(jobId);
            wakeUpSchedulingThread();
            return resumed;
        }
    };
    try {
        return infrastructure.getClientOperationsThreadPool().submit(resumeOperation).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Changes the priority of the given job on the client operations thread pool, wakes up the
 * scheduling thread and waits for completion. Does nothing when the scheduler is shutting down.
 *
 * @param jobId    id of the job to update
 * @param priority new priority
 */
public void changeJobPriority(final JobId jobId, final JobPriority priority) {
    if (status.isShuttingDown()) {
        return;
    }
    final Runnable priorityChange = new Runnable() {
        @Override
        public void run() {
            jlogger.info(jobId, "request to change the priority to " + priority);
            jobs.changeJobPriority(jobId, priority);
            wakeUpSchedulingThread();
        }
    };
    try {
        infrastructure.getClientOperationsThreadPool().submit(priorityChange).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Removes the given job through a {@code JobRemoveHandler} on the client operations thread
 * pool and waits for the result.
 *
 * @param jobId id of the job to remove
 * @return the result produced by the remove handler
 */
public boolean removeJob(JobId jobId) {
    try {
        final JobRemoveHandler handler = new JobRemoveHandler(this, jobId);
        return infrastructure.getClientOperationsThreadPool().submit(handler).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Registers the given job for removal at the given time, provided the job can still be loaded
 * from the database as not removed.
 *
 * @param jobId id of the job to remove
 * @param at    removal time, forwarded unchanged to the DB manager
 */
public void scheduleJobRemove(JobId jobId, long at) {
    final InternalJob loaded = infrastructure.getDBManager().loadJobWithTasksIfNotRemoved(jobId);
    final boolean removeFromDb = PASchedulerProperties.JOB_REMOVE_FROM_DB.getValueAsBoolean();
    if (loaded == null) {
        return;
    }
    infrastructure.getDBManager().scheduleJobForRemoval(loaded.getJobInfo().getJobId(), at, removeFromDb);
}
/**
 * Asynchronously restarts the given task after a node failure, handles the resulting
 * termination data and wakes up the scheduling thread. Does nothing when the scheduler
 * status is unusable.
 *
 * @param task task to restart
 */
public void restartTaskOnNodeFailure(final InternalTask task) {
    if (status.isUnusable()) {
        return;
    }
    final Runnable restart = new Runnable() {
        @Override
        public void run() {
            TerminationData data = jobs.restartTaskOnNodeFailure(task);
            try {
                data.handleTermination(SchedulingService.this);
            } catch (Exception failure) {
                logger.error("Exception occurred, fail to get variables into the cleaning script: ", failure);
            }
            wakeUpSchedulingThread();
        }
    };
    infrastructure.getInternalOperationsThreadPool().submit(restart);
}
class TerminationDataHandler implements Runnable {
private final TerminationData terminationData;
public TerminationDataHandler(TerminationData terminationData) {
this.terminationData = terminationData;
}
public void run() {
try {
terminationData.handleTermination(SchedulingService.this);
} catch (Exception e) {
logger.error("Exception occurred, fail to get variables into the cleaning script:", e);
}
}
}
/**
 * Kills the given job on the client operations thread pool, submits the resulting termination
 * data for handling, wakes up the scheduling thread and waits for the result.
 *
 * @param jobId id of the job to kill
 * @return whether the termination data reports the job as terminated, or {@code false} when
 *         the scheduler status is unusable
 */
public boolean killJob(final JobId jobId) {
    final Callable<Boolean> killOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            TerminationData data = jobs.killJob(jobId);
            boolean killed = data.jobTerminated(jobId);
            submitTerminationDataHandler(data);
            wakeUpSchedulingThread();
            return killed;
        }
    };
    try {
        if (status.isUnusable()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(killOperation).get();
    } catch (Exception failure) {
        throw handleFutureWaitException(failure);
    }
}
/**
 * Submits a {@link TerminationDataHandler} for the given data to the internal operations
 * thread pool; empty termination data is ignored.
 *
 * @param terminationData data to handle
 */
void submitTerminationDataHandler(TerminationData terminationData) {
    if (terminationData.isEmpty()) {
        return;
    }
    final TerminationDataHandler handler = new TerminationDataHandler(terminationData);
    getInfrastructure().getInternalOperationsThreadPool().submit(handler);
}
/**
 * Kills the named task of the given job on the client operations thread pool, submits the
 * resulting termination data for handling, wakes up the scheduling thread and waits for the result.
 *
 * @param jobId    id of the job owning the task
 * @param taskName name of the task to kill
 * @return whether the termination data reports the task as terminated, or {@code false} when
 *         the scheduler status is unusable
 * @throws UnknownJobException  if the operation failed with this exception
 * @throws UnknownTaskException if the operation failed with this exception
 */
public boolean killTask(final JobId jobId, final String taskName) throws UnknownJobException, UnknownTaskException {
    final Callable<Boolean> killOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            TerminationData data = jobs.killTask(jobId, taskName);
            boolean killed = data.taskTerminated(jobId, taskName);
            submitTerminationDataHandler(data);
            wakeUpSchedulingThread();
            return killed;
        }
    };
    try {
        if (status.isUnusable()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(killOperation).get();
    } catch (ExecutionException wrapped) {
        // unwrap the checked exceptions declared by this method, launder everything else
        final Throwable cause = wrapped.getCause();
        if (cause instanceof UnknownTaskException) {
            throw (UnknownTaskException) cause;
        }
        if (cause instanceof UnknownJobException) {
            throw (UnknownJobException) cause;
        }
        throw launderThrowable(cause);
    } catch (Exception failure) {
        throw launderThrowable(failure);
    }
}
/**
 * Restarts the named task of the given job on the client operations thread pool, submits the
 * resulting termination data for handling, wakes up the scheduling thread and waits for the result.
 *
 * @param jobId        id of the job owning the task
 * @param taskName     name of the task to restart
 * @param restartDelay delay forwarded unchanged to the live jobs container
 * @return whether the termination data reports the task as terminated, or {@code false} when
 *         the scheduler status is unusable
 * @throws UnknownJobException  if the operation failed with this exception
 * @throws UnknownTaskException if the operation failed with this exception
 */
public boolean restartTask(final JobId jobId, final String taskName, final int restartDelay)
        throws UnknownJobException, UnknownTaskException {
    final Callable<Boolean> restartOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            TerminationData data = jobs.restartTask(jobId, taskName, restartDelay);
            boolean restarted = data.taskTerminated(jobId, taskName);
            submitTerminationDataHandler(data);
            wakeUpSchedulingThread();
            return restarted;
        }
    };
    try {
        if (status.isUnusable()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(restartOperation).get();
    } catch (ExecutionException wrapped) {
        // unwrap the checked exceptions declared by this method, launder everything else
        final Throwable cause = wrapped.getCause();
        if (cause instanceof UnknownTaskException) {
            throw (UnknownTaskException) cause;
        }
        if (cause instanceof UnknownJobException) {
            throw (UnknownJobException) cause;
        }
        throw launderThrowable(cause);
    } catch (Exception failure) {
        throw launderThrowable(failure);
    }
}
/**
 * Finishes the named in-error task of the given job on the client operations thread pool,
 * submits the resulting termination data for handling, wakes up the scheduling thread and
 * waits for the result.
 *
 * @param jobId    id of the job owning the task
 * @param taskName name of the in-error task to finish
 * @return whether the termination data reports the task as terminated, or {@code false} when
 *         the scheduler status is unusable
 * @throws UnknownJobException  if the operation failed with this exception
 * @throws UnknownTaskException if the operation failed with this exception
 */
public boolean finishInErrorTask(final JobId jobId, final String taskName)
        throws UnknownJobException, UnknownTaskException {
    final Callable<Boolean> finishOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            TerminationData data = jobs.finishInErrorTask(jobId, taskName);
            boolean finished = data.taskTerminated(jobId, taskName);
            submitTerminationDataHandler(data);
            wakeUpSchedulingThread();
            return finished;
        }
    };
    try {
        if (status.isUnusable()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(finishOperation).get();
    } catch (ExecutionException wrapped) {
        // unwrap the checked exceptions declared by this method, launder everything else
        final Throwable cause = wrapped.getCause();
        if (cause instanceof UnknownTaskException) {
            throw (UnknownTaskException) cause;
        }
        if (cause instanceof UnknownJobException) {
            throw (UnknownJobException) cause;
        }
        throw launderThrowable(cause);
    } catch (Exception failure) {
        throw launderThrowable(failure);
    }
}
/**
 * Restarts the named in-error task of the given job on the client operations thread pool,
 * wakes up the scheduling thread and waits for completion.
 *
 * @param jobId    id of the job owning the task
 * @param taskName name of the in-error task to restart
 * @return {@code true} once the operation completed, or {@code false} when the scheduler
 *         status is unusable
 * @throws UnknownJobException  if the operation failed with this exception
 * @throws UnknownTaskException if the operation failed with this exception
 */
public boolean restartInErrorTask(final JobId jobId, final String taskName)
        throws UnknownJobException, UnknownTaskException {
    final Callable<Boolean> restartOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            jobs.restartInErrorTask(jobId, taskName);
            wakeUpSchedulingThread();
            return Boolean.TRUE;
        }
    };
    try {
        if (status.isUnusable()) {
            return false;
        }
        return infrastructure.getClientOperationsThreadPool().submit(restartOperation).get();
    } catch (ExecutionException wrapped) {
        // unwrap the checked exceptions declared by this method, launder everything else
        final Throwable cause = wrapped.getCause();
        if (cause instanceof UnknownTaskException) {
            throw (UnknownTaskException) cause;
        }
        if (cause instanceof UnknownJobException) {
            throw (UnknownJobException) cause;
        }
        throw launderThrowable(cause);
    } catch (Exception failure) {
        throw launderThrowable(failure);
    }
}
/**
 * Preempts the named task of the given job on the client operations thread pool, submits the
 * resulting termination data for handling, wakes up the scheduling thread and waits for the result.
 *
 * @param jobId        id of the job owning the task
 * @param taskName     name of the task to preempt
 * @param restartDelay delay forwarded unchanged to the live jobs container
 * @return whether the termination data reports the task as terminated
 * @throws UnknownJobException  if the operation failed with this exception
 * @throws UnknownTaskException if the operation failed with this exception
 */
public boolean preemptTask(final JobId jobId, final String taskName, final int restartDelay)
        throws UnknownJobException, UnknownTaskException {
    final Callable<Boolean> preemptOperation = new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
            TerminationData data = jobs.preemptTask(jobId, taskName, restartDelay);
            boolean preempted = data.taskTerminated(jobId, taskName);
            submitTerminationDataHandler(data);
            wakeUpSchedulingThread();
            return preempted;
        }
    };
    try {
        return infrastructure.getClientOperationsThreadPool().submit(preemptOperation).get();
    } catch (ExecutionException wrapped) {
        // unwrap the checked exceptions declared by this method, launder everything else
        final Throwable cause = wrapped.getCause();
        if (cause instanceof UnknownTaskException) {
            throw (UnknownTaskException) cause;
        }
        if (cause instanceof UnknownJobException) {
            throw (UnknownJobException) cause;
        }
        throw launderThrowable(cause);
    } catch (Exception failure) {
        throw launderThrowable(failure);
    }
}
/**
 * Registers a log listener for the given job on the client operations thread pool and waits
 * for completion.
 *
 * @param jobId            id of the job whose logs are listened to
 * @param appenderProvider provider of the appender receiving the logs
 * @throws UnknownJobException if the operation failed with this exception
 */
public void listenJobLogs(final JobId jobId, final AppenderProvider appenderProvider) throws UnknownJobException {
    final Callable<Void> listenOperation = new Callable<Void>() {
        @Override
        public Void call() throws UnknownJobException {
            getListenJobLogsSupport().listenJobLogs(jobId, appenderProvider);
            return null;
        }
    };
    try {
        infrastructure.getClientOperationsThreadPool().submit(listenOperation).get();
    } catch (ExecutionException wrapped) {
        final Throwable cause = wrapped.getCause();
        if (cause instanceof UnknownJobException) {
            throw (UnknownJobException) cause;
        }
        throw launderThrowable(cause);
    } catch (Exception failure) {
        throw launderThrowable(failure);
    }
}
/**
 * Asynchronously records the termination of a task with its result, handles the resulting
 * termination data and wakes up the scheduling thread. Any failure is logged, not propagated.
 *
 * @param taskId     id of the terminated task
 * @param taskResult result of the task; cast to {@link TaskResultImpl} when processed
 */
public void taskTerminatedWithResult(final TaskId taskId, final TaskResult taskResult) {
    final Runnable termination = new Runnable() {
        @Override
        public void run() {
            try {
                TaskResultImpl result = (TaskResultImpl) taskResult;
                TerminationData data = jobs.taskTerminatedWithResult(taskId, result);
                data.handleTermination(SchedulingService.this);
                wakeUpSchedulingThread();
            } catch (Throwable failure) {
                logger.error("Failed to terminate task " + taskId, failure);
            }
        }
    };
    infrastructure.getInternalOperationsThreadPool().submit(termination);
}
/**
 * Logs an unexpected scheduling-thread exception, then checks the connection to the resource
 * manager and tries to re-establish it. Failures of that check are logged as well.
 *
 * @param t the unexpected exception
 */
void handleException(Throwable t) {
    logger.error("Unexpected exception in the scheduling thread - checking the connection to resource manager", t);
    try {
        // verifies the RM connection and rebinds the proxies when it is down
        checkAndReconnectRM();
    } catch (Exception reconnectFailure) {
        logger.error("Error while reconnecting to the resource manager", reconnectFailure);
    }
}
/**
 * Check the connection to the RM by rebinding the RM proxies to the last known RM url.
 * If automatic reconnection is enabled, this method performs up to n reconnection attempts
 * before returning the result; otherwise a single attempt is made.
 * These parameters can be set in the configuration :
 * - Enabling/Disabling automatic reconnection: pa.scheduler.core.rmconnection.autoconnect
 *   (treated as disabled by this method when the property is not set)
 * - Delay in ms between 2 consecutive attempts: pa.scheduler.core.rmconnection.timespan (default is 5000 ms)
 * - Maximum number of attempts: pa.scheduler.core.rmconnection.attempts (default is 10)
 * <p>
 * When every attempt fails, all proxies are terminated, the scheduler is marked as unlinked
 * and an {@link SchedulerEvent#RM_DOWN} event is sent.
 * <p>
 * If the calling thread is interrupted while waiting between attempts, the remaining attempts
 * are still performed and the interrupt status is restored before returning, so that the
 * caller can observe the interruption.
 *
 * @return true if the RM is alive, false otherwise.
 */
private boolean checkAndReconnectRM() {
    // Result of the method.
    boolean alive = false;
    // Checks if the option is enabled (false when not set)
    boolean autoReconnectRM = PASchedulerProperties.SCHEDULER_RMCONNECTION_AUTO_CONNECT.isSet() ? PASchedulerProperties.SCHEDULER_RMCONNECTION_AUTO_CONNECT.getValueAsBoolean()
                                                                                                : false;
    // Delay (in ms) between each connection attempts (5s by default)
    int timespan = PASchedulerProperties.SCHEDULER_RMCONNECTION_TIMESPAN.isSet() ? PASchedulerProperties.SCHEDULER_RMCONNECTION_TIMESPAN.getValueAsInt()
                                                                                 : 5000;
    // Maximum number of attempts (10 by default)
    int maxAttempts = PASchedulerProperties.SCHEDULER_RMCONNECTION_ATTEMPTS.isSet() ? PASchedulerProperties.SCHEDULER_RMCONNECTION_ATTEMPTS.getValueAsInt()
                                                                                    : 10;
    // If the options is disabled or the number of attempts is wrong, it is set to 1
    if (!autoReconnectRM || maxAttempts <= 0) {
        maxAttempts = 1;
    }
    // Check the timespan option
    if (timespan <= 0) {
        timespan = 5000;
    }
    // Save the url in a string of the last connected RM.
    String rmURL = this.lastRmUrl.toString();
    int nbAttempts = 1;
    // Thread.sleep clears the interrupt status when it throws; remember it to restore it later.
    boolean interrupted = false;
    logger.info("Automatically reconnecting to RM at url " + rmURL + "...");
    try {
        while (!alive && nbAttempts <= maxAttempts) {
            try {
                infrastructure.getRMProxiesManager().rebindRMProxiesManager(new URI(rmURL));
                logger.info("Successfully reconnected to Resource Manager at " + rmURL);
                alive = true;
            } catch (Exception rme) {
                alive = false;
                if (nbAttempts != maxAttempts) {
                    try {
                        // Sleep before two attempts
                        logger.info("Waiting " + timespan + " ms before the next attempt...");
                        Thread.sleep(timespan);
                    } catch (InterruptedException ex) {
                        logger.error("An exception has occurred while waiting.");
                        interrupted = true;
                    }
                }
            }
            nbAttempts++;
        }
        if (!alive) {
            logger.info("Resource Manager seems to be dead.");
            // Disconnect proxies and freeze the scheduler.
            clearProxiesAndFreeze();
            logger.fatal("\n*****************************************************************************************************************\n" +
                         "* Resource Manager is no more available, Scheduler has been paused waiting for a resource manager to be reconnect\n" +
                         "* Scheduler is in critical state and its functionalities are reduced : \n" +
                         "* \t-> use the linkrm(\"" + rmURL +
                         "\") command in scheduler-client to reconnect a new one.\n" +
                         "*****************************************************************************************************************");
            listener.schedulerStateUpdated(SchedulerEvent.RM_DOWN);
        }
    } finally {
        if (interrupted) {
            // do not lose the interruption request
            Thread.currentThread().interrupt();
        }
    }
    return alive;
}
/**
 * Terminate all proxies and freeze the scheduler.
 * The status is then forced to {@link SchedulerStatus#UNLINKED}, whatever the outcome of
 * {@link #freeze()}.
 */
private void clearProxiesAndFreeze() {
// Terminate proxy and disconnect RM
logger.error("Resource Manager will be disconnected");
infrastructure.getRMProxiesManager().terminateAllProxies();
// freeze() only changes the status (and notifies listeners) when the current status is freezable
freeze();
// from now on the scheduler is unlinked and its functionality is reduced
status = SchedulerStatus.UNLINKED;
}
/**
 * Converts an exception raised while waiting on a future into an unchecked exception.
 * <p>
 * For an {@link ExecutionException} the cause is laundered; any other exception is laundered
 * as is. When the wait was aborted by an {@link InterruptedException}, the interrupt status
 * of the current thread is restored before the exception is converted, so that the
 * interruption request is not lost.
 *
 * @param e exception caught around the future wait
 * @return the unchecked exception to throw
 */
static RuntimeException handleFutureWaitException(Exception e) {
    if (e instanceof ExecutionException) {
        return launderThrowable(e.getCause());
    }
    if (e instanceof InterruptedException) {
        // the blocking call cleared the flag when it threw; set it again for the callers
        Thread.currentThread().interrupt();
    }
    return launderThrowable(e);
}
/**
 * Coerces a throwable into an unchecked exception.
 *
 * @param t throwable to convert
 * @return {@code t} itself when it is a {@link RuntimeException}
 * @throws Error                 {@code t} itself when it is an {@link Error}
 * @throws IllegalStateException wrapping {@code t} when it is neither of the above
 */
static RuntimeException launderThrowable(Throwable t) {
    if (t instanceof Error) {
        throw (Error) t;
    }
    if (t instanceof RuntimeException) {
        return (RuntimeException) t;
    }
    throw new IllegalStateException("Not unchecked", t);
}
/**
 * Cleans the log listeners of a terminated job and, when an automatic removal delay is
 * configured, schedules the job removal. Any failure is logged as a warning.
 *
 * @param jobId id of the terminated job
 */
void terminateJobHandling(final JobId jobId) {
    try {
        listenJobLogsSupport.cleanLoggers(jobId);
        // automatic removal is disabled when the delay is not strictly positive
        if (SchedulingService.SCHEDULER_AUTO_REMOVED_JOB_DELAY <= 0) {
            return;
        }
        final long removalTime = System.currentTimeMillis() + SchedulingService.SCHEDULER_AUTO_REMOVED_JOB_DELAY;
        scheduleJobRemove(jobId, removalTime);
    } catch (Throwable failure) {
        logger.warn("", failure);
    }
}
/**
 * Restores the service from a recovered scheduler state.
 * <p>
 * Pending and running jobs are handed back to the live jobs container, task states are
 * recomputed for all recovered jobs, and, when a removal delay is configured, the removal of
 * finished jobs is scheduled again.
 *
 * @param recoveredState state loaded at startup
 */
private void recover(RecoveredSchedulerState recoveredState) {
    Vector<InternalJob> finishedJobs = recoveredState.getFinishedJobs();
    Vector<InternalJob> pendingJobs = recoveredState.getPendingJobs();
    Vector<InternalJob> runningJobs = recoveredState.getRunningJobs();
    jobsRecovered(pendingJobs);
    jobsRecovered(runningJobs);
    recoverTasksState(finishedJobs, false);
    recoverTasksState(runningJobs, true);
    recoverTasksState(pendingJobs, true);
    if (SCHEDULER_REMOVED_JOB_DELAY > 0 || SCHEDULER_AUTO_REMOVED_JOB_DELAY > 0) {
        logger.debug("Removing non-managed jobs");
        for (InternalJob job : recoveredState.getFinishedJobs()) {
            //re-set job removed delay (if job result has been sent to user)
            final long toWait;
            if (job.isToBeRemoved()) {
                if (SCHEDULER_REMOVED_JOB_DELAY == 0 || SCHEDULER_AUTO_REMOVED_JOB_DELAY == 0) {
                    // only one of the two delays is configured: use that one
                    toWait = SCHEDULER_REMOVED_JOB_DELAY + SCHEDULER_AUTO_REMOVED_JOB_DELAY;
                } else {
                    toWait = Math.min(SCHEDULER_REMOVED_JOB_DELAY, SCHEDULER_AUTO_REMOVED_JOB_DELAY);
                }
            } else {
                toWait = SCHEDULER_AUTO_REMOVED_JOB_DELAY;
            }
            if (toWait > 0) {
                scheduleJobRemove(job.getId(), System.currentTimeMillis() + toWait);
                // report the delay that was actually scheduled
                jlogger.debug(job.getId(), "will be removed in " + (toWait / 1000) + "sec");
            }
        }
    }
}
/**
 * Recomputes, for each given job, the number of faulty tasks from the task statuses, records
 * the ids of tasks waiting on error, and warns when the count differs from the one stored on
 * the job. Optionally restores in-error tasks on the job descriptor.
 *
 * @param jobs                jobs to inspect
 * @param restoreInErrorTasks whether in-error tasks must be restored on the job descriptor
 */
private void recoverTasksState(Vector<InternalJob> jobs, boolean restoreInErrorTasks) {
    for (InternalJob job : jobs) {
        int faultyTasksCount = 0;
        for (InternalTask internalTask : job.getITasks()) {
            switch (internalTask.getStatus()) {
                case WAITING_ON_ERROR:
                    job.saveFaultyTaskId(internalTask.getId());
                    faultyTasksCount++;
                    break;
                case FAULTY:
                    faultyTasksCount++;
                    break;
                default:
                    break;
            }
        }

        final boolean countMatches = faultyTasksCount == job.getNumberOfFaultyTasks();
        if (!countMatches) {
            logger.warn("Number of faulty tasks saved in DB for Job " + job.getId() +
                        " does not match the one computed using task statuses");
        }

        if (restoreInErrorTasks) {
            job.getJobDescriptor().restoreInErrorTasks();
        }
    }
}
/**
 * Hands the recovered jobs back to the live jobs container and restarts the dataspace
 * application of the jobs that are stalled or running.
 *
 * @param jobs recovered jobs
 */
private void jobsRecovered(Collection<InternalJob> jobs) {
    final DataSpaceServiceStarter dsStarter = infrastructure.getDataSpaceServiceStarter();
    for (InternalJob recovered : jobs) {
        this.jobs.jobRecovered(recovered);
        switch (recovered.getStatus()) {
            case STALLED:
            case RUNNING:
                //start dataspace app for this job
                recovered.startDataSpaceApplication(dsStarter.getNamingService(), recovered.getITasks());
                break;
            default:
                // nothing to restart for the other statuses
                break;
        }
    }
}
/**
 * Reads the progress of a running task from its launcher and, when it changed, stores it on
 * the task and notifies the listener. If anything fails while doing so, the node running the
 * task is pinged and a restart may be initiated.
 *
 * @param taskData running task to query; ignored when the live jobs container reports that
 *                 it cannot be pinged
 */
void getProgressAndPingTaskNode(RunningTaskData taskData) {
    if (!jobs.canPingTask(taskData)) {
        return;
    }
    final InternalTask task = taskData.getTask();
    try {
        final int reportedProgress = taskData.getLauncher().getProgress();
        // only update and notify when the value differs from the one stored on the task
        if (reportedProgress != task.getProgress()) {
            task.setProgress(reportedProgress);
            final TaskInfo snapshot = new TaskInfoImpl((TaskInfoImpl) task.getTaskInfo());
            listener.taskStateUpdated(taskData.getUser(),
                                      new NotificationData<TaskInfo>(SchedulerEvent.TASK_PROGRESS, snapshot));
        }
    } catch (Throwable failure) {
        tlogger.debug(task.getId(), "TaskLauncher is not accessible, checking if the node can be reached.", failure);
        pingTaskNodeAndInitiateRestart(task);
    }
}
/**
 * Pings the node executing the given task. Each failed ping increases the ping attempt
 * counter of the running task; once the counter exceeds the configured number of attempts,
 * the task restart is initiated. Does nothing when the task is no longer running.
 *
 * @param task task whose executing node is pinged
 */
private void pingTaskNodeAndInitiateRestart(InternalTask task) {
    final RunningTaskData runningTask = jobs.getRunningTask(task.getId());
    if (runningTask == null) {
        return;
    }

    // Only the node executing the task is pinged, to make sure the failure is a node failure.
    // Other nodes reserved for the task are not considered here: managing the lifecycle of
    // extra nodes is up to the task itself in complex multi-node deployments.
    final Node executorNode = runningTask.getNodeExecutor();
    try {
        executorNode.getNumberOfActiveObjects();
    } catch (Exception pingFailure) {
        final int attempts = runningTask.increaseAndGetPingAttempts();
        final String nodeUrl = executorNode.getNodeInformation().getURL();
        final boolean attemptsLeft = attempts <= PASchedulerProperties.SCHEDULER_NODE_PING_ATTEMPTS.getValueAsInt();
        if (attemptsLeft) {
            tlogger.warn(task.getId(),
                         "cannot contact node " + nodeUrl + " - waiting while it comes back, attempt " + attempts,
                         pingFailure);
        } else {
            tlogger.error(task.getId(), "node failed " + nodeUrl + ", initiate task restart.", pingFailure);
            restartTaskOnNodeFailure(task);
        }
    }
}
/**
 * Delegates to {@code SchedulingThread#sleepSchedulingThread()}.
 *
 * @throws InterruptedException propagated from the scheduling thread
 */
protected void sleepSchedulingThread() throws InterruptedException {
schedulingThread.sleepSchedulingThread();
}
/**
 * Delegates to {@code SchedulingThread#wakeUpSchedulingThread()}.
 */
protected void wakeUpSchedulingThread() {
schedulingThread.wakeUpSchedulingThread();
}
/**
 * Runnable performing the housekeeping: jobs reported by the DB manager as removable are
 * first removed from the in-memory context, then processed in the database.
 */
public class HousekeepingRunner implements Runnable {

    /**
     * Removes the given jobs from the live jobs container and notifies listeners for each job
     * that can still be loaded from the database.
     *
     * @param jobIdList ids of the jobs to remove
     * @return the numeric ids of all given jobs, in the same order
     */
    private List<Long> removeFromContext(List<JobId> jobIdList) {
        final List<Long> removedIds = new ArrayList<>(jobIdList.size());
        for (JobId jobId : jobIdList) {
            submitTerminationDataHandler(jobs.removeJob(jobId));

            final InternalJob job = getInfrastructure().getDBManager().loadJobWithTasksIfNotRemoved(jobId);
            if (job != null) {
                job.setRemovedTime(System.currentTimeMillis());
                ServerJobAndTaskLogs.remove(jobId);
                final JobInfo snapshot = new JobInfoImpl((JobInfoImpl) job.getJobInfo());
                getListener().jobStateUpdated(job.getOwner(),
                                              new NotificationData<JobInfo>(SchedulerEvent.JOB_REMOVE_FINISHED,
                                                                            snapshot));
                wakeUpSchedulingThread();
            }

            removedIds.add(jobId.longValue());
        }
        return removedIds;
    }

    /**
     * Runs the database housekeeping for the given job ids; an empty list is ignored.
     *
     * @param longJobIdList numeric ids of the jobs to process
     */
    private void removeFromDB(List<Long> longJobIdList) {
        if (longJobIdList.isEmpty()) {
            return;
        }
        final boolean removeFromDb = PASchedulerProperties.JOB_REMOVE_FROM_DB.getValueAsBoolean();
        getInfrastructure().getDBManager().executeHousekeepingInDB(longJobIdList, removeFromDb);
    }

    @Override
    public void run() {
        final long now = System.currentTimeMillis();
        final List<JobId> jobIdList = getInfrastructure().getDBManager().getJobsToRemove(now);

        // phase 1: in-memory context
        final long contextStart = System.currentTimeMillis();
        final List<Long> longJobIdList = removeFromContext(jobIdList);
        final long contextMillis = System.currentTimeMillis() - contextStart;

        // phase 2: database (sets the removed time, and deletes when JOB_REMOVE_FROM_DB asks for it)
        final long dbStart = System.currentTimeMillis();
        removeFromDB(longJobIdList);
        final long dbMillis = System.currentTimeMillis() - dbStart;

        logger.info("HOUSEKEEPING of jobs " + longJobIdList + " performed (Hibernate context removal took " +
                    contextMillis + " ms" + " and db removal took " + dbMillis + " ms)");
    }
}
}