/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ode.scheduler.simple;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
import javax.transaction.Status;
import javax.transaction.Synchronization;
import javax.transaction.SystemException;
import javax.transaction.Transaction;
import javax.transaction.TransactionManager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.helpers.AbsoluteTimeDateFormat;
import org.apache.ode.bpel.iapi.ClusterAware;
import org.apache.ode.bpel.iapi.ContextException;
import org.apache.ode.bpel.iapi.Scheduler;
import org.apache.ode.dao.scheduler.DatabaseException;
import org.apache.ode.dao.scheduler.JobDAO;
import org.apache.ode.dao.scheduler.SchedulerDAOConnection;
import org.apache.ode.dao.scheduler.SchedulerDAOConnectionFactory;
import org.apache.ode.dao.scheduler.Task;
/**
* A reliable and relatively simple scheduler that uses a database to persist information about
* scheduled tasks.
*
* The challenge is to achieve high performance in a small memory footprint without loss of reliability
* while supporting distributed/clustered configurations.
*
* The design is based around three time horizons: "immediate", "near future", and "everything else".
* Immediate jobs (i.e. jobs that are about to be up) are written to the database and kept in
* an in-memory priority queue. When they execute, they are removed from the database. Near future
* jobs are placed in the database and assigned to the current node, however they are not stored in
* memory. Periodically jobs are "upgraded" from near-future to immediate status, at which point they
* get loaded into memory. Jobs that are further out in time are placed in the database without a
* node identifier; when they are ready to be "upgraded" to near-future jobs they are assigned to one
* of the known live nodes. Recovery is rather straightforward, with stale node identifiers being
* reassigned to known good nodes.
*
* @author Maciej Szefler ( m s z e f l e r @ g m a i l . c o m )
*
*/
public class SimpleScheduler implements Scheduler, TaskRunner, ClusterAware {
/** Class-wide logger. */
private static final Log __log = LogFactory.getLog(SimpleScheduler.class);
/**
* Jobs scheduled with a time that is between [now, now+immediateInterval] will be assigned to the current node, and placed
* directly on the todo queue. Expressed in milliseconds (it is added to System.currentTimeMillis()).
*/
long _immediateInterval = 30000;
/**
* Jobs scheduled with a time that is between (now+immediateInterval,now+nearFutureInterval) will be assigned to the current
* node, but will not be placed on the todo queue (the promoter will pick them up). Expressed in milliseconds.
*/
long _nearFutureInterval = 10 * 60 * 1000;
/** 10s of no communication and you are deemed dead. */
long _staleInterval = 10000;
/** Duration used to log a warning if a job scheduled at a date D is queued at D'>D+_warningDelay */
long _warningDelay = 5*60*1000;
/**
* Estimated sustained transactions-per-second capacity of the system.
* E.g. 100 means the system can process 100 jobs per second, on average.
* This number is used to determine how many jobs to load from the database at once.
*/
int _tps = 100;
/** Transaction manager used to begin, commit and roll back transactions; set to null by shutdown(). */
TransactionManager _txm;
/** Executor that runs jobs and scheduler tasks; start() creates a cached thread pool when none was injected. */
ExecutorService _exec;
/** Identifier of this node, passed to the DAO layer when inserting, deleting and dequeuing jobs. */
String _nodeId;
/** Maximum number of jobs in the "near future" / todo queue. */
int _todoLimit = 10000;
/** The object that actually handles the jobs. */
volatile JobProcessor _jobProcessor;
/** Processor handed the jobs whose details carry a "runnable" entry (see runTask / runPolledRunnable). */
volatile JobProcessor _polledRunnableProcessor;
/** Scheduler thread onto which job tasks and the periodic load/upgrade tasks are enqueued; set to null by shutdown(). */
private SchedulerThread _todo;
/** Factory for the DAO connections used to read and write persisted jobs. */
private SchedulerDAOConnectionFactory _dbcf;
/** Set of outstanding jobs, i.e., jobs that have been enqueued but not dequeued or dispatched yet.
Used to avoid cases where a job would be dispatched twice if the server is under high load and
does not fully process a job before it is reloaded from the database. */
private ConcurrentHashMap<String, Long> _outstandingJobs = new ConcurrentHashMap<String, Long>();
/** Set of jobs processed since the last LoadImmediate task.
This prevents a race condition where a job is processed twice. This could happen if a LoadImmediate task loads a job
from the db before the job is processed but puts it in the _outstandingJobs map after the job was processed.
In such a case the job is no longer in the _outstandingJobs map, and so it's queued again. */
private ConcurrentHashMap<String, Long> _processedSinceLastLoadTask = new ConcurrentHashMap<String, Long>();
/**
* Set of jobs that need to be retried.
*/
private ConcurrentHashMap<String, Boolean> _retryJobList = new ConcurrentHashMap<String, Boolean>();
/** True between a successful start() and the next stop(). */
private boolean _running;
/** Time for next upgrade. */
private AtomicLong _nextUpgrade = new AtomicLong();
/** Source of randomness used by randomMean(). */
private Random _random = new Random();
/**
* Delay, in milliseconds, before a polled-runnable job that has not reported COMPLETED is rescheduled.
* Defaults to the system property org.apache.ode.polledRunnable.pollInterval, or 10 minutes when it is unset.
*/
private long _pollIntervalForPolledRunnable = Long.getLong("org.apache.ode.polledRunnable.pollInterval", 10 * 60 * 1000);
/** Number of immediate retries when the transaction fails **/
private int _immediateTransactionRetryLimit = 3;
/** Interval between immediate retries when the transaction fails **/
private long _immediateTransactionRetryInterval = 1000;
/** Fallback node list holding the node id given to the constructor; returned by getNodeList() when no list was set. */
private List<String> _defaultNodeList = new ArrayList<String>();
/** Node list supplied through setNodeList(). */
private List<String> _nodeList = new ArrayList<String>();
/**
 * Creates a scheduler bound to the given node.
 *
 * @param nodeId identifier of this node; also becomes the single entry of the default node list
 * @param dbcf   factory for the DAO connections used to persist jobs
 * @param txm    transaction manager used by {@code execTransaction}
 * @param conf   configuration; every {@code ode.scheduler.*} key is optional and falls back to
 *               the built-in default of the corresponding field
 */
public SimpleScheduler(String nodeId, SchedulerDAOConnectionFactory dbcf, TransactionManager txm, Properties conf) {
    this._nodeId = nodeId;
    this._dbcf = dbcf;
    this._txm = txm;

    // Tunables: a missing property leaves the field at its current (default) value.
    this._todoLimit = getIntProperty(conf, "ode.scheduler.queueLength", this._todoLimit);
    this._immediateInterval = getLongProperty(conf, "ode.scheduler.immediateInterval", this._immediateInterval);
    this._nearFutureInterval = getLongProperty(conf, "ode.scheduler.nearFutureInterval", this._nearFutureInterval);
    this._staleInterval = getLongProperty(conf, "ode.scheduler.staleInterval", this._staleInterval);
    this._tps = getIntProperty(conf, "ode.scheduler.transactionsPerSecond", this._tps);
    this._warningDelay = getLongProperty(conf, "ode.scheduler.warningDelay", this._warningDelay);
    this._immediateTransactionRetryLimit =
            getIntProperty(conf, "ode.scheduler.immediateTransactionRetryLimit", this._immediateTransactionRetryLimit);
    this._immediateTransactionRetryInterval =
            getLongProperty(conf, "ode.scheduler.immediateTransactionRetryInterval", this._immediateTransactionRetryInterval);

    this._todo = new SchedulerThread(this);
    this._defaultNodeList.add(nodeId);
}
/**
 * Overrides the delay used when rescheduling a polled runnable that has not completed yet.
 *
 * @param pollIntervalForPolledRunnable the new delay, in milliseconds
 */
public void setPollIntervalForPolledRunnable(long pollIntervalForPolledRunnable) {
    this._pollIntervalForPolledRunnable = pollIntervalForPolledRunnable;
}
/**
 * Reads an integer setting.
 *
 * @param props        the configuration to read from
 * @param propName     the key to look up
 * @param defaultValue value returned when the key is absent
 * @return the parsed value, or {@code defaultValue} when the key is absent
 * @throws NumberFormatException if the key is present but not a valid integer
 */
private int getIntProperty(Properties props, String propName, int defaultValue) {
    final String raw = props.getProperty(propName);
    return (raw == null) ? defaultValue : Integer.parseInt(raw);
}
/**
 * Reads a long setting.
 *
 * @param props        the configuration to read from
 * @param propName     the key to look up
 * @param defaultValue value returned when the key is absent
 * @return the parsed value, or {@code defaultValue} when the key is absent
 * @throws NumberFormatException if the key is present but not a valid long
 */
private long getLongProperty(Properties props, String propName, long defaultValue) {
    final String raw = props.getProperty(propName);
    return (raw == null) ? defaultValue : Long.parseLong(raw);
}
/** Sets the identifier of this node. */
public void setNodeId(String nodeId) {
    this._nodeId = nodeId;
}

/** Sets the stale interval, in milliseconds. */
public void setStaleInterval(long staleInterval) {
    this._staleInterval = staleInterval;
}

/** Sets the width, in milliseconds, of the "immediate" time horizon. */
public void setImmediateInterval(long immediateInterval) {
    this._immediateInterval = immediateInterval;
}

/** Sets the width, in milliseconds, of the "near future" time horizon. */
public void setNearFutureInterval(long nearFutureInterval) {
    this._nearFutureInterval = nearFutureInterval;
}

/** Sets the estimated number of jobs the system can process per second. */
public void setTransactionsPerSecond(int tps) {
    this._tps = tps;
}

/** Replaces the transaction manager used to run transactions. */
public void setTransactionManager(TransactionManager txm) {
    this._txm = txm;
}

/** Replaces the factory used to obtain DAO connections. */
public void setSchedulerDAOConnectionFactory(SchedulerDAOConnectionFactory dbcf) {
    this._dbcf = dbcf;
}

/** Injects the executor on which jobs and scheduler tasks are run. */
public void setExecutorService(ExecutorService executorService) {
    this._exec = executorService;
}

/** Sets the processor that handles polled-runnable jobs. */
public void setPolledRunnableProcesser(JobProcessor polledRunnableProcessor) {
    this._polledRunnableProcessor = polledRunnableProcessor;
}
/**
 * Cancels a job: drops it from the in-memory queue and the outstanding set, then deletes it
 * from the database for this node.
 *
 * @param jobId identifier of the job to cancel
 * @throws ContextException if the database delete fails
 */
public void cancelJob(String jobId) throws ContextException {
    // Forget the job locally before touching the database.
    _todo.dequeue(new JobDAOTask(jobId));
    _outstandingJobs.remove(jobId);

    final SchedulerDAOConnection connection = _dbcf.getConnection();
    try {
        connection.deleteJob(jobId, _nodeId);
    } catch (DatabaseException dbFailure) {
        __log.debug("Job removal failed.", dbFailure);
        throw new ContextException("Job removal failed.", dbFailure);
    }
}
/**
 * Submits {@code transaction} to the executor so that it runs through
 * {@link #execTransaction(Callable)} on another thread.
 *
 * @param transaction the work to run
 * @return a future that yields the work's result, or {@code null} if the work failed
 *         (the failure is logged and the transaction abandoned)
 */
public <T> Future<T> execIsolatedTransaction(final Callable<T> transaction) throws Exception, ContextException {
    final Callable<T> isolated = new Callable<T>() {
        public T call() throws Exception {
            T outcome;
            try {
                outcome = execTransaction(transaction);
            } catch (Exception failure) {
                __log.error("An exception occured while executing an isolated transaction, " +
                        "the transaction is going to be abandoned.", failure);
                outcome = null;
            }
            return outcome;
        }
    };
    return _exec.submit(isolated);
}
/**
 * Runs {@code transaction} with the transaction manager's default timeout.
 *
 * @param transaction the work to run
 * @return whatever the work returns
 */
public <T> T execTransaction(Callable<T> transaction) throws Exception, ContextException {
    // A timeout of 0 asks for the default transaction timeout.
    final int defaultTimeout = 0;
    return execTransaction(transaction, defaultTimeout);
}
/**
 * Runs {@code transaction} inside a transaction.
 * <p>
 * If the calling thread already has a transaction, the work simply runs in it and this method
 * neither commits nor rolls back. Otherwise a new transaction is begun; it is committed when
 * the work returns normally and rolled back when the work throws. A failed attempt (exception
 * from the work, or from the commit) is retried up to the configured immediate-retry limit,
 * sleeping for the configured interval between attempts.
 * <p>
 * The result is returned only after the commit has succeeded; a commit failure counts as a
 * failed attempt and, once retries are exhausted, is thrown to the caller.
 *
 * @param transaction the work to run
 * @param timeout     transaction timeout in seconds; 0 selects the transaction manager's default
 * @return the value returned by {@code transaction}
 * @throws ContextException         if no transaction manager is available, or the current
 *                                  transaction cannot be obtained, or a new one cannot be begun
 * @throws IllegalArgumentException if {@code timeout} is negative
 * @throws Exception                the failure of the last attempt once retries are exhausted
 */
public <T> T execTransaction(Callable<T> transaction, int timeout) throws Exception, ContextException {
    // Work on a snapshot: shutdown() may null the field while this method is running.
    final TransactionManager txm = _txm;
    if (txm == null) {
        throw new ContextException("Cannot locate the transaction manager; the server might be shutting down.");
    }
    // The value of the timeout is in seconds. If the value is zero, the transaction service restores the default value.
    if (timeout < 0) {
        throw new IllegalArgumentException("Timeout must be positive, received: "+timeout);
    }

    boolean existingTransaction;
    try {
        existingTransaction = (txm.getTransaction() != null);
    } catch (Exception ex) {
        String errmsg = "Internal Error, could not get current transaction.";
        throw new ContextException(errmsg, ex);
    }

    // already in transaction, execute and return directly
    if (existingTransaction) {
        return transaction.call();
    }

    // run in new transaction
    Exception ex = null;
    int immediateRetryCount = _immediateTransactionRetryLimit;
    txm.setTransactionTimeout(timeout);
    if (__log.isDebugEnabled() && timeout != 0) __log.debug("Custom transaction timeout: "+timeout);
    try {
        do {
            try {
                if (__log.isDebugEnabled()) __log.debug("Beginning a new transaction");
                txm.begin();
            } catch (Exception e) {
                String errmsg = "Internal Error, could not begin transaction.";
                throw new ContextException(errmsg, e);
            }

            T result = null;
            ex = null;
            try {
                result = transaction.call();
            } catch (Exception e) {
                ex = e;
            } catch (Error fatal) {
                // Never commit after an Error: undo the work and let the Error propagate untouched.
                try {
                    txm.rollback();
                } catch (Exception rollbackFailure) {
                    __log.error("error in rolling back transaction", rollbackFailure);
                }
                throw fatal;
            }

            if (ex == null) {
                if (__log.isDebugEnabled()) {
                    __log.debug("Commiting on " + txm + "...");
                }
                try {
                    txm.commit();
                } catch (Exception e2) {
                    // A failed commit is a failed attempt: remember it so it is retried or rethrown.
                    ex = e2;
                    __log.error("error in commiting transaction", e2);
                }
                if (ex == null) {
                    if (__log.isDebugEnabled()) {
                        __log.debug("committed on " + txm + " successfully.");
                    }
                    // Only hand the result back once the commit has actually gone through.
                    return result;
                }
            } else {
                if (__log.isDebugEnabled()) {
                    __log.debug("Rollbacking on " + txm + "...");
                }
                txm.rollback();
            }

            // Reaching this point means the attempt failed (ex != null).
            if (immediateRetryCount > 0) {
                if (__log.isDebugEnabled()) __log.debug("Will retry the transaction in " + _immediateTransactionRetryInterval + " msecs on " + txm + " for error: ", ex);
                Thread.sleep(_immediateTransactionRetryInterval);
            }
        } while (immediateRetryCount-- > 0);
    } finally {
        // 0 restores the default value
        txm.setTransactionTimeout(0);
    }
    throw ex;
}
/**
 * Marks the current transaction so that its only possible outcome is a rollback.
 *
 * @throws ContextException if no transaction manager is available
 * @throws Exception        whatever the transaction manager throws
 */
public void setRollbackOnly() throws Exception {
    final TransactionManager manager = _txm;
    if (manager != null) {
        manager.setRollbackOnly();
        return;
    }
    throw new ContextException("Cannot locate the transaction manager; the server might be shutting down.");
}
/**
 * Registers {@code synch} with the current transaction so that it is called back before and
 * after the transaction completes.
 *
 * @param synch the callback; {@code afterCompletion} receives {@code true} only when the
 *              transaction status is {@code STATUS_COMMITTED}
 * @throws ContextException if no transaction manager is available or the registration fails
 */
public void registerSynchronizer(final Synchronizer synch) throws ContextException {
    final TransactionManager manager = _txm;
    if (manager == null) {
        throw new ContextException("Cannot locate the transaction manager; the server might be shutting down.");
    }

    // Adapter from the JTA callback interface to the scheduler's Synchronizer.
    final Synchronization adapter = new Synchronization() {
        public void beforeCompletion() {
            synch.beforeCompletion();
        }

        public void afterCompletion(int status) {
            final boolean committed = (status == Status.STATUS_COMMITTED);
            synch.afterCompletion(committed);
        }
    };

    try {
        manager.getTransaction().registerSynchronization(adapter);
    } catch (Exception cause) {
        throw new ContextException("Unable to register synchronizer.", cause);
    }
}
/**
 * Schedules a persisted, transacted job.
 *
 * @param jobDetail the job payload
 * @param when      when the job should run; {@code null} means now
 * @return the id of the new job, or {@code null} if the job was refused
 * @throws ContextException on database errors
 */
public String schedulePersistedJob(final JobDetails jobDetail, Date when) throws ContextException {
    final long now = System.currentTimeMillis();
    final Date dueDate = (when != null) ? when : new Date(now);
    if (__log.isDebugEnabled()) {
        __log.debug("scheduling " + jobDetail + " for " + dueDate);
    }
    return schedulePersistedJob(jobDetail, true, dueDate, now);
}
/**
 * Schedules a persisted, transacted job that wraps a runnable. The runnable is stored under the
 * {@code "runnable"} key of the job's extended details and is asked to store its own state in
 * the job details.
 *
 * @param runnable the runnable to wrap
 * @param when     when the job should run; {@code null} means now
 * @return the id of the new job, or {@code null} if the job was refused
 * @throws ContextException on database errors
 */
public String scheduleMapSerializableRunnable(MapSerializableRunnable runnable, Date when) throws ContextException {
    final long now = System.currentTimeMillis();
    final Date dueDate = (when != null) ? when : new Date(now);

    final JobDetails details = new JobDetails();
    details.getDetailsExt().put("runnable", runnable);
    runnable.storeToDetails(details);

    if (__log.isDebugEnabled()) {
        __log.debug("scheduling " + details + " for " + dueDate);
    }
    return schedulePersistedJob(details, true, dueDate, now);
}
/**
 * Inserts a persisted job, choosing its treatment from how far in the future it is due:
 * <ul>
 * <li>immediate (due within the immediate interval): assigned to this node, marked loaded and
 *     enqueued once the transaction commits; refused with {@code null} when the number of
 *     outstanding jobs exceeds the todo limit;</li>
 * <li>near future (due within the near-future interval): assigned to this node only;</li>
 * <li>far future: stored without a node id.</li>
 * </ul>
 *
 * @param jobDetails the job payload
 * @param transacted whether the job is transacted
 * @param when       the due date (never {@code null} here)
 * @param ctime      the "current time" reference, in milliseconds
 * @return the id of the new job, or {@code null} if an immediate job was refused
 * @throws ContextException on database errors
 */
private String schedulePersistedJob(JobDetails jobDetails, boolean transacted, Date when, long ctime) throws ContextException {
    final long due = when.getTime();
    final boolean immediate = due <= ctime + _immediateInterval;
    final boolean nearFuture = !immediate && (due <= ctime + _nearFutureInterval);

    JobDAO job;
    try {
        if (immediate) {
            // If we have too many jobs in the queue, we don't allow any new ones
            if (_outstandingJobs.size() > _todoLimit) {
                __log.error("The execution queue is backed up, the engine can't keep up with the load. Either "
                        + "increase the queue size or regulate the flow.");
                return null;
            }
            job = insertJob(transacted, jobDetails, due, _nodeId, true, true);
            __log.debug("scheduled immediate job: " + job.getJobId());
        } else {
            // Near-future jobs are assigned to this node, which makes it very unlikely that two
            // nodes try to process the same instance; far-future jobs get their node later.
            final String owner = nearFuture ? _nodeId : null;
            job = insertJob(transacted, jobDetails, due, owner, false, false);
            final String label = nearFuture ? "scheduled near-future job: " : "scheduled far-future job: ";
            __log.debug(label + job.getJobId());
        }
    } catch (DatabaseException dbe) {
        __log.error("Database error.", dbe);
        throw new ContextException("Database error.", dbe);
    }
    return job.getJobId();
}
/**
 * Creates a persisted job and inserts it into the database.
 *
 * @param transacted    whether the job is transacted
 * @param jobDetails    the job payload
 * @param scheduledDate the due time, in milliseconds
 * @param nodeID        the node to assign the job to; may be {@code null}
 * @param loaded        passed to the DAO insert as the "loaded" flag
 * @param enqueue       whether to enqueue the job in memory once the transaction commits
 * @return the created job
 * @throws ContextException  if the DAO reports that the insert failed
 * @throws DatabaseException on database errors
 */
private JobDAO insertJob(final boolean transacted, final JobDetails jobDetails, final long scheduledDate, final String nodeID,
        final boolean loaded, final boolean enqueue) throws ContextException, DatabaseException {
    final SchedulerDAOConnection connection = _dbcf.getConnection();
    final JobDAO created = connection.createJob(transacted, jobDetails, true, scheduledDate);

    final boolean inserted = connection.insertJob(created, nodeID, loaded);
    if (!inserted) {
        final String msg = String.format("Database insert failed. jobId %s nodeId %s", created.getJobId(), nodeID);
        __log.error(msg);
        throw new ContextException(msg);
    }

    if (enqueue) {
        addTodoOnCommit(created);
    }
    return created;
}
/**
 * Schedules a non-persisted job to run now.
 *
 * @param transacted whether the job is transacted
 * @param jobDetail  the job payload
 * @return the string form of the created job
 */
public String scheduleVolatileJob(boolean transacted, JobDetails jobDetail) throws ContextException {
    // A null date means "now".
    final Date runNow = null;
    return scheduleVolatileJob(transacted, jobDetail, runNow);
}
/**
 * Schedules a non-persisted job. The job is created through the DAO connection but never
 * inserted; it is enqueued in memory once the current transaction commits.
 *
 * @param transacted whether the job is transacted
 * @param jobDetail  the job payload
 * @param when       when the job should run; {@code null} means now
 * @return the string form ({@code toString()}) of the created job
 */
public String scheduleVolatileJob(boolean transacted, JobDetails jobDetail, Date when) throws ContextException {
    final long now = System.currentTimeMillis();
    final Date dueDate = (when != null) ? when : new Date(now);

    final SchedulerDAOConnection connection = _dbcf.getConnection();
    final JobDAO volatileJob = connection.createJob(transacted, jobDetail, false, dueDate.getTime());
    addTodoOnCommit(volatileJob);
    return volatileJob.toString();
}
/**
 * Sets the processor that handles regular (non polled-runnable) jobs.
 *
 * @param processor the job processor
 */
public void setJobProcessor(JobProcessor processor) throws ContextException {
    this._jobProcessor = processor;
}
/**
 * Returns the nodes among which jobs are distributed.
 *
 * @return the list given to {@link #setNodeList(List)}, or, when that list is {@code null} or
 *         empty, the default list that contains only this node
 */
public List<String> getNodeList() {
    final List<String> configured = this._nodeList;
    final boolean hasNodes = configured != null && configured.size() != 0;
    return hasNodes ? configured : _defaultNodeList;
}
/**
 * Replaces the list of nodes among which jobs are distributed.
 *
 * @param nodeList the node identifiers; {@code null} or empty makes {@link #getNodeList()}
 *                 fall back to the default list
 */
public void setNodeList(List<String> nodeList) {
    _nodeList = nodeList;
}
/**
 * Stops the scheduler and drops its references to the job processor, the transaction manager
 * and the scheduler thread.
 */
public void shutdown() {
    stop();
    // Release collaborators; transactional entry points report a missing manager afterwards.
    this._jobProcessor = null;
    this._txm = null;
    this._todo = null;
}
/**
 * Starts the scheduler; does nothing if it is already running. Clears any leftover periodic
 * tasks and bookkeeping, schedules the first load of immediate jobs for now and the first
 * upgrade at an offset derived from the immediate interval, then starts the scheduler thread.
 */
public synchronized void start() {
    if (_running) {
        return;
    }

    // Fall back to a cached thread pool when no executor was injected.
    if (_exec == null) {
        _exec = Executors.newCachedThreadPool();
    }

    // Begin from a clean slate.
    _todo.clearTasks(UpgradeJobsTask.class);
    _todo.clearTasks(LoadImmediateTask.class);
    _processedSinceLastLoadTask.clear();
    _outstandingJobs.clear();
    _retryJobList.clear();

    final long startedAt = System.currentTimeMillis();

    // Load immediate jobs right away ...
    final LoadImmediateTask firstLoad = new LoadImmediateTask(startedAt);
    _todo.enqueue(firstLoad);

    // ... and run the first upgrade a little later.
    final long firstUpgradeAt = startedAt + randomMean(_immediateInterval);
    _todo.enqueue(new UpgradeJobsTask(firstUpgradeAt));

    _todo.start();
    _running = true;
}
/**
 * Returns a random duration whose expected value is {@code mean}, drawn uniformly from
 * {@code [mean/2, mean/2 + mean)}.
 *
 * @param mean the desired average value
 * @return a random value around {@code mean}
 */
private long randomMean(long mean) {
    // The cast must apply to the product: casting nextDouble() alone (a value in [0,1)) always
    // yields 0 and removes all randomness.
    return (long) (_random.nextDouble() * mean) + (mean / 2);
}
/**
 * Stops the scheduler; does nothing if it is not running. Stops the scheduler thread and clears
 * the periodic tasks and the in-memory bookkeeping. The executor is deliberately left running.
 */
public synchronized void stop() {
    if (!_running) {
        return;
    }

    _todo.stop();

    // Drop the periodic tasks and forget every job tracked in memory.
    _todo.clearTasks(UpgradeJobsTask.class);
    _todo.clearTasks(LoadImmediateTask.class);
    _processedSinceLastLoadTask.clear();
    _outstandingJobs.clear();
    _retryJobList.clear();

    // The executor is not shut down here: that is not the right way to do it and will be
    // fixed by ODE-595. A graceful shutdown would make new submits throw
    // RejectedExecutionException.

    _running = false;
}
/**
* This is the class for delegating job to jobProcessor, also introduced retry mechanism here.
* @author jeffyu
*
*/
private class RunJobCallable implements Callable<Void> {
final JobProcessor processor;
final JobDAO job;
RunJobCallable(JobDAO jobDao, JobProcessor processor) {
this.job = jobDao;
this.processor = processor;
}
public Void call() throws Exception {
try {
final Scheduler.JobInfo jobInfo = new Scheduler.JobInfo(job.getJobId(), job.getDetails(), job.getDetails().getRetryCount());
if (job.isTransacted()) {
processInTransactionContext(jobInfo);
} else {
processor.onScheduledJob(jobInfo);
}
return null;
} finally {
// the order of these 2 actions is crucial to avoid a race condition.
// if the transaction failed, and has retry mechanism, we will not put it to avoid being ignore.
if (_retryJobList.get(job.getJobId()) == null ) {
_processedSinceLastLoadTask.put(job.getJobId(), job.getScheduledDate());
} else {
_retryJobList.remove(job.getJobId());
}
_outstandingJobs.remove(job.getJobId());
}
}
private void processInTransactionContext(final Scheduler.JobInfo jobInfo) throws Exception {
final boolean[] needRetry = new boolean[]{true};
try {
execTransaction(new Callable<Void>() {
public Void call() throws ContextException, Exception {
SchedulerDAOConnection conn = _dbcf.getConnection();
if (job.isPersisted()) {
if (!conn.deleteJob(job.getJobId(), _nodeId)) {
throw new JobNoLongerInDbException(job.getJobId(), _nodeId);
}
}
try {
processor.onScheduledJob(jobInfo);
// If the job is a "runnable" job, schedule the next job occurence
if (job.getDetails().getDetailsExt().get("runnable") != null &&
!"COMPLETED".equals(String.valueOf(jobInfo.jobDetail.getDetailsExt().get("runnable_status")))) {
// the runnable is still in progress, schedule checker to 10 mins later
if (_pollIntervalForPolledRunnable < 0) {
if (__log.isWarnEnabled())
__log.warn("The poll interval for polled runnables is negative; setting it to 1000ms");
_pollIntervalForPolledRunnable = 1000;
}
long schedDate = System.currentTimeMillis() + _pollIntervalForPolledRunnable;
job.setScheduledDate(schedDate);
conn.insertJob(job, _nodeId, false);
}
} catch (JobProcessorException jpe) {
if (!jpe.retry) {
needRetry[0] = false;
}
// Let execTransaction know that shit happened.
throw jpe;
}
return null;
}
});
} catch (JobNoLongerInDbException jde) {
// This may happen if two node try to do the same job... we try to avoid
// it the synchronization is a best-effort but not perfect.
__log.debug("job no longer in db forced rollback: "+job);
} catch (final Exception ex) {
__log.error("Error while processing a "+(job.isPersisted()?"":"non-")+"persisted job"+(needRetry[0] && job.isPersisted()?": ":", no retry: ")+job, ex);
// We only get here if the above execTransaction fails, so that transaction got
// rollbacked already
if (job.isPersisted()) {
try {
execTransaction(new Callable<Void>() {
public Void call() throws Exception {
retryJob(needRetry);
return null;
}
});
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
private void retryJob(final boolean[] needRetry) throws DatabaseException {
SchedulerDAOConnection conn = _dbcf.getConnection();
int retry = job.getDetails().getRetryCount() + 1;
if (!needRetry[0] || retry > 10) {
conn.deleteJob(job.getJobId(), _nodeId);
if (retry > 10) {
__log.error("Error while processing job after 10 retries, no more retries:" + job);
}
} else {
job.getDetails().setRetryCount(retry);
long delay = (long)(Math.pow(5, retry));
long scheddate = System.currentTimeMillis() + delay*1000;
job.setScheduled(false);
job.setScheduledDate(scheddate);
conn.updateJob(job);
_retryJobList.put(job.getJobId(), new Boolean(true));
__log.error("Error while processing job, retrying in " + delay + "s, the job is " + job);
}
}
}
/**
 * Submits a job to the executor, to be handled by the regular job processor.
 *
 * @param jobDao the job to run
 **/
protected void runJob(final JobDAO jobDao) {
    final Callable<Void> work = new RunJobCallable(jobDao, _jobProcessor);
    _exec.submit(work);
}
/**
 * Submits a "polled runnable" job to the executor, to be handled by the polled-runnable
 * processor. The runnable itself is not persistent; the poller job is, and it wakes up at a
 * fixed interval to check on the runnable:
 * <ul>
 * <li>1. Runnable being scheduled: the persistent poller job dispatches the runnable to a
 *     runnable delegate thread and schedules itself for a later time.</li>
 * <li>2. Runnable running: the poller job re-schedules itself each time it sees that the
 *     runnable has not completed.</li>
 * <li>3. Runnable failed: the poller job passes the runnable's exception down, and the
 *     standard scheduler retries happen.</li>
 * <li>4. Runnable completed: the persistent poller does not re-schedule itself.</li>
 * <li>5. System powered off and restarted: the poller job does not know the status of the
 *     runnable; this is handled just like case #1.</li>
 * </ul>
 * <p/>
 * The poller job is re-scheduled at least once. Because the runnable's state is not persisted
 * and the same runnable may be tried again after a system failure, a runnable used with this
 * polling should be repeatable.
 *
 * @param jobDao the poller job to run
 */
protected void runPolledRunnable(final JobDAO jobDao) {
    final Callable<Void> work = new RunJobCallable(jobDao, _polledRunnableProcessor);
    _exec.submit(work);
}
/**
 * Arranges for {@code job} to be enqueued in memory once the current transaction completes
 * successfully; nothing happens if the transaction does not succeed.
 *
 * @param job the job to enqueue after completion
 */
private void addTodoOnCommit(final JobDAO job) {
    final Synchronizer enqueueAfterCommit = new Synchronizer() {
        public void beforeCompletion() {
            // nothing to do before completion
        }

        public void afterCompletion(boolean success) {
            if (!success) {
                return;
            }
            enqueue(job);
        }
    };
    registerSynchronizer(enqueueAfterCommit);
}
/**
 * Tells whether the calling thread currently has a transaction.
 *
 * @return {@code true} if the transaction manager reports a transaction whose status is not
 *         {@code STATUS_NO_TRANSACTION}
 * @throws ContextException if no transaction manager is available or the transaction status
 *                          cannot be obtained (the underlying exception is kept as the cause)
 */
public boolean isTransacted() {
    final TransactionManager txm = _txm;
    if (txm == null) {
        throw new ContextException("Cannot locate the transaction manager; the server might be shutting down.");
    }
    try {
        final Transaction tx = txm.getTransaction();
        return (tx != null && tx.getStatus() != Status.STATUS_NO_TRANSACTION);
    } catch (SystemException e) {
        // Keep the root cause so the failure can be diagnosed.
        throw new ContextException("Internal Error: Could not obtain transaction status.", e);
    }
}
/**
 * Dispatches a task that has come due. Job tasks go to {@code runPolledRunnable} when the
 * job's extended details contain a {@code "runnable"} entry and to {@code runJob} otherwise;
 * scheduler tasks are run on the executor with their exceptions logged. Any other kind of
 * task is ignored.
 *
 * @param task the task to run
 */
public void runTask(final Task task) {
    if (task instanceof JobDAOTask) {
        final JobDAO dao = ((JobDAOTask) task).getJobDAO();
        final boolean polledRunnable = dao.getDetails().getDetailsExt().get("runnable") != null;
        if (polledRunnable) {
            runPolledRunnable(dao);
        } else {
            runJob(dao);
        }
        return;
    }

    if (!(task instanceof SchedulerTask)) {
        return;
    }

    final SchedulerTask schedulerTask = (SchedulerTask) task;
    _exec.submit(new Callable<Void>() {
        public Void call() throws Exception {
            try {
                schedulerTask.run();
            } catch (Exception ex) {
                __log.error("Error during SchedulerTask execution", ex);
            }
            return null;
        }
    });
}
/**
* Loads a batch of jobs that are due within the immediate interval from the database and
* enqueues them for execution.
*
* Nothing is loaded when more than half of the todo limit is already outstanding, or when no
* capacity is left. The batch size is the smaller of (immediate interval x transactions per
* second) and the remaining capacity. After a successful load the "processed since last load"
* and retry bookkeeping maps are cleared.
*
* @return true if the load succeeded or was skipped, false if an exception occurred
*/
boolean doLoadImmediate() {
__log.debug("LOAD IMMEDIATE started");
// don't load anything if we're already half-full; we've got plenty to do already
if (_outstandingJobs.size() > _todoLimit/2) {
return true;
}
List<JobDAO> jobs;
try {
// don't load more than we can chew
final int batch = Math.min((int) (_immediateInterval * _tps / 1000), _todoLimit-_outstandingJobs.size());
// jobs might have been enqueued by #addTodoOnCommit meanwhile
if (batch<=0) {
if (__log.isDebugEnabled()) __log.debug("Max capacity reached: "+_outstandingJobs.size()+" jobs dispacthed i.e. queued or being executed");
return true;
}
if (__log.isDebugEnabled()) __log.debug("loading "+batch+" jobs from db");
// Fetch, in a transaction, up to `batch` jobs of this node that are due before now + immediate interval.
jobs = execTransaction(new Callable<List<JobDAO>>() {
public List<JobDAO> call() throws ContextException, DatabaseException {
SchedulerDAOConnection conn = _dbcf.getConnection();
return conn.dequeueImmediate(_nodeId, System.currentTimeMillis() + _immediateInterval, batch);
}
});
if (__log.isDebugEnabled()) __log.debug("loaded "+jobs.size()+" jobs from db");
// Jobs scheduled at or before this instant are counted as running late.
long delayedTime = System.currentTimeMillis() - _warningDelay;
int delayedCount = 0;
boolean runningLate;
AbsoluteTimeDateFormat f = new AbsoluteTimeDateFormat();
for (JobDAO j : jobs) {
// jobs might have been enqueued by #addTodoOnCommit meanwhile
if (_outstandingJobs.size() >= _todoLimit){
if (__log.isDebugEnabled()) __log.debug("Max capacity reached: "+_outstandingJobs.size()+" jobs dispacthed i.e. queued or being executed");
break;
}
runningLate = (j.getScheduledDate() <= delayedTime);
if (runningLate) {
delayedCount++;
}
if (__log.isDebugEnabled())
__log.debug("todo.enqueue job from db: " + j.getJobId().trim() + " for " + j.getScheduledDate() + "(" + f.format(j.getScheduledDate())+") "+(runningLate?" delayed=true":""));
enqueue(j);
}
// Warn once per load, not once per late job.
if (delayedCount > 0) {
__log.warn("Dispatching jobs with more than "+(_warningDelay/60000)+" minutes delay. Either the server was down for some time or the job load is greater than available capacity");
}
// clear only if the batch succeeded
_processedSinceLastLoadTask.clear();
_retryJobList.clear();
return true;
} catch (Exception ex) {
__log.error("Error loading immediate jobs from database.", ex);
return false;
} finally {
__log.debug("LOAD IMMEDIATE complete");
}
}
/**
 * Registers the job as outstanding and dispatches it: a job that is already due is run right
 * away, any other job is put on the todo queue. A job that was processed since the last load,
 * or that is already outstanding, is skipped.
 *
 * @param job the job to enqueue
 */
private void enqueue(JobDAO job) {
    // Already handled since the last load: do not run it a second time.
    if (_processedSinceLastLoadTask.get(job.getJobId()) != null) {
        if (__log.isDebugEnabled()) __log.debug("Job "+job.getJobId()+" is being processed (processed since last load)");
        return;
    }

    // Claim the job; a previous mapping means it is already queued or being executed.
    final Long previous = _outstandingJobs.putIfAbsent(job.getJobId(), job.getScheduledDate());
    if (previous != null) {
        if (__log.isDebugEnabled()) __log.debug("Job "+job.getJobId()+" is being processed (outstanding job)");
        return;
    }

    final boolean due = job.getScheduledDate() <= System.currentTimeMillis();
    if (due) {
        runJob(job);
    } else {
        _todo.enqueue(new JobDAOTask(job));
    }
}
/**
 * Assigns jobs that are due within the near-future interval to nodes, using the database only.
 * The distribution assumes that the trailing digits of the scheduled times are uniformly
 * distributed, so that taking the time modulo the number of nodes spreads the jobs evenly;
 * each node's share is assigned with one update call.
 *
 * @return {@code true} if the upgrade transaction succeeded, {@code false} if it failed
 */
boolean doUpgrade() {
    __log.debug("UPGRADE started");
    final long maxtime = System.currentTimeMillis() + _nearFutureInterval;

    final Callable<Boolean> assignToNodes = new Callable<Boolean>() {
        public Boolean call() throws ContextException, DatabaseException {
            final SchedulerDAOConnection connection = _dbcf.getConnection();
            final int nodeCount = getNodeList().size();
            int slot = 0;
            while (slot < nodeCount) {
                final String node = getNodeList().get(slot);
                connection.updateAssignToNode(node, slot, nodeCount, maxtime);
                ++slot;
            }
            return true;
        }
    };

    try {
        return execTransaction(assignToNodes);
    } catch (Exception ex) {
        __log.error("Database error upgrading jobs.", ex);
        return false;
    } finally {
        __log.debug("UPGRADE complete");
    }
}
/**
 * Base class of the scheduler's own housekeeping tasks: a {@link Task} that is also a
 * {@link Runnable}, so that {@code runTask} can execute it directly.
 */
private abstract class SchedulerTask extends Task implements Runnable {
    /**
     * @param dueAt the time at which the task should fire, in milliseconds
     */
    SchedulerTask(long dueAt) {
        super(dueAt);
    }
}
/**
 * Periodic task that loads immediate jobs and then re-schedules itself: after 90% of the
 * immediate interval when the load succeeded, after one second when it failed.
 */
private class LoadImmediateTask extends SchedulerTask {
    LoadImmediateTask(long schedDate) {
        super(schedDate);
    }

    public void run() {
        boolean loaded = false;
        try {
            loaded = doLoadImmediate();
        } finally {
            // Always queue the next run, even if the load threw.
            final long pause = loaded ? (long) (_immediateInterval * .90) : 1000;
            _todo.enqueue(new LoadImmediateTask(System.currentTimeMillis() + pause));
        }
    }
}
/**
 * Upgrade jobs from far future to immediate future (basically, assign them to a node).
 * The task re-schedules itself: after half of the near-future interval when the upgrade
 * succeeded, after one second when it failed.
 *
 * @author mszefler
 */
private class UpgradeJobsTask extends SchedulerTask {
    UpgradeJobsTask(long schedDate) {
        super(schedDate);
    }

    public void run() {
        // Take one snapshot of the clock and of the next-upgrade time and use it for the
        // decision, the log message and the re-enqueue, so that the three always agree.
        final long ctime = System.currentTimeMillis();
        final long ntime = _nextUpgrade.get();
        __log.debug("UPGRADE task for " + getScheduledDate() + " fired at " + ctime);

        // We could be too early, this can happen if upgrade gets delayed due to another
        // node
        if (ntime > ctime) {
            __log.debug("UPGRADE skipped -- wait another " + (ntime - ctime) + "ms");
            _todo.enqueue(new UpgradeJobsTask(ntime));
            return;
        }

        boolean success = false;
        try {
            success = doUpgrade();
        } finally {
            // Always queue the next run, even if the upgrade threw.
            long future = System.currentTimeMillis() + (success ? (long) (_nearFutureInterval * .50) : 1000);
            _nextUpgrade.set(future);
            _todo.enqueue(new UpgradeJobsTask(future));
            __log.debug("UPGRADE completed, success = " + success + "; next time in " + (future - ctime) + "ms");
        }
    }
}
/**
 * Tells whether this node acts as the coordinator. For now every node is assumed to be a
 * coordinator.
 *
 * @return always {@code true}
 */
public boolean amICoordinator() {
    final boolean everyNodeCoordinates = true;
    return everyNodeCoordinates;
}
}