/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.ode.scheduler.simple; import java.util.*; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArraySet; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicLong; import javax.transaction.Status; import javax.transaction.Synchronization; import javax.transaction.SystemException; import javax.transaction.Transaction; import javax.transaction.TransactionManager; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.ode.bpel.iapi.ContextException; import org.apache.ode.bpel.iapi.Scheduler; /** * A reliable and relatively simple scheduler that uses a database to persist information about * scheduled tasks. * * The challange is to achieve high performance in a small memory footprint without loss of reliability * while supporting distributed/clustered configurations. * * The design is based around three time horizons: "immediate", "near future", and "everything else". * Immediate jobs (i.e. jobs that are about to be up) are written to the database and kept in * an in-memory priority queue. When they execute, they are removed from the database. Near future * jobs are placed in the database and assigned to the current node, however they are not stored in * memory. Periodically jobs are "upgraded" from near-future to immediate status, at which point they * get loaded into memory. Jobs that are further out in time, are placed in the database without a * node identifer; when they are ready to be "upgraded" to near-future jobs they are assigned to one * of the known live nodes. Recovery is rather straighforward, with stale node identifiers being * reassigned to known good nodes. * * @author Maciej Szefler ( m s z e f l e r @ g m a i l . c o m ) * */ public class SimpleScheduler implements Scheduler, TaskRunner { private static final Log __log = LogFactory.getLog(SimpleScheduler.class); /** * Jobs scheduled with a time that is between [now, now+immediateInterval] will be assigned to the current node, and placed * directly on the todo queue. */ long _immediateInterval = 30000; /** * Jobs sccheduled with a time that is between (now+immediateInterval,now+nearFutureInterval) will be assigned to the current * node, but will not be placed on the todo queue (the promoter will pick them up). */ long _nearFutureInterval = 10 * 60 * 1000; /** 10s of no communication and you are deemed dead. */ long _staleInterval = 10000; TransactionManager _txm; ExecutorService _exec; String _nodeId; /** Maximum number of jobs in the "near future" / todo queue. */ volatile int _todoLimit = 200; /** The object that actually handles the jobs. */ volatile JobProcessor _jobProcessor; private SchedulerThread _todo; private DatabaseDelegate _db; /** All the nodes we know about */ private CopyOnWriteArraySet<String> _knownNodes = new CopyOnWriteArraySet<String>(); /** When we last heard from our nodes. */ private ConcurrentHashMap<String, Long> _lastHeartBeat = new ConcurrentHashMap<String, Long>(); private boolean _running; /** Time for next upgrade. */ private AtomicLong _nextUpgrade = new AtomicLong(); /** Time for next job load */ private AtomicLong _nextScheduleImmediate = new AtomicLong(); private Random _random = new Random(); public SimpleScheduler(String nodeId, DatabaseDelegate del) { _nodeId = nodeId; _db = del; _todo = new SchedulerThread(this); } public void setNodeId(String nodeId) { _nodeId = nodeId; } public void setStaleInterval(long staleInterval) { _staleInterval = staleInterval; } public void setImmediateInterval(long immediateInterval) { _immediateInterval = immediateInterval; } public void setNearFutureInterval(long nearFutureInterval) { _nearFutureInterval = nearFutureInterval; } public void setTransactionManager(TransactionManager txm) { _txm = txm; } public void setDatabaseDelegate(DatabaseDelegate dbd) { _db = dbd; } public void setExecutorService(ExecutorService executorService) { _exec = executorService; } public void cancelJob(String jobId) throws ContextException { // TODO: maybe later, not really necessary. } public <T> Future<T> execIsolatedTransaction(final Callable<T> transaction) throws Exception, ContextException { return _exec.submit(new Callable<T>() { public T call() throws Exception { try { return execTransaction(transaction); } catch (Exception e) { __log.error("An exception occured while executing an isolated transaction, " + "the transaction is going to be abandoned.", e); return null; } } }); } public <T> T execTransaction(Callable<T> transaction) throws Exception, ContextException { try { _txm.begin(); } catch (Exception ex) { String errmsg = "Internal Error, could not begin transaction."; throw new ContextException(errmsg, ex); } boolean success = false; try { T retval = transaction.call(); success = true; return retval; } catch (Exception ex) { throw ex; } finally { if (success) _txm.commit(); else _txm.rollback(); } } public void registerSynchronizer(final Synchronizer synch) throws ContextException { try { _txm.getTransaction().registerSynchronization(new Synchronization() { public void beforeCompletion() { synch.beforeCompletion(); } public void afterCompletion(int status) { synch.afterCompletion(status == Status.STATUS_COMMITTED); } }); } catch (Exception e) { throw new ContextException("Unable to register synchronizer.", e); } } public String schedulePersistedJob(final Map<String, Object> jobDetail, Date when) throws ContextException { long ctime = System.currentTimeMillis(); if (when == null) when = new Date(ctime); if (__log.isDebugEnabled()) __log.debug("scheduling " + jobDetail + " for " + when); boolean immediate = when.getTime() <= ctime + _immediateInterval; boolean nearfuture = !immediate && when.getTime() <= ctime + _nearFutureInterval; Job job = new Job(when.getTime(), true, jobDetail); try { if (immediate) { // If we have too many jobs in the queue, we don't allow any new ones if (_todo.size() > _todoLimit) throw new ContextException("The execution queue is backed up... Forcing ContextException"); // Immediate scheduling means we put it in the DB for safe keeping _db.insertJob(job, _nodeId, true); // And add it to our todo list . addTodoOnCommit(job); __log.debug("scheduled immediate job: " + job.jobId); } else if (nearfuture) { // Near future, assign the job to ourselves (why? -- this makes it very unlikely that we // would get two nodes trying to process the same instance, which causes unsightly rollbacks). _db.insertJob(job, _nodeId, false); __log.debug("scheduled near-future job: " + job.jobId); } else /* far future */{ // Not the near future, we don't assign a node-id, we'll assign it later. _db.insertJob(job, null, false); __log.debug("scheduled far-future job: " + job.jobId); } } catch (DatabaseException dbe) { __log.error("Database error.", dbe); throw new ContextException("Database error.", dbe); } return job.jobId; } public String scheduleVolatileJob(boolean transacted, Map<String, Object> jobDetail) throws ContextException { Job job = new Job(System.currentTimeMillis(), transacted, jobDetail); job.persisted = false; addTodoOnCommit(job); return job.toString(); } public void setJobProcessor(JobProcessor processor) throws ContextException { _jobProcessor = processor; } public void shutdown() { stop(); _jobProcessor = null; _txm = null; _todo = null; } public synchronized void start() { if (_running) return; if (_exec == null) _exec = Executors.newCachedThreadPool(); _todo.clearTasks(UpgradeJobsTask.class); _todo.clearTasks(LoadImmediateTask.class); _todo.clearTasks(CheckStaleNodes.class); _knownNodes.clear(); try { execTransaction(new Callable<Void>() { public Void call() throws Exception { _knownNodes.addAll(_db.getNodeIds()); return null; } }); } catch (Exception ex) { __log.error("Error retrieving node list.", ex); throw new ContextException("Error retrieving node list.", ex); } // Pretend we got a heartbeat... for (String s : _knownNodes) _lastHeartBeat.put(s, System.currentTimeMillis()); // schedule immediate job loading for now! _todo.enqueue(new LoadImmediateTask(System.currentTimeMillis())); // schedule check for stale nodes, make it random so that the nodes don't overlap. _todo.enqueue(new CheckStaleNodes(System.currentTimeMillis() + (long) (_random.nextDouble() * _staleInterval))); // do the upgrade sometime (random) in the immediate interval. _todo.enqueue(new UpgradeJobsTask(System.currentTimeMillis() + (long) (_random.nextDouble() * _immediateInterval))); _todo.start(); _running = true; } public synchronized void stop() { if (!_running) return; _todo.stop(); _todo.clearTasks(UpgradeJobsTask.class); _todo.clearTasks(LoadImmediateTask.class); _todo.clearTasks(CheckStaleNodes.class); _running = false; } /** * Run a job in the current thread. * * @param job * job to run. */ protected void runJob(final Job job) { final Scheduler.JobInfo jobInfo = new Scheduler.JobInfo(job.jobId, job.detail, (Integer)(job.detail.get("retry") != null ? job.detail.get("retry") : 0)); _exec.submit(new Callable<Void>() { public Void call() throws Exception { if (job.transacted) { try { execTransaction(new Callable<Void>() { public Void call() throws Exception { _jobProcessor.onScheduledJob(jobInfo); if (job.persisted) if (!_db.deleteJob(job.jobId, _nodeId)) throw new JobNoLongerInDbException(job.jobId,_nodeId); return null; } }); } catch (JobNoLongerInDbException jde) { // This may happen if two node try to do the same job... we try to avoid // it the synchronization is a best-effort but not perfect. __log.debug("job no longer in db forced rollback."); } catch (JobProcessorException jpe) { if (jpe.retry) { __log.error("Error while processing transaction, retrying.", jpe); doRetry(job); } else { __log.error("Error while processing transaction, no retry.", jpe); } } catch (Exception ex) { __log.error("Error while executing transaction", ex); } } else { _jobProcessor.onScheduledJob(jobInfo); } return null; } }); } private void addTodoOnCommit(final Job job) { registerSynchronizer(new Synchronizer() { public void afterCompletion(boolean success) { if (success) { _todo.enqueue(job); } } public void beforeCompletion() { } }); } public boolean isTransacted() { try { Transaction tx = _txm.getTransaction(); return (tx != null && tx.getStatus() != Status.STATUS_NO_TRANSACTION); } catch (SystemException e) { throw new ContextException("Internal Error: Could not obtain transaction status."); } } public void runTask(Task task) { if (task instanceof Job) runJob((Job) task); if (task instanceof SchedulerTask) ((SchedulerTask) task).run(); } public void updateHeartBeat(String nodeId) { if (nodeId == null) return; if (_nodeId.equals(nodeId)) return; _lastHeartBeat.put(nodeId, System.currentTimeMillis()); _knownNodes.add(nodeId); } boolean doLoadImmediate() { __log.debug("LOAD IMMEDIATE started"); List<Job> jobs; try { do { jobs = execTransaction(new Callable<List<Job>>() { public List<Job> call() throws Exception { return _db.dequeueImmediate(_nodeId, System.currentTimeMillis() + _immediateInterval, 10); } }); for (Job j : jobs) { if (__log.isDebugEnabled()) __log.debug("todo.enqueue job from db: " + j.jobId + " for " + j.schedDate); _todo.enqueue(j); } } while (jobs.size() == 10); return true; } catch (Exception ex) { __log.error("Error loading immediate jobs from database.", ex); return false; } finally { __log.debug("LOAD IMMEDIATE complete"); } } boolean doUpgrade() { __log.debug("UPGRADE started"); final ArrayList<String> knownNodes = new ArrayList<String>(_knownNodes); // Don't forget about self. knownNodes.add(_nodeId); Collections.sort(knownNodes); // We're going to try to upgrade near future jobs using the db only. // We assume that the distribution of the trailing digits in the // scheduled time are uniformly distributed, and use modular division // of the time by the number of nodes to create the node assignment. // This can be done in a single update statement. final long maxtime = System.currentTimeMillis() + _nearFutureInterval; try { return execTransaction(new Callable<Boolean>() { public Boolean call() throws Exception { int numNodes = knownNodes.size(); for (int i = 0; i < numNodes; ++i) { String node = knownNodes.get(i); _db.updateAssignToNode(node, i, numNodes, maxtime); } return true; } }); } catch (Exception ex) { __log.error("Database error upgrading jobs.", ex); return false; } finally { __log.debug("UPGRADE complete"); } } /** * Re-assign stale node's jobs to self. * @param nodeId */ void recoverStaleNode(final String nodeId) { __log.debug("recovering stale node " + nodeId); try { int numrows = execTransaction(new Callable<Integer>() { public Integer call() throws Exception { return _db.updateReassign(nodeId, _nodeId); } }); __log.debug("reassigned " + numrows + " jobs to self. "); // We can now forget about this node, if we see it again, it will be // "new to us" _knownNodes.remove(nodeId); _lastHeartBeat.remove(nodeId); // Force a load-immediate to catch anything new from the recovered node. doLoadImmediate(); } catch (Exception ex) { __log.error("Database error reassigning node.", ex); } finally { __log.debug("node recovery complete"); } } private void doRetry(Job job) throws DatabaseException { Calendar retryTime = Calendar.getInstance(); retryTime.add(Calendar.SECOND, 2); job.detail.put("retry", job.detail.get("retry") != null ? (((Integer)job.detail.get("retry")) + 1) : 1); Job jobRetry = new Job(retryTime.getTime().getTime(), true, job.detail); _db.insertJob(jobRetry, _nodeId, false); } private abstract class SchedulerTask extends Task implements Runnable { SchedulerTask(long schedDate) { super(schedDate); } } private class LoadImmediateTask extends SchedulerTask { LoadImmediateTask(long schedDate) { super(schedDate); } public void run() { boolean success = false; try { success = doLoadImmediate(); } finally { if (success) _todo.enqueue(new LoadImmediateTask(System.currentTimeMillis() + (long) (_immediateInterval * .75))); else _todo.enqueue(new LoadImmediateTask(System.currentTimeMillis() + 100)); } } } /** * Upgrade jobs from far future to immediate future (basically, assign them to a node). * @author mszefler * */ private class UpgradeJobsTask extends SchedulerTask { UpgradeJobsTask(long schedDate) { super(schedDate); } public void run() { long ctime = System.currentTimeMillis(); long ntime = _nextUpgrade.get(); __log.debug("UPGRADE task for " + schedDate + " fired at " + ctime); // We could be too early, this can happen if upgrade gets delayed due to another // node if (_nextUpgrade.get() > System.currentTimeMillis()) { __log.debug("UPGRADE skipped -- wait another " + (ntime - ctime) + "ms"); _todo.enqueue(new UpgradeJobsTask(ntime)); return; } boolean success = false; try { success = doUpgrade(); } finally { long future = System.currentTimeMillis() + (success ? (long) (_nearFutureInterval * .50) : 100); _nextUpgrade.set(future); _todo.enqueue(new UpgradeJobsTask(future)); __log.debug("UPGRADE completed, success = " + success + "; next time in " + (future - ctime) + "ms"); } } } /** * Check if any of the nodes in our cluster are stale. */ private class CheckStaleNodes extends SchedulerTask { CheckStaleNodes(long schedDate) { super(schedDate); } public void run() { _todo.enqueue(new CheckStaleNodes(System.currentTimeMillis() + _staleInterval)); __log.debug("CHECK STALE NODES started"); for (String nodeId : _knownNodes) { Long lastSeen = _lastHeartBeat.get(nodeId); if (lastSeen == null || (System.currentTimeMillis() - lastSeen) > _staleInterval) recoverStaleNode(nodeId); } } } }