package org.zstack.core.job; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.dao.EmptyResultDataAccessException; import org.springframework.transaction.annotation.Transactional; import org.zstack.core.Platform; import org.zstack.core.cloudbus.CloudBus; import org.zstack.core.cloudbus.CloudBusEventListener; import org.zstack.core.cloudbus.EventSubscriberReceipt; import org.zstack.core.db.*; import org.zstack.core.errorcode.ErrorFacade; import org.zstack.header.errorcode.SysErrors; import org.zstack.core.thread.AsyncThread; import org.zstack.header.Component; import org.zstack.header.core.Completion; import org.zstack.header.core.NopeCompletion; import org.zstack.header.core.NopeReturnValueCompletion; import org.zstack.header.core.ReturnValueCompletion; import org.zstack.header.errorcode.ErrorCode; import org.zstack.header.exception.CloudRuntimeException; import org.zstack.header.managementnode.ManagementNodeChangeListener; import org.zstack.header.message.Event; import org.zstack.utils.Bucket; import org.zstack.utils.DebugUtils; import org.zstack.utils.JsonWrapper; import org.zstack.utils.Utils; import org.zstack.utils.logging.CLogger; import org.zstack.utils.serializable.SerializableHelper; import javax.persistence.Tuple; import javax.persistence.TypedQuery; import java.io.IOException; import java.sql.Timestamp; import java.util.*; /** */ public class JobQueueFacadeImpl2 implements JobQueueFacade, CloudBusEventListener, Component, ManagementNodeChangeListener { private static final CLogger logger = Utils.getLogger(JobQueueFacadeImpl2.class); private static final String LOCK_NAME = "JobQueueFacade.lock"; private static final String ORPHAN_JOB_LOCK_NAME = "JobQueueFacade.orphanJobLock"; private static final int LOCK_TIMEOUT = 60; private Map<Long, JobWrapper> wrappers = Collections.synchronizedMap(new HashMap<Long, JobWrapper>()); @Autowired private DatabaseFacade dbf; @Autowired private CloudBus bus; @Autowired private ErrorFacade errf; private volatile boolean stopped = false; private EventSubscriberReceipt unsubscriber; @Override public boolean handleEvent(Event e) { if (!(e instanceof JobEvent)) { return false; } JobEvent je = (JobEvent) e; JobWrapper jw = wrappers.get(je.getJobId()); if (jw == null) { return false; } if (je.isSuccess()) { Object ret = je.getReturnValue() != null ? je.getReturnValue().get() : null; jw.success(ret); } else { jw.fail(je.getErrorCode()); } return false; } @Override public boolean start() { unsubscriber = bus.subscribeEvent(this, new JobEvent()); stopped = false; return true; } @Override public boolean stop() { stopped = true; if (unsubscriber != null) { unsubscriber.unsubscribeAll(); } return true; } private void restartQueue(JobQueueVO qvo, String mgmtId) { SimpleQuery<JobQueueEntryVO> q = dbf.createQuery(JobQueueEntryVO.class); q.select(JobQueueEntryVO_.id, JobQueueEntryVO_.name); q.add(JobQueueEntryVO_.jobQueueId, SimpleQuery.Op.EQ, qvo.getId()); q.add(JobQueueEntryVO_.issuerManagementNodeId, SimpleQuery.Op.NULL); List<Tuple> ts = q.listTuple(); for (Tuple t : ts) { logger.debug(String.format("[Job Removed]: job[id:%s, name:%s] because its issuer management node[id:%s] became available", t.get(0), t.get(1), mgmtId)); dbf.removeByPrimaryKey((Long) t.get(0), JobQueueEntryVO.class); } q = dbf.createQuery(JobQueueEntryVO.class); q.add(JobQueueEntryVO_.state, SimpleQuery.Op.IN, JobState.Pending, JobState.Processing); q.add(JobQueueEntryVO_.jobQueueId, SimpleQuery.Op.EQ, qvo.getId()); q.orderBy(JobQueueEntryVO_.id, SimpleQuery.Od.ASC); long count = q.count(); if (count == 0) { logger.debug(String.format("[JobQueue Removed]: id:%s, no Pending or Processing job remaining in this queue, remove it", qvo.getId())); return; } List<JobQueueEntryVO> es = q.list(); for (JobQueueEntryVO e : es) { if (e.getState() == JobState.Processing && !e.isRestartable()) { dbf.remove(e); JobEvent evt = new JobEvent(); evt.setErrorCode(errf.instantiateErrorCode(SysErrors.MANAGEMENT_NODE_UNAVAILABLE_ERROR, String.format("management node[id:%s] becomes unavailable, job[name:%s, id:%s] is not restartable", mgmtId, e.getName(), e.getId()))); bus.publish(evt); logger.debug(String.format("[Job Removed]: job[id:%s, name:%s] because it's not restartable", e.getId(), e.getName())); continue; } logger.debug(String.format("[Job Restart]: job[id:%s, name:%s] in queue[id:%s] is restarting as its previous worker node[id:%s] became unavailable", e.getId(), e.getName(), qvo.getId(), mgmtId)); execute(qvo.getName(), qvo.getOwner(), e, new NopeReturnValueCompletion(), null); return; } } private void takeOverJobs(String mgmtId) { GLock lock = new GLock(ORPHAN_JOB_LOCK_NAME, LOCK_TIMEOUT); lock.lock(); try { logger.debug(String.format("management node[id:%s] starts taking over jobs of left management node[%s]", Platform.getManagementServerId(), mgmtId)); SimpleQuery<JobQueueVO> qq = dbf.createQuery(JobQueueVO.class); qq.add(JobQueueVO_.workerManagementNodeId, SimpleQuery.Op.NULL); List<JobQueueVO> queues = qq.list(); logger.debug(String.format("[Orphan Queue found]: management node is going to take over %s orphan queues", queues.size())); for (JobQueueVO queue : queues) { restartQueue(queue, mgmtId); } } finally { lock.unlock(); } } @Override public void nodeJoin(String nodeId) { } @Override public void nodeLeft(String nodeId) { takeOverJobs(nodeId); } @Override public void iAmDead(String nodeId) { } @Override public void iJoin(String nodeId) { } private interface JobWrapper { void run(); void success(Object ret); void fail(ErrorCode err); } public void execute(String queueName, String owner, Job job) { execute(queueName, owner, job, new NopeCompletion()); } private <T> void execute(final String queueName, final String owner, final JobQueueEntryVO entry, final ReturnValueCompletion<T> completion, final Class<? extends T> returnType) { new JobWrapper() { private Long myJobId; @Transactional private JobQueueVO saveJob() throws IOException { JobQueueVO ret = null; String sql = "select queue from JobQueueVO queue where queue.name = :queueName"; TypedQuery<JobQueueVO> q = dbf.getEntityManager().createQuery(sql, JobQueueVO.class); q.setParameter("queueName", queueName); JobQueueVO qvo = null; try { qvo = q.getSingleResult(); } catch (EmptyResultDataAccessException ne) { // no queue yet } if (qvo == null) { qvo = new JobQueueVO(); qvo.setName(queueName); qvo.setOwner(owner); qvo.setWorkerManagementNodeId(Platform.getManagementServerId()); dbf.getEntityManager().persist(qvo); dbf.getEntityManager().flush(); dbf.getEntityManager().refresh(qvo); logger.debug(String.format("[JobQueue created] id: %s, owner: %s, queue name: %s", qvo.getId(), owner, queueName)); ret = qvo; } else if (qvo.getWorkerManagementNodeId() == null) { qvo.setWorkerManagementNodeId(Platform.getManagementServerId()); dbf.getEntityManager().merge(qvo); ret = qvo; } entry.setJobQueueId(qvo.getId()); entry.setIssuerManagementNodeId(Platform.getManagementServerId()); entry.setState(JobState.Pending); JobQueueEntryVO ne = dbf.getEntityManager().merge(entry); dbf.getEntityManager().flush(); dbf.getEntityManager().refresh(ne); logger.debug(String.format("[Job added] job queue name: %s, job class name: %s, job id: %s", qvo.getName(), ne.getName(), ne.getId())); myJobId = ne.getId(); wrappers.put(myJobId, this); return ret; } private void jobFail(JobQueueEntryVO jvo, ErrorCode err) { jvo.setDoneDate(new Timestamp(new Date().getTime())); jvo.setState(JobState.Error); dbf.update(jvo); JobEvent evt = new JobEvent(); evt.setJobId(jvo.getId()); evt.setErrorCode(err); bus.publish(evt); } private void jobDone(JobQueueEntryVO jvo, Object ret) { jvo.setDoneDate(new Timestamp(new Date().getTime())); jvo.setState(JobState.Completed); dbf.update(jvo); JobEvent evt = new JobEvent(); evt.setJobId(jvo.getId()); if (ret != null) { evt.setReturnValue(JsonWrapper.wrap(ret)); } bus.publish(evt); } private JobQueueEntryVO findJob(JobQueueVO qvo) { SimpleQuery<JobQueueEntryVO> q = dbf.createQuery(JobQueueEntryVO.class); q.add(JobQueueEntryVO_.state, SimpleQuery.Op.EQ, JobState.Pending); q.add(JobQueueEntryVO_.jobQueueId, SimpleQuery.Op.EQ, qvo.getId()); q.setLimit(1); q.orderBy(JobQueueEntryVO_.id, SimpleQuery.Od.ASC); return q.find(); } private Bucket takeJob(final JobQueueVO qvo) { GLock lock = new GLock(LOCK_NAME, LOCK_TIMEOUT); lock.lock(); try { JobQueueEntryVO jobe = findJob(qvo); if (jobe == null) { // nothing to do, release queue dbf.remove(qvo); logger.debug(String.format("[JobQueue released, no pending task, delete the queue] last owner: %s, queue name: %s, queue id: %s", qvo.getOwner(), qvo.getName(), qvo.getId())); return null; } while (true) { try { JobContextObject ctx = SerializableHelper.readObject(jobe.getContext()); Job theJob = ctx.load(); jobe.setState(JobState.Processing); jobe = dbf.updateAndRefresh(jobe); return Bucket.newBucket(jobe, theJob); } catch (Exception e1) { String err = String.format("[Job de-serialize failed, the job will be marked as Error] queue name: %s, job id: %s, %s", qvo.getName(), jobe.getId(), e1.getMessage()); logger.warn(err, e1); jobFail(jobe, errf.stringToInternalError(err)); jobe = findJob(qvo); } } } finally { lock.unlock(); } } @AsyncThread private void process(final JobQueueVO qvo) { if (stopped) { logger.warn(String.format("[Job Facade Stopped]: stop processing job")); return; } Bucket ret = takeJob(qvo); if (ret == null) { return; } final JobQueueEntryVO e = ret.get(0); final Job job = ret.get(1); logger.debug(String.format("[Job Start] start executing job[id:%s, name:%s]", e.getId(), e.getName())); job.run(new ReturnValueCompletion<Object>(null) { @Override public void success(Object returnValue) { try { jobDone(e, returnValue); logger.debug(String.format("[Job Success] job[id:%s, name:%s] succeed", e.getId(), e.getName())); } catch (Throwable t){ logger.warn(String.format("unhandled exception happened when calling %s", job.getClass().getName()), t); jobFail(e, errf.stringToInternalError(t.getMessage())); } finally { process(qvo); } } @Override public void fail(ErrorCode errorCode) { try { jobFail(e, errorCode); logger.debug(String.format("[Job Failure] job[id:%s, name:%s] failed", e.getId(), e.getName())); } catch (Throwable t){ logger.warn(String.format("unhandled exception happened when calling %s", job.getClass().getName()), t); jobFail(e, errf.stringToInternalError(t.getMessage())); } finally { process(qvo); } } }); } @Override public void run() { if (stopped) { logger.warn(String.format("[Job Facade Stopped]: skip to run job[queueName:%s, owner:%s, name:%s]", queueName, owner, entry.getName())); return; } try { GLock lock = new GLock(LOCK_NAME, LOCK_TIMEOUT); JobQueueVO qvo = null; lock.lock(); try { qvo = saveJob(); } finally { lock.unlock(); } if (qvo != null) { process(qvo); } } catch (IOException e1) { throw new CloudRuntimeException(String.format("unable to serialize job: %s", entry.getName()), e1); } } @Override public void success(Object ret) { DebugUtils.Assert(myJobId!=null, "how can myJobId be null???"); wrappers.remove(myJobId); completion.success((T)ret); } @Override public void fail(ErrorCode err) { DebugUtils.Assert(myJobId!=null, "how can myJobId be null???"); wrappers.remove(myJobId); completion.fail(err); } }.run(); } @Override public <T> void execute(final String queueName, final String owner, final Job job, final ReturnValueCompletion<T> completion, final Class<? extends T> returnType) { try { JobQueueEntryVO e = new JobQueueEntryVO(); JobContextObject ctx = new JobContextObject(job); byte[] bits = SerializableHelper.writeObject(ctx); e.setContext(bits); e.setRestartable(job.getClass().isAnnotationPresent(RestartableJob.class)); e.setName(job.getClass().getName()); execute(queueName, owner, e, completion, returnType); } catch (IOException e1) { throw new CloudRuntimeException(e1); } } @Override public void execute(String queueName, String owner, Job job, final Completion completion) { execute(queueName, owner, job, new ReturnValueCompletion<Object>(completion) { @Override public void success(Object returnValue) { completion.success(); } @Override public void fail(ErrorCode errorCode) { completion.fail(errorCode); } }, null); } @Override public void deleteJobQueue(String queueName) { } @Override public void evictOwner(String owner) { } @Override public List<String> listAllQueue() { return null; } @Override public List<String> listQueue(String namePattern) { return null; } @Override public long getPendingJobNumber(String queueName) { return 0; } @Override public List<String> listQueueHasPendingJob() { return null; } @Override public boolean startQueueIfPendingJob(String queueName, String owner) { return false; } @Override public boolean startQueueIfPendingJob(String queueName, String owner, boolean newThread) { return false; } }