/** * Copyright (C) 2009 - present by OpenGamma Inc. and the OpenGamma group of companies * * Please see distribution for license. */ package com.opengamma.engine.calcnode; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.Map; import java.util.Queue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ScheduledThreadPoolExecutor; import org.apache.http.concurrent.Cancellable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.opengamma.engine.calcnode.stats.CalculationNodeStatisticsGatherer; import com.opengamma.engine.calcnode.stats.DiscardingNodeStatisticsGatherer; import com.opengamma.engine.function.blacklist.DummyFunctionBlacklistMaintainer; import com.opengamma.engine.function.blacklist.FunctionBlacklistMaintainer; import com.opengamma.util.ArgumentChecker; import com.opengamma.util.async.Cancelable; /** * Manages a set of JobInvokers and dispatches jobs to them for execution. */ public class JobDispatcher implements JobInvokerRegister { private static final Logger s_logger = LoggerFactory.getLogger(JobDispatcher.class); /* package */static final int DEFAULT_MAX_JOB_ATTEMPTS = 2; /* package */static final long DEFAULT_MAX_JOB_EXECUTION_QUERY_TIMEOUT = 5000; /* package */static final String DEFAULT_JOB_FAILURE_NODE_ID = "NOT EXECUTED"; private final Queue<DispatchableJob> _pending = new LinkedList<DispatchableJob>(); private final Queue<JobInvoker> _invokers = new ConcurrentLinkedQueue<JobInvoker>(); private final Map<JobInvoker, Collection<Capability>> _capabilityCache = new ConcurrentHashMap<JobInvoker, Collection<Capability>>(); /** * Maximum number of times a job will be submitted in its entirety to remote nodes before it gets partitioned to isolate an individual failure. */ private int _maxJobAttempts = DEFAULT_MAX_JOB_ATTEMPTS; private String _jobFailureNodeId = DEFAULT_JOB_FAILURE_NODE_ID; private CapabilityRequirementsProvider _capabilityRequirementsProvider = new StaticCapabilityRequirementsProvider(); /** * Maximum number of milliseconds a job can be with an invoker for before it is abandoned. */ private long _maxJobExecutionTime; /** * How often to query an invoker that has outstanding jobs. */ private long _maxJobExecutionTimeQuery = DEFAULT_MAX_JOB_EXECUTION_QUERY_TIMEOUT; private ScheduledThreadPoolExecutor _jobTimeoutExecutor; private CalculationNodeStatisticsGatherer _statisticsGatherer = new DiscardingNodeStatisticsGatherer(); private FunctionBlacklistMaintainer _blacklistUpdate = new DummyFunctionBlacklistMaintainer(); public JobDispatcher() { } public JobDispatcher(final JobInvoker invoker) { registerJobInvoker(invoker); } public JobDispatcher(final Collection<JobInvoker> invokers) { for (JobInvoker invoker : invokers) { registerJobInvoker(invoker); } } public int getMaxJobAttempts() { return _maxJobAttempts; } public void setMaxJobAttempts(final int maxJobAttempts) { _maxJobAttempts = maxJobAttempts; } public void setJobFailureNodeId(final String jobFailureNodeId) { _jobFailureNodeId = jobFailureNodeId; } public String getJobFailureNodeId() { return _jobFailureNodeId; } public long getMaxJobExecutionTime() { return _maxJobExecutionTime; } public FunctionBlacklistMaintainer getFunctionBlacklistMaintainer() { return _blacklistUpdate; } public void setFunctionBlacklistMaintainer(final FunctionBlacklistMaintainer blacklistUpdate) { ArgumentChecker.notNull(blacklistUpdate, "blacklistUpdate"); _blacklistUpdate = blacklistUpdate; } protected ScheduledThreadPoolExecutor getJobTimeoutExecutor() { return _jobTimeoutExecutor; } /** * Sets the maximum time for a job to be with an invoker in milliseconds. To disable the upper limit, * pass 0 or negative. This doesn't affect jobs already launched; only ones that are invoked after * the call. * * @param maxJobExecutionTime time in milliseconds */ public synchronized void setMaxJobExecutionTime(final long maxJobExecutionTime) { _maxJobExecutionTime = maxJobExecutionTime; if (maxJobExecutionTime > 0) { if (_jobTimeoutExecutor == null) { _jobTimeoutExecutor = new ScheduledThreadPoolExecutor(1); _jobTimeoutExecutor.setMaximumPoolSize(1); } } } public void setMaxJobExecutionTimeQuery(final long maxJobExecutionTimeQuery) { if (maxJobExecutionTimeQuery <= 0) { throw new IllegalArgumentException("maxJobExecutionTimeQuery must be greater than 0ms"); } _maxJobExecutionTimeQuery = maxJobExecutionTimeQuery; } public long getMaxJobExecutionTimeQuery() { return _maxJobExecutionTimeQuery; } public void setStatisticsGatherer(final CalculationNodeStatisticsGatherer statisticsGatherer) { _statisticsGatherer = statisticsGatherer; } public CalculationNodeStatisticsGatherer getStatisticsGatherer() { return _statisticsGatherer; } public void setCapabilityRequirementsProvider(final CapabilityRequirementsProvider capabilityRequirementsProvider) { ArgumentChecker.notNull(capabilityRequirementsProvider, "capabilityRequirementsProvider"); _capabilityRequirementsProvider = capabilityRequirementsProvider; } public CapabilityRequirementsProvider getCapabilityRequirementsProvider() { return _capabilityRequirementsProvider; } protected Queue<DispatchableJob> getPending() { return _pending; } protected Queue<JobInvoker> getInvokers() { return _invokers; } protected Map<JobInvoker, Collection<Capability>> getCapabilityCache() { return _capabilityCache; } @Override public synchronized void registerJobInvoker(final JobInvoker invoker) { ArgumentChecker.notNull(invoker, "invoker"); s_logger.debug("Registering job invoker {}", invoker); getInvokers().add(invoker); getCapabilityCache().put(invoker, invoker.getCapabilities()); if (!getPending().isEmpty()) { retryPending(0L); } } // caller must already own monitor private void retryPending(final long failJobsBefore) { s_logger.debug("Retrying pending operations"); final Iterator<DispatchableJob> iterator = getPending().iterator(); while (iterator.hasNext()) { final DispatchableJob job = iterator.next(); if (invoke(job)) { iterator.remove(); } else { if (failJobsBefore <= 0) { if (getInvokers().isEmpty()) { s_logger.debug("No invokers available - not retrying operations"); break; } } else if (job.getJobCreationTime() < failJobsBefore) { iterator.remove(); job.abort(null, "no invokers available after timeout"); } } } } // TODO [ENG-42] schedule retryPending to be called periodically with failJobsBefore set to `System.nanoTime() - a timeout` to cancel jobs which can't be executed at all // TODO [ENG-42] the invoker selection logic is inefficient; it's likely that capability requirements objects won't vary much so comparison against the capabilities of invokers should be cached // TODO [ENG-42] job dispatch should not be O(n) on number of invokers; the caching of capabilities should allow a nearer O(1) selection // caller must already own monitor private boolean invoke(final DispatchableJob job) { if (job.isCompleted()) { s_logger.info("Job {} cancelled", job); return true; } Collection<JobInvoker> retry = null; do { final Iterator<JobInvoker> iterator = getInvokers().iterator(); while (iterator.hasNext()) { final JobInvoker jobInvoker = iterator.next(); if (job.canRunOn(jobInvoker)) { if (job.runOn(jobInvoker)) { s_logger.debug("Invoker {} accepted job {}", jobInvoker, job); // put invoker to the end of the list iterator.remove(); getInvokers().add(jobInvoker); return true; } else { s_logger.debug("Invoker {} refused to execute job {}", jobInvoker, job); iterator.remove(); if (jobInvoker.notifyWhenAvailable(this)) { s_logger.info("Invoker {} requested immediate retry", jobInvoker); if (retry == null) { retry = new LinkedList<JobInvoker>(); } retry.add(jobInvoker); } } } } if (retry != null) { getInvokers().addAll(retry); retry = null; } else { break; } } while (true); s_logger.debug("No invokers available for job {}", job); return false; } protected synchronized void dispatchJobImpl(final DispatchableJob job) { if (!invoke(job)) { s_logger.debug("Adding job to pending set"); getPending().add(job); if (getInvokers() != null) { retryPending(0L); } } } /** * Puts the job into the ready queue, sent to an invoker as soon as one is available. Completion (or timeout) * of the job will result in one or more callbacks to the result receiver. There is always the callback for the * main job. If the job had a tail, a callback will also occur for each tail job. The {@link Cancellable} * callback returned may be used to abort operation. If operation is aborted, results may still be received * if they were too far in the pipeline to be stopped. * * @param job The job to dispatch * @param resultReceiver callback to receive the results * @return A {@link Cancellable} callback to attempt to abort the job */ public Cancelable dispatchJob(final CalculationJob job, final JobResultReceiver resultReceiver) { ArgumentChecker.notNull(job, "job"); ArgumentChecker.notNull(resultReceiver, "resultReceiver"); s_logger.info("Dispatching job {}", job.getSpecification().getJobId()); final DispatchableJob dispatchJob = new StandardJob(this, job, resultReceiver); dispatchJobImpl(dispatchJob); return dispatchJob.getCancelHandle(); } /** * Returns capabilities from all available invokers. * * @return Map of invoker identifier to capability set. */ public Map<String, Collection<Capability>> getAllCapabilities() { final Iterator<Map.Entry<JobInvoker, Collection<Capability>>> invokerCapabilityIterator = getCapabilityCache().entrySet().iterator(); final Map<String, Collection<Capability>> result = new HashMap<String, Collection<Capability>>(); while (invokerCapabilityIterator.hasNext()) { final Map.Entry<JobInvoker, Collection<Capability>> invokerCapability = invokerCapabilityIterator.next(); final String identifier = invokerCapability.getKey().getInvokerId(); if (identifier == null) { invokerCapabilityIterator.remove(); } else { result.put(identifier, invokerCapability.getValue()); } } return result; } }