/**
 * Copyright (C) 2012 - present by OpenGamma Inc. and the OpenGamma group of companies
 *
 * Please see distribution for license.
 */
package com.opengamma.engine.calcnode;

import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.opengamma.engine.cache.CacheSelectHint;
import com.opengamma.engine.value.ValueSpecification;
import com.opengamma.util.tuple.Triple;

/**
 * Standard job with an execution tail that can be retried in the event of failure.
 * <p>
 * See {@link DispatchableJob} for a description of standard and "watched" jobs.
 */
/* package */final class StandardJob extends DispatchableJob {

  private static final Logger s_logger = LoggerFactory.getLogger(StandardJob.class);

  private final ConcurrentMap<CalculationJobSpecification, JobResultReceiver> _resultReceivers;
  private Set<String> _usedJobInvoker;
  private int _rescheduled;

  private static List<CalculationJob> getAllJobs(CalculationJob job, List<CalculationJob> jobs) {
    if (jobs == null) {
      jobs = new LinkedList<CalculationJob>();
    }
    jobs.add(job);
    if (job.getTail() != null) {
      for (CalculationJob tail : job.getTail()) {
        getAllJobs(tail, jobs);
      }
    }
    return jobs;
  }

  /**
   * Creates a new job for submission to the invokers.
   *
   * @param dispatcher the parent dispatcher that manages the invokers
   * @param job the root job to send
   * @param resultReceiver the callback for when the job and its tail complete
   */
  public StandardJob(final JobDispatcher dispatcher, final CalculationJob job, final JobResultReceiver resultReceiver) {
    super(dispatcher, job);
    _resultReceivers = new ConcurrentHashMap<CalculationJobSpecification, JobResultReceiver>();
    final List<CalculationJob> jobs = getAllJobs(job, null);
    for (CalculationJob jobref : jobs) {
      _resultReceivers.put(jobref.getSpecification(), resultReceiver);
    }
  }

  @Override
  protected JobResultReceiver getResultReceiver(final CalculationJobResult result) {
    return _resultReceivers.remove(result.getSpecification());
  }

  @Override
  protected boolean isLastResult() {
    return _resultReceivers.isEmpty();
  }
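
  // Illustrative note (the example structure is hypothetical, not from the original
  // source): a root job with two tail jobs produces three entries in _resultReceivers,
  // one per CalculationJobSpecification, all mapping to the same receiver. Entries are
  // removed as results arrive, so isLastResult() reports true once every job in the
  // tree has either completed or failed.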

  /**
   * Change the cache hints on a job. Tail jobs run on the same node as their parent, but if we split them into discrete jobs any values
   * their parents previously produced into the private cache must now go into the shared cache.
   *
   * @param job the job to process, not null
   * @param outputs map of each privately-produced value to its producing job and that job's (private, shared) value sets, updated as jobs are rewritten, not null
   * @return the adjusted job, not null
   */
  /* package */static CalculationJob adjustCacheHints(final CalculationJob job,
      final Map<ValueSpecification, Triple<CalculationJob, ? extends Set<ValueSpecification>, ? extends Set<ValueSpecification>>> outputs) {
    // (job, private, public)
    final Triple<CalculationJob, ? extends Set<ValueSpecification>, ? extends Set<ValueSpecification>> jobValues = Triple
        .of(job, new HashSet<ValueSpecification>(), new HashSet<ValueSpecification>());
    final CacheSelectHint hint = job.getCacheSelectHint();
    for (CalculationJobItem item : job.getJobItems()) {
      for (ValueSpecification input : item.getInputs()) {
        final Triple<CalculationJob, ? extends Set<ValueSpecification>, ? extends Set<ValueSpecification>> producer = outputs.get(input);
        if (producer == null) {
          // Input produced by a previous job, so must be in the shared cache
          assert !hint.isPrivateValue(input);
          jobValues.getThird().add(input);
        } else if (producer.getFirst() != job) {
          // Input produced by a previous job into the private cache -- rewrite to the shared
          assert hint.isPrivateValue(input);
          jobValues.getThird().add(input);
          if (producer.getSecond().remove(input)) {
            producer.getThird().add(input);
          }
        }
      }
      for (ValueSpecification output : item.getOutputs()) {
        if (hint.isPrivateValue(output)) {
          // Private output -- may be subject to a rewrite
          jobValues.getSecond().add(output);
          outputs.put(output, jobValues);
        } else {
          // Shared output
          jobValues.getThird().add(output);
        }
      }
    }
    // Rewriting the tail can further adjust the sets in our data triple from the original private/shared distribution
    final Collection<CalculationJob> oldTail = job.getTail();
    final Collection<CalculationJob> newTail;
    if (oldTail != null) {
      newTail = new ArrayList<CalculationJob>(oldTail.size());
      for (CalculationJob tail : oldTail) {
        newTail.add(adjustCacheHints(tail, outputs));
      }
    } else {
      newTail = null;
    }
    // Recalculate the smallest hint for our rewritten data
    final CacheSelectHint newHint;
    if (jobValues.getSecond().size() > jobValues.getThird().size()) {
      newHint = CacheSelectHint.sharedValues(jobValues.getThird());
    } else {
      newHint = CacheSelectHint.privateValues(jobValues.getSecond());
    }
    s_logger.debug("Rewriting {} to {}", hint, newHint);
    // Construct the rewritten job
    final CalculationJob newJob = new CalculationJob(job.getSpecification(), job.getFunctionInitializationIdentifier(), job.getResolverVersionCorrection(),
        job.getRequiredJobIds(), job.getJobItems(), newHint);
    if (newTail != null) {
      for (CalculationJob tail : newTail) {
        newJob.addTail(tail);
      }
    }
    return newJob;
  }
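
  // Worked example of the rewrite above (hypothetical values, for illustration only):
  // suppose the root job computes V1 with a private-cache hint and its tail job consumes
  // V1. Run as a single unit, V1 never leaves the node's private cache. Once the tail is
  // resubmitted as a discrete watched job it may execute on a different node, so
  // adjustCacheHints moves V1 from the root's private set to its shared set, and each
  // rewritten job gets a new CacheSelectHint enumerating whichever of its two sets is
  // smaller.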

  /**
   * A watched job instance that corresponds to one of the original jobs. The job may have a tail. When it completes, new watched job
   * instances will be submitted for each tail job.
   */
  /* package */static final class WholeWatchedJob extends WatchedJob implements JobResultReceiver {

    private static final class BlockedJob {

      private final CalculationJob _job;
      private int _count;

      public BlockedJob(final CalculationJob job) {
        _job = job;
      }

    }

    private static final class JobState {

      private boolean _completed;
      private List<BlockedJob> _notify;

    }

    private static final class Context {

      private final ConcurrentMap<CalculationJobSpecification, JobResultReceiver> _resultReceivers;
      private final Long2ObjectMap<JobState> _jobs = new Long2ObjectOpenHashMap<JobState>();

      public Context(final ConcurrentMap<CalculationJobSpecification, JobResultReceiver> resultReceivers) {
        _resultReceivers = resultReceivers;
      }

      public JobResultReceiver getResultReceiver(final CalculationJobResult job) {
        return _resultReceivers.remove(job.getSpecification());
      }

      public synchronized void declareJobPending(final long jobId) {
        _jobs.put(jobId, new JobState());
      }

      public synchronized List<BlockedJob> declareJobCompletion(final long jobId) {
        final JobState job = _jobs.remove(jobId);
        if (job == null) {
          // Duplicate completion -- the state has already been removed
          return null;
        }
        job._completed = true;
        if (job._notify != null) {
          for (BlockedJob notify : job._notify) {
            notify._count--;
          }
          return job._notify;
        } else {
          return Collections.emptyList();
        }
      }

      public synchronized boolean isRunnable(final CalculationJob job) {
        if (job.getRequiredJobIds() == null) {
          return true;
        }
        BlockedJob blocked = null;
        for (long required : job.getRequiredJobIds()) {
          final JobState state = _jobs.get(required);
          if ((state == null) || state._completed) {
            continue;
          }
          if (blocked == null) {
            blocked = new BlockedJob(job);
          }
          blocked._count++;
          if (state._notify == null) {
            state._notify = new LinkedList<BlockedJob>();
          }
          state._notify.add(blocked);
        }
        return blocked == null;
      }

    }

    private final Context _context;
    private final Collection<CalculationJob> _tail;

    private WholeWatchedJob(final DispatchableJob creator, final CalculationJob job, final Context context) {
      super(creator, new CalculationJob(job.getSpecification(), job.getFunctionInitializationIdentifier(), job.getResolverVersionCorrection(), null,
          job.getJobItems(), job.getCacheSelectHint()));
      _context = context;
      _tail = job.getTail();
      context.declareJobPending(job.getSpecification().getJobId());
    }

    @Override
    protected JobResultReceiver getResultReceiver(final CalculationJobResult result) {
      return this;
    }

    @Override
    public void resultReceived(final CalculationJobResult result) {
      final List<BlockedJob> blocked = _context.declareJobCompletion(result.getSpecification().getJobId());
      if (blocked != null) {
        // Submit any blocked tail jobs
        if (!blocked.isEmpty()) {
          for (BlockedJob job : blocked) {
            if (job._count == 0) {
              s_logger.debug("Releasing blocked job {} from {}", job._job, this);
              getDispatcher().dispatchJobImpl(new WholeWatchedJob(this, job._job, _context));
            }
          }
        }
        // Submit any new tail jobs
        if (_tail != null) {
          for (CalculationJob job : _tail) {
            if (_context.isRunnable(job)) {
              s_logger.debug("Submitting tail job {} from {}", job, this);
              getDispatcher().dispatchJobImpl(new WholeWatchedJob(this, job, _context));
            }
          }
        }
        // Notify the original receiver of the job that completed
        final JobResultReceiver receiver = _context.getResultReceiver(result);
        if (receiver != null) {
          s_logger.debug("Watched job {} complete", this);
          receiver.resultReceived(result);
        } else {
          s_logger.warn("Result already dispatched for watched job {} completed on node {}", this, result.getComputeNodeId());
        }
      } else {
        s_logger.warn("Watched job {} completed on node {} but is not currently pending", this, result.getComputeNodeId());
      }
    }

  }
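
  // Sketch of the blocking protocol above (hypothetical job ids, for illustration only):
  // if tail job 3 requires jobs 1 and 2 and both are still pending, isRunnable(job3)
  // registers a single BlockedJob with _count == 2 on the notify lists of both JobStates.
  // Each declareJobCompletion decrements the count and returns the notify list; when the
  // count reaches zero, resultReceived dispatches the job as a new WholeWatchedJob.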

  /* package */WholeWatchedJob createWholeWatchedJob(final CalculationJob job) {
    return new WholeWatchedJob(this, job, new WholeWatchedJob.Context(_resultReceivers));
  }

  /* package */WatchedJob createWatchedJob() {
    if (getJob().getTail() == null) {
      final List<CalculationJobItem> items = getJob().getJobItems();
      switch (items.size()) {
        case 0:
          // Daft case, but not prevented
          return null;
        case 1:
          // If this is a single item with no tail then we can report it immediately and abort
          getDispatcher().getFunctionBlacklistMaintainer().failedJobItem(getJob().getJobItems().get(0));
          return null;
        default: {
          // Job had no tails, so don't need to rewrite the caching
          final JobResultReceiver receiver = _resultReceivers.remove(getJob().getSpecification());
          if (receiver != null) {
            s_logger.debug("Submitting watched job for {}", this);
            return new WatchedJob.Whole(this, getJob(), receiver);
          } else {
            // No result receiver means we've already completed/aborted or are about to do so
            return null;
          }
        }
      }
    } else {
      // Rewrite the private/shared caching information and submit a watched job for the root. Any tail jobs will be submitted after their
      // parent jobs complete
      final CalculationJob job = adjustCacheHints(getJob(),
          new HashMap<ValueSpecification, Triple<CalculationJob, ? extends Set<ValueSpecification>, ? extends Set<ValueSpecification>>>());
      s_logger.debug("Submitting adjusted watched job for {}", this);
      return createWholeWatchedJob(job);
    }
  }

  @Override
  protected DispatchableJob prepareRetryJob(final JobInvoker jobInvoker) {
    if ((_usedJobInvoker != null) && _usedJobInvoker.contains(jobInvoker.getInvokerId())) {
      return createWatchedJob();
    } else {
      _rescheduled++;
      if (_rescheduled >= getDispatcher().getMaxJobAttempts()) {
        return createWatchedJob();
      } else {
        s_logger.info("Retrying job {}", this);
        if (_usedJobInvoker == null) {
          _usedJobInvoker = new HashSet<String>();
        }
        _usedJobInvoker.add(jobInvoker.getInvokerId());
        return this;
      }
    }
  }

  @Override
  protected void fail(final CalculationJob job, final CalculationJobResultItem failure) {
    final JobResultReceiver resultReceiver = _resultReceivers.remove(job.getSpecification());
    if (resultReceiver != null) {
      notifyFailure(job, failure, resultReceiver);
    } else {
      s_logger.warn("Job {} already completed at propagation of failure", this);
      // This can happen if the root job timed out but things had started to complete
    }
    if (job.getTail() != null) {
      for (CalculationJob tail : job.getTail()) {
        fail(tail, failure);
      }
    }
  }

  @Override
  protected boolean isAlive(final JobInvoker jobInvoker) {
    return jobInvoker.isAlive(_resultReceivers.keySet());
  }

  @Override
  protected void cancel(final JobInvoker jobInvoker) {
    jobInvoker.cancel(_resultReceivers.keySet());
  }

  @Override
  public String toString() {
    // Must use the String constructor here -- new StringBuilder('S') would widen the char to an int and treat it as an initial capacity
    final StringBuilder sb = new StringBuilder("S").append(getJob().getSpecification().getJobId());
    if (_rescheduled > 0) {
      sb.append('(').append(_rescheduled + 1).append(')');
    }
    return sb.toString();
  }

}