/**
* Copyright (C) 2009 - present by OpenGamma Inc. and the OpenGamma group of companies
*
* Please see distribution for license.
*/
package com.opengamma.engine.calcnode;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.opengamma.OpenGammaRuntimeException;
import com.opengamma.engine.view.ExecutionLogMode;
import com.opengamma.util.async.Cancelable;
/**
* A job and all of its tails that can be dispatched to calculation nodes via an invoker. If the job fails (the local JobInvoker instance will make sure a failure is reported when loss of a remote
* node is detected) it may be rescheduled. If the maximum number of reschedules is exceeded, or the job is failed on the same node twice, it becomes a "watched" job.
* <p>
* A watched job may not have a tail. If a job has a tail, the tails are extracted and rewritten, along with the root job, to use shared cache for values. The original job may then be resubmitted to
* the dispatcher with a job result receiver that will submit the tail job(s) on original job completion. When those tails have complete the original result receiver will be notified.
* <p>
* When a watched job fails, if it contains a single job item that item can be reported to a blacklist maintainer. If the job contains multiple items it is split into two parts, rewritten to use the
* shared cache for values. The first will then be resubmitted with a job result receiver that will submit the second part of the job on first part completion. When the second part of the job
* completed the original callback will be notified.
*/
/* package */abstract class DispatchableJob implements JobInvocationReceiver {
private static final Logger s_logger = LoggerFactory.getLogger(DispatchableJob.class);
protected static final class CancelHandle implements Cancelable {
private volatile DispatchableJob[] _jobs;
private CancelHandle(final DispatchableJob job) {
_jobs = new DispatchableJob[] {job };
}
public synchronized void addCallback(final DispatchableJob job) {
DispatchableJob[] jobs = new DispatchableJob[_jobs.length + 1];
System.arraycopy(_jobs, 0, jobs, 1, _jobs.length);
jobs[0] = job;
_jobs = jobs;
}
public synchronized void removeCallback(final DispatchableJob job) {
final DispatchableJob[] oldJobs = _jobs;
for (int i = 0; i < oldJobs.length; i++) {
if (job == oldJobs[i]) {
final DispatchableJob[] newJobs = new DispatchableJob[oldJobs.length - 1];
if (i > 0) {
System.arraycopy(oldJobs, 0, newJobs, 0, i);
}
if (i < oldJobs.length - 1) {
System.arraycopy(oldJobs, i + 1, newJobs, i, oldJobs.length - (i + 1));
}
_jobs = newJobs;
break;
}
}
}
@Override
public boolean cancel(final boolean mayInterruptIfRunning) {
DispatchableJob[] jobs = _jobs;
if (jobs.length == 0) {
// Nothing here to cancel - probably already finished
return false;
} else {
boolean result = true;
for (DispatchableJob job : jobs) {
result &= job.cancel(mayInterruptIfRunning);
}
return result;
}
}
}
private final JobDispatcher _dispatcher;
private final CalculationJob _job;
private final AtomicBoolean _completed = new AtomicBoolean(false);
private final long _jobCreationTime;
private final CapabilityRequirements _capabilityRequirements;
private final AtomicReference<DispatchableJobTimeout> _timeout = new AtomicReference<DispatchableJobTimeout>();
private final CancelHandle _cancelHandle;
/**
* Creates a new dispatchable job for submission to the invokers.
*
* @param dispatcher the parent dispatcher that manages the invokers
* @param job the root job to send
*/
protected DispatchableJob(final JobDispatcher dispatcher, final CalculationJob job) {
_dispatcher = dispatcher;
_job = job;
_jobCreationTime = System.nanoTime();
_capabilityRequirements = dispatcher.getCapabilityRequirementsProvider().getCapabilityRequirements(job);
_cancelHandle = new CancelHandle(this);
}
/**
* Creates a new dispatchable job for submission to the invokers.
*
* @param creater the source job with the parameters for construction
* @param job the root job to send
*/
protected DispatchableJob(final DispatchableJob creater, final CalculationJob job) {
_dispatcher = creater.getDispatcher();
_job = job;
_jobCreationTime = System.nanoTime();
_capabilityRequirements = _dispatcher.getCapabilityRequirementsProvider().getCapabilityRequirements(job);
_cancelHandle = creater.getCancelHandle();
_cancelHandle.addCallback(this);
}
protected long getDurationNanos() {
return System.nanoTime() - getJobCreationTime();
}
protected CalculationJob getJob() {
return _job;
}
public JobDispatcher getDispatcher() {
return _dispatcher;
}
protected CancelHandle getCancelHandle() {
return _cancelHandle;
}
protected abstract JobResultReceiver getResultReceiver(CalculationJobResult result);
protected abstract boolean isLastResult();
@Override
public void jobCompleted(final CalculationJobResult result) {
final JobResultReceiver resultReceiver = getResultReceiver(result);
if (resultReceiver == null) {
s_logger.warn("Job {} completed on node {} but is not currently pending", this, result.getComputeNodeId());
// Note the above warning can happen if we've been retried
extendTimeout(getDispatcher().getMaxJobExecutionTime(), true);
return;
}
if (isLastResult()) {
// This is the last one to complete. Note that if the last few jobs complete concurrently, both may execute this code.
_completed.set(true);
cancelTimeout(DispatchableJobTimeout.FINISHED);
} else {
// Others are still running, but we can extend the timeout period
extendTimeout(getDispatcher().getMaxJobExecutionTime(), true);
}
s_logger.info("Job {} completed on node {}", this, result.getComputeNodeId());
resultReceiver.resultReceived(result);
final long durationNanos = getDurationNanos();
s_logger.debug("Reported time = {}ms, non-executing job time = {}ms", (double) result.getDuration() / 1000000d, ((double) durationNanos - (double) result.getDuration()) / 1000000d);
if (getDispatcher().getStatisticsGatherer() != null) {
final int size = result.getResultItems().size();
getDispatcher().getStatisticsGatherer().jobCompleted(result.getComputeNodeId(), size, result.getDuration(), getDurationNanos());
}
}
protected abstract DispatchableJob prepareRetryJob(JobInvoker jobInvoker);
@Override
public void jobFailed(final JobInvoker jobInvoker, final String computeNodeId, final Exception exception) {
s_logger.warn("Job {} failed, {}", this, (exception != null) ? exception.getMessage() : "no exception passed");
if (_completed.getAndSet(true) == false) {
cancelTimeout(null);
DispatchableJob retry = prepareRetryJob(jobInvoker);
if (retry != null) {
if (retry == this) {
// We're scheduled to retry
_completed.set(false);
getDispatcher().dispatchJobImpl(this);
} else {
// A replacement has been constructed
getCancelHandle().removeCallback(this);
cancelTimeout(DispatchableJobTimeout.REPLACED);
_completed.set(false);
getDispatcher().dispatchJobImpl(retry);
}
} else {
_completed.set(false);
abort(exception, "node invocation error");
}
if (getDispatcher().getStatisticsGatherer() != null) {
getDispatcher().getStatisticsGatherer().jobFailed(computeNodeId, getDurationNanos());
}
} else {
s_logger.warn("Job {} failed on node {} but we've already completed, aborted or failed", this, computeNodeId);
}
}
protected void notifyFailure(final CalculationJob job, final CalculationJobResultItem failure, final JobResultReceiver resultReceiver) {
final int size = job.getJobItems().size();
final List<CalculationJobResultItem> failureItems = new ArrayList<CalculationJobResultItem>(size);
for (int i = 0; i < size; i++) {
failureItems.add(failure);
}
final CalculationJobResult jobResult = new CalculationJobResult(job.getSpecification(), getDurationNanos(), failureItems, getDispatcher().getJobFailureNodeId());
resultReceiver.resultReceived(jobResult);
}
protected abstract void fail(final CalculationJob job, final CalculationJobResultItem failure);
public void abort(Exception exception, final String alternativeError) {
if (_completed.getAndSet(true) == false) {
cancelTimeout(DispatchableJobTimeout.FINISHED);
if (exception == null) {
s_logger.error("Aborted job {} with {}", this, alternativeError);
exception = new OpenGammaRuntimeException(alternativeError);
exception.fillInStackTrace();
} else {
s_logger.error("Aborted job {} with {}", this, exception);
}
// REVIEW jonathan 2012-11-01 -- where's the 'real' execution log here?
MutableExecutionLog executionLog = new MutableExecutionLog(ExecutionLogMode.INDICATORS);
executionLog.setException(exception);
fail(getJob(), CalculationJobResultItemBuilder.of(executionLog).toResultItem());
} else {
s_logger.warn("Job {} aborted but we've already completed or aborted from another node", this);
}
}
protected abstract boolean isAlive(final JobInvoker jobInvoker);
public void timeout(final long timeAccrued, final JobInvoker jobInvoker) {
s_logger.debug("Timeout on {}, {}ms accrued", jobInvoker.getInvokerId(), timeAccrued);
final long remaining = getDispatcher().getMaxJobExecutionTime() - timeAccrued;
if (remaining > 0) {
if (isAlive(jobInvoker)) {
s_logger.debug("Invoker {} reports job {} still alive", jobInvoker.getInvokerId(), this);
extendTimeout(remaining, false);
return;
} else {
s_logger.warn("Invoker {} reports job {} failure", jobInvoker.getInvokerId(), this);
jobFailed(jobInvoker, "node on " + jobInvoker.getInvokerId(), new OpenGammaRuntimeException("Node reported failure at " + timeAccrued + "ms keepalive"));
}
} else {
jobFailed(jobInvoker, "node on " + jobInvoker.getInvokerId(), new OpenGammaRuntimeException("Invocation limit of " + getDispatcher().getMaxJobExecutionTime() + "ms exceeded"));
}
}
private DispatchableJobTimeout cancelTimeout(final DispatchableJobTimeout flagState) {
final DispatchableJobTimeout timeout = _timeout.getAndSet(flagState);
if (timeout == null) {
s_logger.debug("Job {} timeout transition null to {}", this, flagState);
} else if (timeout.isActive()) {
s_logger.debug("Cancelling timeout for {} on transition to {}", this, flagState);
timeout.cancel();
return timeout;
} else {
s_logger.debug("Job {} timeout transition from {} to {}", new Object[] {this, timeout, flagState });
}
return null;
}
private void extendTimeout(final long remainingMillis, final boolean resetTimeAccrued) {
final DispatchableJobTimeout timeout = _timeout.get();
if ((timeout != null) && timeout.isActive()) {
s_logger.debug("Extending timeout on job {}", getJob().getSpecification().getJobId());
timeout.extend(Math.min(remainingMillis, getDispatcher().getMaxJobExecutionTime()), resetTimeAccrued);
}
}
private CapabilityRequirements getRequirements() {
return _capabilityRequirements;
}
public long getJobCreationTime() {
return _jobCreationTime;
}
public boolean canRunOn(final JobInvoker jobInvoker) {
// TODO: aiwg -- not happy with this approach for capabilities
return getRequirements().satisfiedBy(jobInvoker.getCapabilities());
}
protected abstract void cancel(final JobInvoker jobInvoker);
private boolean cancel(boolean mayInterruptIfRunning) {
s_logger.info("Cancelling job {}", this);
while (_completed.getAndSet(true) != false) {
Thread.yield();
final DispatchableJobTimeout timeout = _timeout.get();
if (timeout.isActive()) {
s_logger.debug("Job {} - currently failing, cancelling or aborting", this);
} else {
if (timeout == DispatchableJobTimeout.CANCELLED) {
s_logger.info("Job {} already cancelled", this);
return true;
} else {
s_logger.info("Job {} - can't cancel: {}", this, timeout);
return false;
}
}
}
final DispatchableJobTimeout timeout = cancelTimeout(DispatchableJobTimeout.CANCELLED);
if (timeout != null) {
final JobInvoker invoker = timeout.getInvoker();
if (invoker != null) {
cancel(invoker);
}
}
return true;
}
public boolean isCompleted() {
return _completed.get();
}
public boolean runOn(final JobInvoker jobInvoker) {
if (!jobInvoker.invoke(getJob(), this)) {
return false;
}
DispatchableJobTimeout timeout = new DispatchableJobTimeout(this, jobInvoker);
if (_timeout.compareAndSet(null, timeout)) {
s_logger.debug("Timeout set for job {}", this);
} else {
timeout.cancel();
if (s_logger.isDebugEnabled()) {
timeout = _timeout.get();
s_logger.debug("Timeout {} for job {}", timeout, this);
}
}
return true;
}
}