/**
* Copyright (C) 2009 - present by OpenGamma Inc. and the OpenGamma group of companies
*
* Please see distribution for license.
*/
package com.opengamma.engine.calcnode;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.fudgemsg.FudgeContext;
import org.fudgemsg.FudgeMsgEnvelope;
import org.fudgemsg.MutableFudgeMsg;
import org.fudgemsg.mapping.FudgeDeserializer;
import org.fudgemsg.mapping.FudgeSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.opengamma.OpenGammaRuntimeException;
import com.opengamma.engine.cache.AbstractIdentifierMap;
import com.opengamma.engine.cache.IdentifierMap;
import com.opengamma.engine.calcnode.msg.Cancel;
import com.opengamma.engine.calcnode.msg.Execute;
import com.opengamma.engine.calcnode.msg.Failure;
import com.opengamma.engine.calcnode.msg.Invocations;
import com.opengamma.engine.calcnode.msg.IsAlive;
import com.opengamma.engine.calcnode.msg.Ready;
import com.opengamma.engine.calcnode.msg.RemoteCalcNodeMessage;
import com.opengamma.engine.calcnode.msg.RemoteCalcNodeMessageVisitor;
import com.opengamma.engine.calcnode.msg.Result;
import com.opengamma.engine.calcnode.msg.Scaling;
import com.opengamma.engine.calcnode.stats.FunctionCosts;
import com.opengamma.engine.calcnode.stats.FunctionInvocationStatisticsReceiver;
import com.opengamma.engine.function.NoOpFunction;
import com.opengamma.engine.function.blacklist.FunctionBlacklistMaintainer;
import com.opengamma.engine.function.blacklist.FunctionBlacklistQuery;
import com.opengamma.engine.view.ExecutionLogMode;
import com.opengamma.transport.FudgeConnection;
import com.opengamma.transport.FudgeConnectionStateListener;
import com.opengamma.transport.FudgeMessageReceiver;
import com.opengamma.transport.FudgeMessageSender;
/**
* A JobInvoker for invoking a job on a remote node connected by a FudgeConnection.
*/
/* package */class RemoteNodeJobInvoker implements JobInvoker, FudgeMessageReceiver, FudgeConnectionStateListener {
/**
* Logger shared by the invoker, its message visitor and its asynchronous send task.
*/
private static final Logger s_logger = LoggerFactory.getLogger(RemoteNodeJobInvoker.class);
/**
 * Book-keeping record held for each job that has been handed to the remote node and is awaiting a result or failure.
 */
private static final class JobInfo {

  /**
   * The calculation job.
   */
  private final CalculationJob _job;

  /**
   * The callback to receive notification of the job completion.
   */
  private final JobInvocationReceiver _receiver;

  public JobInfo(final JobInvocationReceiver receiver, final CalculationJob job) {
    _job = job;
    _receiver = receiver;
  }

  public CalculationJob getJob() {
    return _job;
  }

  public JobInvocationReceiver getReceiver() {
    return _receiver;
  }

  /**
   * Returns the amount that is added to the invoker's launched-job counter when a result or failure for this job arrives.
   *
   * @return -1 when the job has no tail, otherwise the number of tail jobs minus one
   */
  public int getLaunchDelta() {
    if (_job.getTail() == null) {
      return -1;
    }
    return _job.getTail().size() - 1;
  }

}
/**
* Jobs registered by the send task in invoke that have not yet been removed by a result, a failure, or a connection failure, keyed by job specification.
*/
private final ConcurrentMap<CalculationJobSpecification, JobInfo> _pendingJobs = new ConcurrentHashMap<CalculationJobSpecification, JobInfo>();
/**
* Executor on which invoke runs the serialization and sending of jobs.
*/
private final ExecutorService _executorService;
/**
* Sender obtained from the Fudge connection; all outgoing messages go through it.
*/
private final FudgeMessageSender _fudgeMessageSender;
/**
* Capabilities reported by getCapabilities; the NODE_COUNT parameter is refreshed by each Ready message.
*/
private final CapabilitySet _capabilitySet = new CapabilitySet();
/**
* Capacity taken from the most recent Ready message.
*/
private volatile int _capacity;
/**
* Launch counter that is compared against the capacity. Incremented by invoke and adjusted by each job's launch delta when its result or failure arrives.
*/
private final AtomicInteger _launched = new AtomicInteger();
/**
* Callback stored by notifyWhenAvailable; cleared (set to null) as soon as registerIfRequired consumes it.
*/
private final AtomicReference<JobInvokerRegister> _dispatchCallback = new AtomicReference<JobInvokerRegister>();
/**
* Identifier map used to convert identifiers in outgoing jobs and to resolve identifiers in incoming results.
*/
private final IdentifierMap _identifierMap;
/**
* Function cost data passed to the invocation statistics receiver when an Invocations message arrives.
*/
private final FunctionCosts _functionCosts;
/**
* Blacklist consulted to replace job items with no-op functions before a job is sent.
*/
private final FunctionBlacklistQuery _blacklistQuery;
/**
* Blacklist maintainer that is told about the items of qualifying failed jobs.
*/
private final FunctionBlacklistMaintainer _blacklistUpdate;
/**
* Host identifier from the initial Ready message; set to null when the connection fails.
*/
private volatile String _invokerId;
private final RemoteCalcNodeMessageVisitor _messageVisitor = new RemoteCalcNodeMessageVisitor() {
/**
* Logs a warning for any message type that this visitor does not handle explicitly; the message is otherwise ignored.
*
* @param message the unhandled message
*/
@Override
protected void visitUnexpectedMessage(final RemoteCalcNodeMessage message) {
s_logger.warn("Unexpected message - {}", message);
}
/**
 * Handles a job failure reported by the remote node: processes any embedded ready message, releases the job's slot in the launch
 * counter, offers spare capacity to the dispatcher and finally reports the failure to the job's receiver.
 *
 * @param message the failure message
 */
@Override
protected void visitFailureMessage(final Failure message) {
  s_logger.info("Received failure for job {}", message.getJob());
  if (message.getReady() != null) {
    message.getReady().accept(this);
  }
  // The counter is adjusted (and the invoker re-registered) before the failure is processed; the remote node is
  // already available again if it has managed to send us this message.
  final JobInfo failedJob = getPendingJobs().remove(message.getJob());
  if (failedJob == null) {
    s_logger.warn("Duplicate or failure for cancelled callback {} received", message.getJob());
    return;
  }
  // Strictly below capacity is required. "Equal" is possible, but then an invoke is in progress and the
  // notifyWhenAvailable call that reschedules the invoker will deal with it.
  final int launchedNow = _launched.addAndGet(failedJob.getLaunchDelta());
  if ((launchedNow < _capacity) && registerIfRequired(true)) {
    s_logger.debug("Notified dispatcher of capacity available");
  }
  s_logger.debug("Failed job on {} with message {}", message.getComputeNodeId(), message.getErrorMessage());
  jobFailed(failedJob, message.getComputeNodeId(), new OpenGammaRuntimeException(message.getErrorMessage()));
}
/**
 * Handles invocation statistics from the remote node. The statistics are passed to the statistics receiver together with this
 * invoker's function costs; if that produces a scaling message it is serialized and sent back to the node.
 *
 * @param message the invocation statistics message
 */
@Override
protected void visitInvocationsMessage(final Invocations message) {
  s_logger.info("Received invocation statistics");
  final Scaling scaling = FunctionInvocationStatisticsReceiver.messageReceived(getFunctionCosts(), message);
  if (scaling != null) {
    // The format string needs a placeholder, otherwise the scaling argument is silently dropped from the log output
    s_logger.debug("Sending scaling message {}", scaling);
    final MutableFudgeMsg scalingMessage = getFudgeMessageSender().getFudgeContext().newMessage();
    FudgeSerializer.addClassHeader(scalingMessage, scaling.getClass(), RemoteCalcNodeMessage.class);
    scaling.toFudgeMsg(new FudgeSerializer(getFudgeMessageSender().getFudgeContext()), scalingMessage);
    getFudgeMessageSender().send(scalingMessage);
  }
}
/**
* Handles a capacity announcement from the remote node. The announced capacity is stored both as the NODE_COUNT capability and
* as this invoker's capacity. Depending on the current launch count the method then corrects a negative count, offers the
* invoker to the dispatcher, or logs that the node is over capacity.
*
* @param message the ready message carrying the node's capacity
*/
@Override
protected void visitReadyMessage(final Ready message) {
s_logger.debug("Remote invoker ready message - {}", message);
getCapabilitySet().setParameterCapability(PlatformCapabilities.NODE_COUNT, message.getCapacity());
// [ENG-42] this is where we'd detect any other capability changes
_capacity = message.getCapacity();
// Single read of the counter; all branches below act on this snapshot
final int launched = _launched.get();
if (launched < 0) {
// An additional decrement can happen if there is an error in the original job dispatch
_launched.incrementAndGet();
} else if (launched < _capacity) {
// Below capacity: hand this invoker to the dispatcher if a callback is waiting
if (registerIfRequired(true)) {
s_logger.info("Remote invoker ready for use by dispatcher, capacity {}", message.getCapacity());
}
} else {
s_logger.info("Remote invoker over capacity {} with {} jobs", message.getCapacity(), launched);
}
}
/**
 * Handles a job result from the remote node: processes any embedded ready message, releases the job's slot in the launch
 * counter, offers spare capacity to the dispatcher, resolves the identifiers in the result and passes it to the job's receiver.
 *
 * @param message the result message
 */
@Override
protected void visitResultMessage(final Result message) {
  s_logger.info("Received result for job {}", message.getResult().getSpecification());
  if (message.getReady() != null) {
    message.getReady().accept(this);
  }
  // The counter is adjusted (and the invoker re-registered) before the data is processed; the remote node is
  // already available again if it has managed to send us its data.
  final JobInfo completedJob = getPendingJobs().remove(message.getResult().getSpecification());
  if (completedJob == null) {
    s_logger.warn("Duplicate or result for cancelled callback {} received", message.getResult().getSpecification());
    return;
  }
  // Strictly below capacity is required. "Equal" is possible, but then an invoke is in progress and the
  // notifyWhenAvailable call that reschedules the invoker will deal with it.
  final int launchedNow = _launched.addAndGet(completedJob.getLaunchDelta());
  if ((launchedNow < _capacity) && registerIfRequired(true)) {
    s_logger.debug("Notified dispatcher of capacity available");
  }
  final CalculationJobResult jobResult = message.getResult();
  AbstractIdentifierMap.resolveIdentifiers(getIdentifierMap(), jobResult);
  completedJob.getReceiver().jobCompleted(jobResult);
}
};
/**
* Creates an invoker bound to the given connection. The new instance installs itself as the connection's message receiver and
* state listener and then processes the initial ready message through the message visitor.
*
* @param executorService the executor used by invoke to send jobs
* @param initialMessage the first ready message from the node; supplies the host id and is dispatched to the message visitor
* @param fudgeConnection the connection to the remote node
* @param identifierMap the map used to convert and resolve value identifiers
* @param functionCosts the function cost data passed to the statistics receiver
* @param blacklistQuery the blacklist used to replace job items before sending
* @param blacklistUpdate the blacklist maintainer notified of qualifying failed job items
*/
public RemoteNodeJobInvoker(
final ExecutorService executorService, final Ready initialMessage, final FudgeConnection fudgeConnection,
final IdentifierMap identifierMap, final FunctionCosts functionCosts, final FunctionBlacklistQuery blacklistQuery,
final FunctionBlacklistMaintainer blacklistUpdate) {
_executorService = executorService;
_fudgeMessageSender = fudgeConnection.getFudgeMessageSender();
_identifierMap = identifierMap;
_invokerId = initialMessage.getHostId();
_functionCosts = functionCosts;
_blacklistQuery = blacklistQuery;
_blacklistUpdate = blacklistUpdate;
// NOTE(review): "this" is published to the connection before construction completes, so callbacks may arrive from here on
fudgeConnection.setFudgeMessageReceiver(this);
fudgeConnection.setConnectionStateListener(this);
// Handled by visitReadyMessage, which sets _capacity before it is logged below
initialMessage.accept(_messageVisitor);
s_logger.info("Remote node invoker created with capacity {}", _capacity);
}
/**
* Returns the mutable capability set backing getCapabilities.
*
* @return the capability set
*/
private CapabilitySet getCapabilitySet() {
return _capabilitySet;
}
/**
* Adds the given capabilities to this invoker's capability set.
*
* @param capabilities the capabilities to add
*/
protected void addCapabilities(final Collection<Capability> capabilities) {
getCapabilitySet().addCapabilities(capabilities);
}
/**
* Returns the capabilities currently held in this invoker's capability set.
*
* @return the capabilities
*/
@Override
public Collection<Capability> getCapabilities() {
return getCapabilitySet().getCapabilities();
}
/**
* Returns the map of jobs awaiting a result or failure from the remote node.
*
* @return the pending jobs, keyed by job specification
*/
private ConcurrentMap<CalculationJobSpecification, JobInfo> getPendingJobs() {
return _pendingJobs;
}
/**
* Returns the sender used for all outgoing messages.
*
* @return the message sender
*/
private FudgeMessageSender getFudgeMessageSender() {
return _fudgeMessageSender;
}
/**
* Returns the executor on which jobs are sent.
*
* @return the executor service
*/
private ExecutorService getExecutorService() {
return _executorService;
}
/**
* Returns the identifier map used to convert outgoing jobs and resolve incoming results.
*
* @return the identifier map
*/
private IdentifierMap getIdentifierMap() {
return _identifierMap;
}
/**
* Returns the function cost data passed to the invocation statistics receiver.
*
* @return the function costs
*/
private FunctionCosts getFunctionCosts() {
return _functionCosts;
}
/**
* Returns the blacklist consulted before a job is sent.
*
* @return the blacklist query
*/
private FunctionBlacklistQuery getBlacklistQuery() {
return _blacklistQuery;
}
/**
* Returns the blacklist maintainer that is notified of failed job items.
*
* @return the blacklist maintainer
*/
private FunctionBlacklistMaintainer getBlacklistUpdate() {
return _blacklistUpdate;
}
/**
 * Serializes a message, tags it with its class header relative to {@link RemoteCalcNodeMessage}, and sends it to the remote node.
 *
 * @param message the message to send
 */
protected void sendMessage(final RemoteCalcNodeMessage message) {
  final FudgeMessageSender sender = getFudgeMessageSender();
  final FudgeSerializer serializer = new FudgeSerializer(sender.getFudgeContext());
  sender.send(FudgeSerializer.addClassHeader(serializer.objectToFudgeMsg(message), message.getClass(), RemoteCalcNodeMessage.class));
}
/**
 * Reports a job failure to the receiver. If the job has no tail, no required job ids and at most one job item, its items are
 * also reported to the blacklist maintainer.
 *
 * @param receiver the callback to notify
 * @param job the job that failed
 * @param nodeId the identifier of the node the failure is attributed to
 * @param e the cause of the failure
 */
private void jobFailed(final JobInvocationReceiver receiver, final CalculationJob job, final String nodeId, final Exception e) {
  receiver.jobFailed(this, nodeId, e);
  final boolean standalone = (job.getTail() == null) && (job.getRequiredJobIds() == null);
  if (!standalone) {
    return;
  }
  final Collection<CalculationJobItem> failedItems = job.getJobItems();
  if (failedItems.size() <= 1) {
    getBlacklistUpdate().failedJobItems(failedItems);
  }
}

/**
 * Reports the failure of a pending job using the receiver and job held in its {@link JobInfo}.
 *
 * @param job the pending job record
 * @param nodeId the identifier of the node the failure is attributed to
 * @param e the cause of the failure
 */
private void jobFailed(final JobInfo job, final String nodeId, final Exception e) {
  final JobInvocationReceiver receiver = job.getReceiver();
  jobFailed(receiver, job.getJob(), nodeId, e);
}
/**
 * Replaces any blacklisted job items with no-op functions. This keeps the shape of the job the same and may allow continuation of dependent jobs that can operate on missing inputs.
 * <p>
 * If the blacklist is empty, or none of the job's items are blacklisted, the original job instance is returned unchanged.
 *
 * @param query the blacklist to test each job item against
 * @param job the job to filter
 * @return the original job, or a new job with the same specification whose blacklisted items have been replaced
 */
/* package */static CalculationJob blacklist(final FunctionBlacklistQuery query, final CalculationJob job) {
  if (query.isEmpty()) {
    return job;
  }
  final List<CalculationJobItem> originalItems = job.getJobItems();
  final int size = originalItems.size();
  for (int i = 0; i < size; i++) {
    final CalculationJobItem firstBlacklisted = originalItems.get(i);
    if (query.isBlacklisted(firstBlacklisted)) {
      // Only allocate a new list once the first blacklisted item is found; everything before it is known to be clean
      final List<CalculationJobItem> newItems = new ArrayList<CalculationJobItem>(size);
      newItems.addAll(originalItems.subList(0, i));
      newItems.add(createNoOpItem(firstBlacklisted));
      for (int j = i + 1; j < size; j++) {
        // Must index with j here; indexing with i would re-test and re-add the first blacklisted item for every remaining slot
        final CalculationJobItem item = originalItems.get(j);
        if (query.isBlacklisted(item)) {
          newItems.add(createNoOpItem(item));
        } else {
          newItems.add(item);
        }
      }
      return new CalculationJob(job.getSpecification(), job.getFunctionInitializationIdentifier(), job.getResolverVersionCorrection(), job.getRequiredJobIds(), newItems, job.getCacheSelectHint());
    }
  }
  return job;
}

/**
 * Creates a job item that invokes the no-op function in place of the given item, keeping its parameters, target, inputs and outputs.
 *
 * @param item the blacklisted item to replace
 * @return the replacement item
 */
private static CalculationJobItem createNoOpItem(final CalculationJobItem item) {
  return new CalculationJobItem(
      NoOpFunction.UNIQUE_ID, item.getFunctionParameters(), item.getComputationTargetSpecification(),
      item.getInputIdentifiers(), item.getOutputIdentifiers(), ExecutionLogMode.INDICATORS);
}
/**
* Attempts to reserve a slot on the remote node and, if successful, schedules the job and all of its tail jobs to be sent on the
* executor service.
*
* @param rootJob the job to run; any tail jobs are sent along with it
* @param receiver the callback notified of completion or failure
* @return true if a slot was reserved and sending was scheduled, false if the node is at capacity
*/
@Override
public boolean invoke(final CalculationJob rootJob, final JobInvocationReceiver receiver) {
// Optimistically take a slot. If that exceeds the capacity the slot is given back; when the count is still at or above
// capacity after giving it back the job is rejected, otherwise the reservation is retried.
while (_launched.incrementAndGet() > _capacity) {
if (_launched.decrementAndGet() >= _capacity) {
s_logger.debug("Capacity reached");
return false;
}
}
s_logger.info("Dispatching job {}", rootJob.getSpecification());
// Don't block the dispatcher with outgoing serialization and I/O
getExecutorService().execute(new Runnable() {
// Registers the job as pending before anything is sent, converts its identifiers, then sends it with blacklisted items replaced
private void sendJob(final CalculationJob job) throws Exception {
getPendingJobs().put(job.getSpecification(), new JobInfo(receiver, job));
AbstractIdentifierMap.convertIdentifiers(getIdentifierMap(), job);
sendMessage(new Execute(blacklist(getBlacklistQuery(), job)));
}
@Override
public void run() {
// Breadth first sending of jobs, just in case some can start before we've sent everything
try {
sendJob(rootJob);
if (rootJob.getTail() != null) {
// Work queue of tail jobs; each job's own tail is appended after the job has been sent
final Queue<CalculationJob> jobs = new LinkedList<CalculationJob>(rootJob.getTail());
CalculationJob job = jobs.poll();
while (job != null) {
sendJob(job);
if (job.getTail() != null) {
jobs.addAll(job.getTail());
}
job = jobs.poll();
}
}
} catch (Exception e) {
s_logger.warn("Error sending job {}", rootJob.getSpecification().getJobId());
// The failure is reported against the root job, whichever job in the tree was being sent
jobFailed(receiver, rootJob, "node on " + getInvokerId(), e);
// Not knowing where the failure occurred, we may get an additional decrement if any of the jobs started completing. This may have
// broken the whole connection which will not be a problem. Otherwise we'll check, and adjust, for this when "Ready" messages
// arrive.
if (_launched.decrementAndGet() < _capacity) {
if (registerIfRequired(true)) {
s_logger.debug("Notified dispatcher of capacity available");
}
}
}
}
});
return true;
}
/**
 * Sends a cancellation request for a collection of jobs to the remote node.
 *
 * @param jobs the specifications of the jobs to cancel
 */
@Override
public void cancel(final Collection<CalculationJobSpecification> jobs) {
  s_logger.info("Cancelling {} jobs at {}", jobs.size(), getInvokerId());
  final Cancel request = new Cancel(jobs);
  sendMessage(request);
}

/**
 * Sends a cancellation request for a single job to the remote node.
 *
 * @param job the specification of the job to cancel
 */
@Override
public void cancel(final CalculationJobSpecification job) {
  s_logger.info("Cancelling {} at {}", job, getInvokerId());
  final Cancel request = new Cancel(Collections.singleton(job));
  sendMessage(request);
}
/**
 * Sends a liveness query for a collection of jobs to the remote node. Returns true with the remote client generating failure
 * messages if anything is not alive.
 *
 * @param jobs the specifications of the jobs to query
 * @return always true
 */
@Override
public boolean isAlive(final Collection<CalculationJobSpecification> jobs) {
  s_logger.info("Querying {} jobs at {}", jobs.size(), getInvokerId());
  final IsAlive query = new IsAlive(jobs);
  sendMessage(query);
  return true;
}

/**
 * Sends a liveness query for a single job to the remote node.
 *
 * @param job the specification of the job to query
 * @return always true
 */
@Override
public boolean isAlive(final CalculationJobSpecification job) {
  s_logger.info("Querying {} at {}", job.getJobId(), getInvokerId());
  final IsAlive query = new IsAlive(Collections.singleton(job));
  sendMessage(query);
  return true;
}
/**
 * Stores the dispatcher's callback for use when capacity becomes available. If capacity is already available the stored
 * callback is cleared again without being invoked and true is returned.
 *
 * @param callback the callback to register this invoker with when capacity is available
 * @return true if capacity is available now (the callback will not be invoked), false if the callback has been retained
 */
@Override
public boolean notifyWhenAvailable(final JobInvokerRegister callback) {
  _dispatchCallback.set(callback);
  if (_launched.get() >= _capacity) {
    return false;
  }
  if (!registerIfRequired(false)) {
    return false;
  }
  s_logger.debug("Capacity available at notify");
  return true;
}
/**
 * Atomically takes the stored dispatcher callback, if there is one, and optionally registers this invoker with it.
 *
 * @param invokeCallback true to call the callback, false to only clear it
 * @return true if a callback was stored (and has now been cleared), false if there was none
 */
private boolean registerIfRequired(final boolean invokeCallback) {
  final JobInvokerRegister waiting = _dispatchCallback.getAndSet(null);
  if (waiting == null) {
    return false;
  }
  if (invokeCallback) {
    waiting.registerJobInvoker(this);
  }
  return true;
}
/**
 * Decodes an incoming Fudge message into a {@link RemoteCalcNodeMessage} and dispatches it to the message visitor.
 *
 * @param fudgeContext the context used to deserialize the message
 * @param msgEnvelope the envelope holding the received message
 */
@Override
public void messageReceived(final FudgeContext fudgeContext, final FudgeMsgEnvelope msgEnvelope) {
  final RemoteCalcNodeMessage decoded = new FudgeDeserializer(fudgeContext).fudgeMsgToObject(RemoteCalcNodeMessage.class, msgEnvelope.getMessage());
  decoded.accept(_messageVisitor);
}
/**
 * Handles loss of the connection: blocks further submissions, clears the invoker id and fails every job still pending.
 *
 * @param connection the connection that failed
 * @param cause the reason for the failure, passed on to each failed job
 */
@Override
public void connectionFailed(final FudgeConnection connection, final Exception cause) {
  s_logger.warn("Client connection {} dropped", connection, cause);
  // Force over capacity to prevent any new submissions
  _launched.addAndGet(_capacity);
  final String lostInvokerId = _invokerId;
  _invokerId = null;
  final ConcurrentMap<CalculationJobSpecification, JobInfo> pending = getPendingJobs();
  for (CalculationJobSpecification spec : pending.keySet()) {
    final JobInfo abandoned = pending.remove(spec);
    if (abandoned == null) {
      // Late messages can still arrive from a buffer after the connection has failed and may already have claimed this job
      continue;
    }
    s_logger.debug("Cancelling pending operation {}", spec);
    jobFailed(abandoned, "node on " + lostInvokerId, cause);
  }
}
/**
* Logs that the connection was reset; no other action is taken.
*
* @param connection the connection that was reset
*/
@Override
public void connectionReset(final FudgeConnection connection) {
s_logger.info("Connection reset by client");
// We're the server end of a connection, so this isn't going to happen with the socket implementation
}
/**
* Returns the string form of the underlying message sender.
*
* @return the message sender's string representation
*/
@Override
public String toString() {
return _fudgeMessageSender.toString();
}
/**
* Returns the host identifier taken from the initial ready message.
*
* @return the invoker id, or null once the connection has failed
*/
@Override
public String getInvokerId() {
return _invokerId;
}
}