/*
* Copyright (c) 2008-2012 EMC Corporation
* All Rights Reserved
*/
package com.emc.storageos.volumecontroller.impl;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.curator.framework.recipes.locks.Lease;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.emc.storageos.Controller;
import com.emc.storageos.coordinator.client.service.CoordinatorClient;
import com.emc.storageos.coordinator.client.service.DistributedAroundHook;
import com.emc.storageos.coordinator.client.service.DistributedLockQueueManager;
import com.emc.storageos.coordinator.client.service.DistributedQueue;
import com.emc.storageos.coordinator.client.service.DistributedQueueItemProcessedCallback;
import com.emc.storageos.coordinator.client.service.DistributedSemaphore;
import com.emc.storageos.coordinator.client.service.impl.DistributedQueueConsumer;
import com.emc.storageos.coordinator.exceptions.CoordinatorException;
import com.emc.storageos.exceptions.ClientControllerException;
import com.emc.storageos.exceptions.DeviceControllerException;
import com.emc.storageos.locking.LockRetryException;
import com.emc.storageos.volumecontroller.ControllerException;
import com.emc.storageos.workflow.WorkflowService;
import com.google.common.base.Joiner;
/**
* Main API for queueing / dispatching calls to device specific controller implementations.
*/
public class Dispatcher extends DistributedQueueConsumer<ControlRequest> {
private static final Logger _log = LoggerFactory.getLogger(Dispatcher.class);
// todo separate queue per build?
private static final String QUEUE_NAME = "controller";
private static final int DEFAULT_MAX_THREADS = 10;
private static final long DEFAULT_MAX_WAIT_STOP = 60 * 1000;
private static final int DEFAULT_METHOD_EXECUTOR_POOL_SIZE = 100;
private static final int ACQUIRE_LEASE_WAIT_TIME_SECONDS = 10;
private static final int ACQUIRE_LEASE_RETRY_WAIT_TIME__SECONDS = 10;
private static final int LOCK_RETRY_WAIT_TIME_SECONDS = 60;
private static final int DEFAULT_CONTROLLER_MAX_ITEM = 1000;
private static final int MAX_WORKFLOW_STEPS = 10000;
private static final long STALE_ITEM_THRESHOLD = 24 * 60 * 60 * 1000; // 24 hours in milliseconds
// Define the Queues used by the Dispatcher.
// To add a new Queue, add its name to the QueueName enum, and then add a constructor
// in the DispatcherQueue[] _queues below.
public static enum QueueName {
controller, workflow_outer, workflow_inner;
};
private class DispatcherQueue {
final QueueName _queue_name;
final Integer _method_executor_pool_size;
// _queue_max_item is not set for all queues hence its not a final variable
Integer _queue_max_item;
DistributedQueue<ControlRequest> _queue;
ScheduledThreadPoolExecutor _methodPoolExecutor;
DispatcherQueue(QueueName name, Integer poolSize, Integer maxItem) {
_queue_name = name;
_method_executor_pool_size = poolSize;
_queue_max_item = maxItem;
}
DispatcherQueue(QueueName name, Integer poolSize) {
_queue_name = name;
_method_executor_pool_size = poolSize;
}
public DistributedQueue<ControlRequest> getQueue() {
return _queue;
}
public void setQueue(DistributedQueue<ControlRequest> _queue) {
this._queue = _queue;
}
public QueueName getQueueName() {
return _queue_name;
}
public Integer getMethodExecutorPoolSize() {
return _method_executor_pool_size;
}
public Integer getQueueMaxItem() {
return _queue_max_item;
}
public ScheduledThreadPoolExecutor getMethodPoolExecutor() {
return _methodPoolExecutor;
}
public void setMethodPoolExecutor(ScheduledThreadPoolExecutor executor) {
_methodPoolExecutor = executor;
}
}
DispatcherQueue[] _queues = {
new DispatcherQueue(QueueName.controller, DEFAULT_METHOD_EXECUTOR_POOL_SIZE, DEFAULT_CONTROLLER_MAX_ITEM),
new DispatcherQueue(QueueName.workflow_outer, DEFAULT_METHOD_EXECUTOR_POOL_SIZE / 5),
new DispatcherQueue(QueueName.workflow_inner, DEFAULT_METHOD_EXECUTOR_POOL_SIZE)
};
// Methods to return the queues or a specific queue
private DispatcherQueue[] getQueues() {
return _queues;
}
private DispatcherQueue getDefaultQueue() {
return _queues[0];
}
private DispatcherQueue getQueue(QueueName name) {
for (DispatcherQueue q : getQueues()) {
if (q.getQueueName() == name) {
return q;
}
}
return getDefaultQueue();
}
private DispatcherQueue getQueue(String queueName) {
if (queueName == null) {
return getDefaultQueue();
}
return getQueue(QueueName.valueOf(queueName));
}
private CoordinatorClient _coordinator;
private Map<String, Controller> _controller;
private Map<Controller, Map<String, Method>> _methodMap;
private Map<String, Integer> _deviceMaxConnectionMap;
private final ConcurrentMap<URI, DistributedSemaphore> _deviceSemaphoreMap = new ConcurrentHashMap<URI, DistributedSemaphore>();
private int _acquireLeaseWaitTimeSeconds = ACQUIRE_LEASE_WAIT_TIME_SECONDS;
private int _acquireLeaseRetryWaitTimeSeconds = ACQUIRE_LEASE_RETRY_WAIT_TIME__SECONDS;
private DistributedLockQueueManager<ControlRequest> _lockQueueManager;
/**
* This method implements the logic for acquiring a device-specific semaphore.
* To get a semaphore, the device-specific configuration must include setting up maxConnections
* in _deviceMaxConnectionMap.
*
* @param info Light wrapper of needed device properties (e.g., URI, DeviceType)
* @return DistributedSemaphore instance, if maxConnections exists.
* null, otherwise.
*/
private DistributedSemaphore getSemaphore(DeviceInfo info) {
if (!info.getNeedsLock()) {
return null;
}
DistributedSemaphore deviceSemaphore = _deviceSemaphoreMap.get(info.getURI());
if (deviceSemaphore == null && _deviceMaxConnectionMap != null) {
Integer maxConnections = _deviceMaxConnectionMap.get(info.getType());
if (maxConnections != null) {
synchronized (this) {
try {
deviceSemaphore = _deviceSemaphoreMap.get(info.getURI());
if (deviceSemaphore == null) {
deviceSemaphore = _coordinator.getSemaphore(info.getURI().toString(), maxConnections.intValue());
}
_deviceSemaphoreMap.put(info.getURI(), deviceSemaphore);
} catch (Exception e) {
_log.error("Error getting deviceSemaphore for device: {}", info.getURI().toString(), e);
}
}
}
}
return deviceSemaphore;
}
/**
* A Runnable to enable periodic retries, in the event that the semaphore's leases are exhausted.
* The item is processed and removed from the distributed queue when:
* a) A semaphore cannot be acquired
* b) A semaphore is acquired, and a lease is acquired
*/
private class DeviceMethodInvoker implements Runnable {
private final ControlRequest _item;
private final DispatcherQueue _queue;
private final Controller _innerController;
private final Method _method;
private final DistributedQueueItemProcessedCallback _callback;
private final DistributedSemaphore _deviceSemaphore;
private final Object[] _args;
public DeviceMethodInvoker(ControlRequest item,
DistributedQueueItemProcessedCallback callback) throws DeviceControllerException {
_item = item;
_queue = getQueue(item.getQueueName());
final String targetClassName = item.getTargetClassName();
_innerController = _controller.get(targetClassName);
if (_innerController == null) {
_log.info("Failed Getting target: " + targetClassName);
throw DeviceControllerException.exceptions.unableToDispatchToController(targetClassName);
}
_method = _methodMap.get(_innerController).get(item.getMethodName());
_args = item.getArg();
_deviceSemaphore = getSemaphore(item.getDeviceInfo());
_callback = callback;
}
@Override
public void run() {
Lease lease = null;
boolean bRetryLease = false;
boolean bInvocationProblem = false;
boolean bRetryLock = false;
boolean isStale = false;
try {
// Reset the thread name temporarily so that the log lines don't
// reference a thread name that may have already completed its work.
// Any log lines above this line will have a thread name that may
// reference work that may have already been completed.
String defaultName = String.format("%s-thread-%d", Thread.currentThread().getThreadGroup().getName(), Thread
.currentThread().getId());
Thread.currentThread().setName(defaultName);
_log.info("Invoking {}: {}", _method.getName(), _args);
String opId = "";
URI resourceId = new URI("");
if (_args.length > 1) {
if (_args.length > 2
&& _args[_args.length - 2] != null
&& _args[_args.length - 2].getClass().equals(URI.class)) {
resourceId = (URI) _args[_args.length - 2];
}
opId = (String) _args[_args.length - 1];
List<String> stringArgs = new ArrayList<String>();
StringBuilder threadNameBuilder = new StringBuilder();
threadNameBuilder.append(Thread.currentThread().getId()).append('|').
append(_method.getName()).append('|');
for (Object arg : _args) {
if (arg instanceof String) {
stringArgs.add((String) arg);
}
}
threadNameBuilder.append(Joiner.on('|').join(stringArgs));
Thread.currentThread().setName(threadNameBuilder.toString());
}
ControllerUtils.setThreadLocalLogData(resourceId, opId);
long now = System.currentTimeMillis();
long timeSinceItemCreation = now - _item.getTimestamp();
if (timeSinceItemCreation < STALE_ITEM_THRESHOLD) {
if (_deviceSemaphore == null) {
// this device did not specify maxConnections.
_log.info("Dispatching task {}: {}", _method.getName(), _args);
_method.invoke(_innerController, _args);
} else {
lease = _deviceSemaphore.acquireLease(_acquireLeaseWaitTimeSeconds, TimeUnit.SECONDS);
if (lease != null) {
_log.info("Dispatching task {}: {}", _method.getName(), _args);
_method.invoke(_innerController, _args);
} else {
// Could not get a lease. Retry.
_log.info("Rescheduling task {}: {}", _method.getName(), _args);
_queue.getMethodPoolExecutor().schedule(this, _acquireLeaseRetryWaitTimeSeconds, TimeUnit.SECONDS);
bRetryLease = true;
}
}
} else {
_log.info(String.format("Task %s is stale and will not be executed. Timestamp for request was %d (%s), now = %d",
_method.getName(), _item.getTimestamp(), new Date(_item.getTimestamp()).toString(), now));
isStale = true;
}
} catch (InvocationTargetException e) {
Throwable cause = e.getCause();
if (cause instanceof LockRetryException) {
LockRetryException lockEx = (LockRetryException) cause;
_item.setLockGroup(lockEx.getLockIdentifier());
if (!addRequestToLockQueue(lockEx, _item)) {
_log.warn("Rescheduling task {}: {}", _method.getName(), _args);
_queue.getMethodPoolExecutor().schedule(this, LOCK_RETRY_WAIT_TIME_SECONDS, TimeUnit.SECONDS);
bRetryLock = true;
}
} else {
_log.warn("Problem executing task: " + _method.getName() + "; {}", _args, e);
bInvocationProblem = true;
}
} catch (Exception e) {
_log.warn("Problem executing task: " + _method.getName() + "; {}", _args, e);
bInvocationProblem = true;
} finally {
try {
if (_deviceSemaphore != null && lease != null) {
_deviceSemaphore.returnLease(lease);
}
if ((!bRetryLease && !bInvocationProblem && !bRetryLock) || isStale) {
// The method was invoked. Cleanup.
_callback.itemProcessed();
_log.info("Done with task {}: {}", _method.getName(), _args);
}
} catch (Exception e) {
_log.warn("Problem removing task from queue: " + _method.getName() + ", {}", _args, e);
}
}
}
}
/**
* Attempt to push an item onto the lock queue.
*
* @param lockEx LockRetryException instance
* @param item Item to queue
* @return true if the lock is not available and queueing was successful
*/
private boolean addRequestToLockQueue(final LockRetryException lockEx, final ControlRequest item) {
try {
_log.info(String.format("Dispatcher processing LockRetryException key %s remaining time %s",
lockEx.getLockPath(), lockEx.getRemainingWaitTimeSeconds()));
DistributedAroundHook<Boolean> aroundHook = _coordinator.getDistributedOwnerLockAroundHook();
Boolean result = aroundHook.run(new DistributedAroundHook.Action<Boolean>() {
@Override
public Boolean run() {
// Before this method runs, the globalLock will be acquired
try {
return !_coordinator.isDistributedOwnerLockAvailable(lockEx.getLockPath()) &&
_lockQueueManager.queue(lockEx.getLockIdentifier(), item);
} catch (Exception e) {
return false;
}
// After this method runs, the globalLock will be released
}
});
return result.booleanValue();
} catch (Exception e) {
return false;
}
}
/**
* Sets coordinator
*
* @param coordinator
*/
public void setCoordinator(CoordinatorClient coordinator) {
_coordinator = coordinator;
}
/**
* Sets device specific controller implementations
*
* @param controller
*/
public void setController(Set<Controller> controller) {
_controller = new HashMap<String, Controller>();
_methodMap = new HashMap<Controller, Map<String, Method>>();
Iterator<Controller> it = controller.iterator();
while (it.hasNext()) {
Controller c = it.next();
_controller.put(c.getClass().getName(), c);
Map<String, Method> methodMap = new HashMap<String, Method>();
Method[] methods = c.getClass().getMethods();
for (int i = 0; i < methods.length; i++) {
methodMap.put(methods[i].getName(), methods[i]);
}
_methodMap.put(c, methodMap);
}
}
/**
* Sets _deviceMaxConnectionMap <DeviceType, MaxConnections>
*
* @param deviceMaxConnectionMap
*/
public void setDeviceMaxConnectionMap(Map<String, Integer> deviceMaxConnectionMap) {
_deviceMaxConnectionMap = deviceMaxConnectionMap;
}
/**
* Sets _methodExecutorPoolSize, if configured.
*
* @param corePoolSize Specified size of the _methodExecutorPool
*/
public void setMethodExecutorPoolSize(int corePoolSize) {
getDefaultQueue().getMethodPoolExecutor().setMaximumPoolSize(corePoolSize);
}
/**
* Sets _acquireLeaseWaitTimeSeconds if configured.
*
* @param waitTime Specified blocking wait time on acquireLease requests
*/
public void setAcquireLeaseWaitTimeSeconds(int waitTime) {
_acquireLeaseWaitTimeSeconds = waitTime;
}
/**
* Sets _acquireLeaseRetryWaitTimeSeconds if configured.
*
* @param retryWaitTime Specified retry wait time after limited-blocking acquireLease call returns no lease.
*/
public void setAcquireLeaseRetryWaitTimeSeconds(int retryWaitTime) {
_acquireLeaseRetryWaitTimeSeconds = retryWaitTime;
}
/**
* Creates _methodPoolExecutor for each Queue
*/
public void build() {
for (DispatcherQueue q : getQueues()) {
q.setMethodPoolExecutor(
new ScheduledThreadPoolExecutor(q.getMethodExecutorPoolSize()) {
@Override
protected void afterExecute(Runnable r, Throwable t) {
// After executing the runnable, clear the thread local log data
// that was set by the provisioning method invoked by the runnable.
ControllerUtils.clearThreadLocalLogData();
}
}
);
}
}
/**
* Queues a task to the default "controller" queue. This is the original behavior.
*
* @param deviceURI
* @param deviceType
* @param target
* @param method
* @param args
* @throws ControllerException
*/
public void queue(final URI deviceURI, final String deviceType, Object target, String method,
Object... args) throws ControllerException {
queue(getDefaultQueue().getQueueName(), deviceURI, deviceType, true, target, method, args);
}
/**
* Queues a method call against device specific controller
*
* @param queueName of enum QueueName identifies the Dispatcher queue to be
* used
* @param deviceURI
* @param deviceType
* @param target
* @param method
* @param args
*
* @throws ControllerException
*/
public void queue(final QueueName queueName, final URI deviceURI, final String deviceType,
Object target, String method, Object... args) throws ControllerException {
queue(queueName, deviceURI, deviceType, true, target, method, args);
}
/**
* Queues a method call against device specific controller
*
* @param queueName of enum QueueName identifies the Dispatcher queue to be
* used
* @param deviceURI
* @param deviceType
* @param lockDevice indicates whether a semaphore should be acquired for the device
* @param target
* @param method
* @param args
*
* @throws ControllerException
*/
public void queue(final QueueName queueName, final URI deviceURI, final String deviceType, boolean lockDevice,
Object target, String method, Object... args) throws ControllerException {
ControlRequest req = new ControlRequest(queueName.name(),
new DeviceInfo(deviceURI, deviceType, lockDevice), target, method, args);
try {
if (QueueName.controller.equals(queueName)) {
checkZkStepToWorkflowSize();
}
getQueue(queueName).getQueue().put(req);
} catch (final CoordinatorException e) {
throw ClientControllerException.retryables.queueToBusy();
} catch (final ClientControllerException e) {
throw ClientControllerException.retryables.queueToBusy();
} catch (final KeeperException e) {
_log.error("Exception occurred while queueing item", e);
throw ClientControllerException.fatals.unableToQueueJob(deviceURI);
} catch (final Exception e) {
throw ClientControllerException.fatals.unableToQueueJob(deviceURI, e);
}
_log.info("Queued task {}: {} ", method, args);
}
/**
* Place back onto the Dispatcher an existing ControlRequest instance that would
* have been held in a lock queue.
*
* @See {@link DistributedLockQueueManager}
*
* @param item An existing ControlRequest.
* @throws Exception
*/
public void queue(ControlRequest item) throws Exception {
try {
if (QueueName.controller.toString().equalsIgnoreCase(item.getQueueName())) {
checkZkStepToWorkflowSize();
}
getQueue(item.getQueueName()).getQueue().put(item);
} catch (final CoordinatorException e) {
throw ClientControllerException.retryables.queueToBusy();
} catch (final ClientControllerException e) {
throw ClientControllerException.retryables.queueToBusy();
} catch (final KeeperException e) {
throw ClientControllerException.fatals.unableToQueueJob(item.getDeviceInfo().getURI());
} catch (final Exception e) {
throw ClientControllerException.fatals.unableToQueueJob(item.getDeviceInfo().getURI(), e);
}
_log.info("Queued existing task {}: {} ", item.getMethodName(), item.getArg());
}
/**
* This method checks the size of the total number of steps across all the running
* workflows in zoo keeper if it reaches the default limit then it throws
* QueueTooBusyException
*
* @throws Exception
*/
private void checkZkStepToWorkflowSize() throws Exception {
int zkStep2WorkflowSize = WorkflowService.getZkStep2WorkflowSize();
if (zkStep2WorkflowSize > MAX_WORKFLOW_STEPS) {
_log.error("Queue is too busy. More than " + MAX_WORKFLOW_STEPS + " zookeeper step2workflow found.");
throw ClientControllerException.retryables.queueToBusy();
}
}
/**
* Starts dispatcher
*
* @throws Exception
*/
public void start() throws Exception {
_coordinator.start();
// todo make max threads configurable
// _queue = _coordinator.getQueue(QUEUE_NAME, this, new ControlRequestSerializer(), DEFAULT_MAX_THREADS);
for (DispatcherQueue q : getQueues()) {
if (q.getQueueMaxItem() != null) {
q.setQueue(_coordinator.getQueue(q.getQueueName().name(), this,
new ControlRequestSerializer(), DEFAULT_MAX_THREADS,
q.getQueueMaxItem()));
} else {
q.setQueue(_coordinator.getQueue(q.getQueueName().name(), this,
new ControlRequestSerializer(), DEFAULT_MAX_THREADS));
}
}
}
/**
* Stops dispatcher
*
* @throws IOException
*/
public void stop() throws IOException {
// todo make wait time configurable
for (DispatcherQueue q : getQueues()) {
q.getQueue().stop(DEFAULT_MAX_WAIT_STOP);
}
}
@Override
public void consumeItem(ControlRequest item, DistributedQueueItemProcessedCallback callback) throws Exception {
DispatcherQueue queue = getQueue(item.getQueueName());
queue.getMethodPoolExecutor().execute(new DeviceMethodInvoker(item, callback));
}
/**
* Container class from device properties that we like to give to ControlRequest
*/
public static class DeviceInfo implements Serializable {
private URI uri;
private String type;
private boolean needsLock;
public DeviceInfo() {
}
public DeviceInfo(URI deviceURI, String deviceType, boolean lockDevice) {
uri = deviceURI;
type = deviceType;
needsLock = lockDevice;
}
public void setURI(URI deviceURI) {
uri = deviceURI;
}
public void setType(String deviceType) {
type = deviceType;
}
public URI getURI() {
return uri;
}
public String getType() {
return type;
}
/**
* @return the lock
*/
public boolean getNeedsLock() {
return needsLock;
}
/**
* @param lock the lock to set
*/
public void setNeedsLock(boolean lock) {
this.needsLock = lock;
}
}
public Map<String, Controller> getControllerMap() {
return _controller;
}
@Override
public boolean isBusy(String queue) {
// For the provioning operations, basically all the nodes (with large thread pool)
// would get similar load naturally. More nodes and longer running, more evenly.
// If Dispatcher needs better load balance, it could enhance it from here.
return false;
}
public void setLockQueueManager(DistributedLockQueueManager lockQueueManager) {
_lockQueueManager = lockQueueManager;
}
}