/*
* ALMA - Atacama Large Millimeter Array
* (c) European Southern Observatory, 2005
* Copyright by ESO (in the framework of the ALMA collaboration),
* All rights reserved
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package alma.ACS.MasterComponentImpl;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.logging.Logger;
import alma.ACS.ACSComponent;
import alma.ACS.ACSComponentOperations;
import alma.ACS.ComponentStates;
import alma.ACS.PingableResourceOperations;
import alma.acs.logging.AcsLogLevel;
/**
* Monitor for any kind of resource whose state a subsystem master component
* wants to observe.
* <p>
* This class is used by <code>MasterComponentImplBase</code>
* and should not be exposed to subsystem developers directly.
* However, the contained interfaces {@link SubsysResourceMonitor.ResourceChecker}
* and {@link SubsysResourceMonitor.ResourceErrorHandler} can be implemented
* by master components to supply customized checkers and error handlers
* for various resource types and error recovery strategies.
*
* @author hsommer
* @since ACS 6.0
*/
public class SubsysResourceMonitor {
/**
 * The default delay, in seconds, between two monitoring calls to the same resource.
 * It is fixed at construction time, where values below 1 are raised to 1.
 */
public final int defaultDelaySeconds;
/** Executor that periodically runs the registered {@link ResourceCheckRunner} jobs. */
private final ScheduledThreadPoolExecutor scheduler;
/** Source of the randomized initial delay used when a resource gets registered. */
private final Random random;
private final Logger logger;
/** Thread pool that is handed to every {@link ResourceCheckRunner} for the actual resource calls. */
private final ExecutorService monitorCallThreadPool;
/** The runners of all currently monitored resources. Modifications are done while holding this set's own monitor. */
private final Set<ResourceCheckRunner> resourceRunners;
/**
 * Set by <code>destroy</code>, which holds the lock on <code>this</code>, but read by
 * <code>monitorResource</code> without that lock; therefore volatile, so that a registering
 * thread reliably sees that shutdown has begun.
 */
private volatile boolean isShuttingDown;
/** Only accessed from the constructor and from the synchronized <code>destroy</code> method. */
private boolean isShutDown;
/**
 * Convenience constructor that uses a default delay of 10 seconds between monitoring calls.
 * @param logger Logger to be used by this object
 * @param threadFactory factory for all threads used for scheduling and for calling the resources
 * @see #SubsysResourceMonitor(Logger, ThreadFactory, int)
 */
SubsysResourceMonitor(Logger logger, ThreadFactory threadFactory) {
    // 10 seconds is the documented default delay
    this(logger, threadFactory, 10);
}
/**
 * Creates the monitor together with its scheduler and its thread pool for the resource calls.
 * No resource is monitored until one of the <code>monitorResource</code> methods gets called.
 *
 * @param logger Logger to be used by this object
 * @param threadFactory all threads for scheduling and calling the resources will be created by this factory.
 * @param defaultDelaySeconds the default delay between finishing one monitoring call and starting the next call to the same resource.
 *        Values below 1 are replaced by 1. This default can be overridden in method
 *        {@link #monitorResource(alma.ACS.MasterComponentImpl.SubsysResourceMonitor.ResourceChecker, alma.ACS.MasterComponentImpl.SubsysResourceMonitor.ResourceErrorHandler, int)}.
 */
SubsysResourceMonitor(Logger logger, ThreadFactory threadFactory, int defaultDelaySeconds) {
    this.logger = logger;
    // enforce the lower limit of 1 second
    this.defaultDelaySeconds = (defaultDelaySeconds < 1) ? 1 : defaultDelaySeconds;

    // thread pool for the resource calls first, then the single-threaded scheduler
    this.monitorCallThreadPool = Executors.newCachedThreadPool(threadFactory);
    this.scheduler = new ScheduledThreadPoolExecutor(1, threadFactory);

    this.resourceRunners = new HashSet<ResourceCheckRunner>();
    this.random = new Random(System.currentTimeMillis());

    this.isShuttingDown = false;
    this.isShutDown = false;
}
/**
 * Variant of {@link #monitorResource(alma.ACS.MasterComponentImpl.SubsysResourceMonitor.ResourceChecker, alma.ACS.MasterComponentImpl.SubsysResourceMonitor.ResourceErrorHandler, int)}
 * that applies {@link #defaultDelaySeconds} instead of taking a delay parameter.
 * @param checker gives access to the resource and knows how to check it
 * @param err the handler to be notified about problems with the resource
 */
<T> void monitorResource(ResourceChecker<T> checker, ResourceErrorHandler<T> err) {
    this.monitorResource(checker, err, this.defaultDelaySeconds);
}
/**
 * Starts to periodically monitor the resource that <code>checker</code> contains.
 * The delay between two checks is given by the parameter <code>delaySeconds</code>.
 * <p>
 * In order to randomize check times even if many resources get signed up for monitoring one after the other,
 * the initial delay before the first check is run is taken randomly between 1 second and the delay time (both inclusive).
 * <p>
 * If the same resource object is already being monitored, the old monitoring job gets cancelled
 * and is replaced by the new one.
 *
 * @param checker gives access to the resource and knows how to check it
 * @param err the handler to be notified about problems with the resource
 * @param delaySeconds determines the delay between ending a check call and starting the next one. If &lt;1, then the default is used.
 * @throws IllegalStateException if {@link #destroy(long, TimeUnit)} has been called.
 * @throws IllegalArgumentException if any of the arguments are <code>null</code>,
 *         or if <code>checker.getResource()</code> or <code>checker.getResourceName()</code> returns <code>null</code>.
 */
<T> void monitorResource(ResourceChecker<T> checker, ResourceErrorHandler<T> err, int delaySeconds) {
    if (isShuttingDown) {
        throw new IllegalStateException("Resource monitor is already destroyed.");
    }
    if (checker == null || checker.getResource() == null || checker.getResourceName() == null) {
        throw new IllegalArgumentException("ResourceChecker must be non-null and must deliver non-null resource and resource name.");
    }
    if (err == null) {
        throw new IllegalArgumentException("ResourceErrorHandler must not be null");
    }
    if (delaySeconds < 1) {
        delaySeconds = defaultDelaySeconds;
    }
    synchronized (resourceRunners) {
        // The already registered runners may monitor resources of any type, thus the wildcard instead of <T>.
        for (ResourceCheckRunner<?> otherRunner : resourceRunners) {
            ResourceChecker<?> otherChecker = otherRunner.getResourceChecker();
            Object otherResource = otherChecker.getResource();
            String otherResourceName = otherChecker.getResourceName();
            // @TODO: enforce that no 2 resources can have the same name (important for #stopMonitoring)
            if (otherResource == checker.getResource()) {
                String msg = "Resource '" + checker.getResourceName() + "' is already being monitored. ";
                if (!otherResourceName.equals(checker.getResourceName())) {
                    msg += "However it was known under the different name '" + otherResourceName + "'! ";
                }
                msg += "Will re-schedule the monitoring now.";
                logger.info(msg);
                Future<?> future = otherRunner.getScheduleFuture();
                future.cancel(true);
                // Removing inside the for-each loop is safe only because we leave the loop right away.
                resourceRunners.remove(otherRunner);
                break;
            }
        }
        SubsysResourceMonitor.ResourceCheckRunner<T> checkRunner = new SubsysResourceMonitor.ResourceCheckRunner<T>(checker, err, logger, monitorCallThreadPool);
        // nextInt(n) yields 0..n-1, so we add 1 to get the documented range 1..delaySeconds.
        // A non-zero initial delay also gives us time to hand the schedule future to the runner before its first run.
        int initialDelaySeconds = 1 + random.nextInt(delaySeconds);
        Future<?> future = scheduler.scheduleWithFixedDelay(checkRunner, initialDelaySeconds, delaySeconds, TimeUnit.SECONDS);
        checkRunner.setScheduleFuture(future);
        resourceRunners.add(checkRunner);
    }
}
/**
 * For testing only.
 * @return the scheduler's number of actively executing threads,
 *         plus the number of tasks waiting in the scheduler's queue.
 */
public int getNumberOfMonitorTasks() {
    final int activeThreads = scheduler.getActiveCount();
    final int queuedTasks = scheduler.getQueue().size();
    return activeThreads + queuedTasks;
}
/**
 * For testing only!
 * <p>
 * Gets the <code>ResourceCheckRunner</code> that is used for running the monitor checks
 * that use the given <code>ResourceChecker</code>. The checker is matched by object identity.
 * @param checker the checker whose runner is wanted
 * @return the corresponding <code>ResourceCheckRunner</code>, or <code>null</code> if no runner matches.
 */
SubsysResourceMonitor.ResourceCheckRunner getResourceCheckRunner(ResourceChecker checker) {
    // Same lock as used by the methods that add or remove runners,
    // so that we never iterate while the set is being modified.
    synchronized (resourceRunners) {
        for (ResourceCheckRunner runner : resourceRunners) {
            if (runner.getResourceChecker() == checker) {
                return runner;
            }
        }
    }
    return null;
}
/**
 * Suspends monitoring of all resources until {@link #resume()} is called.
 * Currently running monitor calls do not get stopped, only future calls are prevented.
 * <p>
 * Note that the monitoring queue remains intact, while the monitor call itself becomes a no-op.
 */
public void suspend() {
    // Same lock as used by the methods that add or remove runners,
    // so that we never iterate while the set is being modified.
    synchronized (resourceRunners) {
        for (ResourceCheckRunner runner : resourceRunners) {
            runner.suspend();
        }
    }
}
/**
 * Resumes monitoring of all resources.
 * <p>
 * This method has no effect if monitoring has not been previously suspended.
 * @see #suspend()
 */
public void resume() {
    // Same lock as used by the methods that add or remove runners,
    // so that we never iterate while the set is being modified.
    synchronized (resourceRunners) {
        for (ResourceCheckRunner runner : resourceRunners) {
            runner.resume();
        }
    }
}
/**
 * Stops monitoring a given resource.
 * All runners whose checker reports the given name are suspended, cancelled and removed.
 * @param resourceName unique resource name
 */
public void stopMonitoring(String resourceName) {
    synchronized (resourceRunners) {
        // We must remove through the iterator: calling resourceRunners.remove(..) from inside a for-each loop
        // makes the next iteration step fail with a ConcurrentModificationException.
        for (java.util.Iterator<ResourceCheckRunner> runnerIter = resourceRunners.iterator(); runnerIter.hasNext();) {
            ResourceCheckRunner runner = runnerIter.next();
            if (resourceName.equals(runner.getResourceChecker().getResourceName())) {
                runner.suspend(); // to invalidate an ongoing monitor call
                runner.getScheduleFuture().cancel(false);
                runnerIter.remove();
                // don't break here, just in case we have more than 1 resource of that name (which of course should not happen)
            }
        }
    }
}
/**
 * Stops monitoring all resources: every runner gets suspended and its scheduled job cancelled,
 * and afterwards the set of runners is emptied.
 */
public void stopMonitoringAll() {
    synchronized (resourceRunners) {
        for (ResourceCheckRunner currentRunner : resourceRunners) {
            // suspending invalidates a monitor call that may be running right now
            currentRunner.suspend();
            Future<?> scheduledJob = currentRunner.getScheduleFuture();
            scheduledJob.cancel(false);
        }
        resourceRunners.clear();
    }
}
/**
 * Cancels monitoring of all resources and leaves this object in an unusable state.
 * <p>
 * Impl note: this method is synchronized so that a second call can return immediately, but only if the first call has finished
 * and has found both internal executors terminated. Otherwise a later call will try again to shut them down.
 * @param timeout the timeout for waiting that the internal threads, queues, jobs etc are freed, or <code>0</code> to not wait at all.
 *        One half of the timeout is used for each of the two internal executors.
 * @param unit the time unit of <code>timeout</code>
 * @throws InterruptedException if the calling thread gets interrupted while waiting for the executors to terminate.
 */
synchronized void destroy(long timeout, TimeUnit unit) throws InterruptedException {
    isShuttingDown = true;
    if (isShutDown) {
        return;
    }
    monitorCallThreadPool.shutdownNow();
    scheduler.shutdownNow();
    if (timeout > 0) {
        long timeoutMillis = unit.toMillis(timeout); // to safely divide by 2 in case it was given as 1 second
        monitorCallThreadPool.awaitTermination(timeoutMillis/2, TimeUnit.MILLISECONDS);
        scheduler.awaitTermination(timeoutMillis/2, TimeUnit.MILLISECONDS);
    }
    // Record the outcome regardless of whether we waited or not, and take both executors into account.
    // Only then a later call may return immediately.
    isShutDown = monitorCallThreadPool.isTerminated() && scheduler.isTerminated();
}
/**
 * The <code>Runnable</code> used for the scheduling queue of <code>SubsysResourceMonitor</code>.
 * <p>
 * Every invocation of {@link #run()} performs one check of the resource: the call to
 * {@link ResourceChecker#checkState()} is executed in a thread from the given thread pool,
 * while the thread that invoked <code>run()</code> waits for the result, with a timeout.
 * Problems are reported to the {@link ResourceErrorHandler}.
 */
static class ResourceCheckRunner<T> implements Runnable {
    /** Maximum time that {@link #run()} waits for the check call to return. */
    private volatile int callTimeoutSeconds = 60;
    private final ResourceChecker<T> resourceChecker;
    private final ResourceErrorHandler<T> err;
    private final Logger logger;
    private final ExecutorService threadPool;
    /** Volatile because it is set by the thread that registers the resource, and read by the thread that executes run(). */
    private volatile Future<?> scheduleFuture;
    private volatile boolean isSuspended;
    /** Result of the previous check; used to detect the recovery of the resource. */
    private volatile boolean lastCheckSucceeded;

    /**
     * @param resourceChecker gives access to the resource and knows how to check it
     * @param err handler that gets notified about bad states, unreachability, and recovery
     * @param logger logger to be used by this object
     * @param threadPool pool whose threads will execute the calls to <code>resourceChecker.checkState()</code>
     */
    ResourceCheckRunner(ResourceChecker<T> resourceChecker, ResourceErrorHandler<T> err, Logger logger, ExecutorService threadPool) {
        this.resourceChecker = resourceChecker;
        this.err = err;
        this.logger = logger;
        this.threadPool = threadPool;
        isSuspended = false;
        lastCheckSucceeded = true;
    }

    /**
     * Sets the future that was obtained from the scheduler when starting the monitoring job. The future object can
     * be used to cancel the execution of this check runner. Unfortunately this object is not yet available at
     * construction time, that's why we have this separate setter method.
     */
    void setScheduleFuture(Future<?> scheduleFuture) {
        this.scheduleFuture = scheduleFuture;
    }

    Future<?> getScheduleFuture() {
        return scheduleFuture;
    }

    /**
     * To be called from run()
     */
    private void notifyRecovery() {
        err.resourceRecovered(resourceChecker.getResource());
    }

    /**
     * Performs one check of the resource, unless this runner is suspended.
     * <ul>
     * <li>A bad state that gets returned in time is reported through {@link ResourceErrorHandler#badState(Object, String)}.
     * <li>A timeout, a CORBA TRANSIENT exception, or any other failure of the check call
     *     is reported through {@link ResourceErrorHandler#resourceUnreachable(Object)}.
     *     If the handler returns <code>true</code>, then the scheduled job gets cancelled.
     * <li>A good state after a failed check is reported through {@link ResourceErrorHandler#resourceRecovered(Object)}.
     * </ul>
     */
    public void run() {
        if (isSuspended) {
            return;
        }

        // run the check in a thread from the thread pool
        class CheckStateCallerWithTimeout implements Runnable {
            private volatile boolean timeout = false;

            public void run() {
                String badState = resourceChecker.checkState();
                if (badState != null) {
                    if (!timeout) { // we don't want to report a bad state after a timeout, since the timeout has already been reported
                        lastCheckSucceeded = false;
                        try {
                            err.badState(resourceChecker.getResource(), badState);
                        } catch (Exception e) {
                            logger.log(Level.WARNING, "Failed to propagate offending state of resource '" + resourceChecker.getResourceName() + "' to the error handler!", e);
                        }
                    }
                    // A bad state that arrives after the timeout is suppressed.
                    // It must not be taken for a recovery though, which is why it stays out of the branch below.
                }
                else if (!lastCheckSucceeded) {
                    // all is well, but previous check failed
                    notifyRecovery();
                    lastCheckSucceeded = true;
                }
            }

            void cancel() {
                // it's cleaner to let this thread die and simply ignore its results after a timeout
                timeout = true;
            }
        }

        CheckStateCallerWithTimeout checkStateCallerWithTimeout = new CheckStateCallerWithTimeout();
        Future<?> future = threadPool.submit(checkStateCallerWithTimeout);
        long timeBeforeCall = System.currentTimeMillis();
        Throwable callError = null;
        boolean wasTimedOut = false;
        String timedOutDescription = null; // introduced to debug http://jira.alma.cl/browse/AIV-5983
        try {
            future.get(callTimeoutSeconds, TimeUnit.SECONDS);
        } catch (TimeoutException e) {
            wasTimedOut = true;
            timedOutDescription = "TimeoutException after " + (System.currentTimeMillis() - timeBeforeCall) + " ms.";
        } catch (InterruptedException e) {
            long elapsedMillis = System.currentTimeMillis() - timeBeforeCall;
            // The elapsed time is in milliseconds, so the timeout in seconds must be converted before comparing.
            if (elapsedMillis >= TimeUnit.SECONDS.toMillis(callTimeoutSeconds)) {
                // most likely a timeout occurred.
                // TODO: check why we did not get a TimeoutException
                wasTimedOut = true;
                timedOutDescription = "InterruptedException after " + (System.currentTimeMillis() - timeBeforeCall) + " ms; interpreting as timeout.";
            }
            else {
                // some other strange InterruptedException.
                callError = e;
                timedOutDescription = "InterruptedException after " + (System.currentTimeMillis() - timeBeforeCall) + " ms.";
            }
            // TODO: check how CORBA timeout behaves, and whether a corba exception would be wrapped as an ExecutionException
        } catch (ExecutionException ex) {
            callError = ex.getCause();
            if (callError instanceof org.omg.CORBA.TRANSIENT) {
                // Corba failed to connect to the server, for example because a container process has disappeared
                wasTimedOut = true;
                timedOutDescription = "TRANSIENT after " + (System.currentTimeMillis() - timeBeforeCall) + " ms.";
            }
        } catch (Throwable thr) { // unexpected
            callError = thr;
        }
        finally {
            long currTime = System.currentTimeMillis();
            if ( !wasTimedOut && currTime - timeBeforeCall >= 10000) {
                // the measured duration is in milliseconds
                logger.log(Level.WARNING, "Too much time taken (" + (currTime - timeBeforeCall) + " ms), however we didn't time out (" + callTimeoutSeconds + " seconds) for resource '" + resourceChecker.getResourceName() + "'. ");
            }
            if (wasTimedOut ) {
                checkStateCallerWithTimeout.cancel(); // to suppress a possible later bad-state message
            }
        }

        // perhaps isSuspended was set while calling, so we check again
        if (isSuspended) {
            return;
        }

        // analyze result and react
        boolean beyondRepair = false;
        if (wasTimedOut) {
            lastCheckSucceeded = false;
            try {
                logger.log(AcsLogLevel.DEBUG, "About to call error handler " + err.getClass().getSimpleName() + "#resourceUnreachable(" + resourceChecker.getResourceName()
                        + "). Timeout detail: " + timedOutDescription);
                // notify the error handler
                // TODO: call in separate thread with timeout. Decide about value of "beyondRepair" if method 'resourceUnreachable' times out
                beyondRepair = err.resourceUnreachable(resourceChecker.getResource());
            } catch (Throwable thr) {
                logger.log(Level.WARNING, "Failed to propagate unavailability of resource '" + resourceChecker.getResourceName() + "' to the error handler!", thr);
            }
        }
        else if (callError != null) {
            // the asynchronous call "resourceChecker.checkState()" failed, but not because of a timeout.
            // This is not expected, and we must log the exception.
            lastCheckSucceeded = false;
            logger.log(Level.WARNING, "Failed to check the status of resource '" + resourceChecker.getResourceName() + "' because of an exception.", callError);
            try {
                // notify the error handler
                // @TODO: call in separate thread with timeout. Decide about value of "beyondRepair" if method 'resourceUnreachable' times out
                beyondRepair = err.resourceUnreachable(resourceChecker.getResource());
            } catch (Throwable thr) {
                logger.log(Level.WARNING, "Failed to propagate unavailability of resource '" + resourceChecker.getResourceName() + "' to the error handler!", thr);
            }
        }

        if (beyondRepair) {
            String msg = "Resource '" + resourceChecker.getResourceName() + "' appears permanently unreachable and will no longer be monitored.";
            logger.info(msg);
            Future<?> scheduledJob = scheduleFuture; // read the volatile field only once
            if (scheduledJob != null) {
                scheduledJob.cancel(true);
            }
            else {
                // this should never be necessary, but if so, it should also cancel the scheduled job
                throw new RuntimeException(msg);
            }
        }
    }

    /**
     * Gets the timeout value in seconds, which is used to abandon hanging resource checker tasks.
     */
    int getCallTimeoutSeconds() {
        return callTimeoutSeconds;
    }

    /**
     * For testing only.
     */
    void setCallTimeoutSeconds(int timeout) {
        callTimeoutSeconds = timeout;
    }

    ResourceChecker<T> getResourceChecker() {
        return resourceChecker;
    }

    /** Turns subsequent invocations of {@link #run()} into no-ops, until {@link #resume()} gets called. */
    void suspend() {
        isSuspended = true;
    }

    /** Undoes the effect of {@link #suspend()}. */
    void resume() {
        isSuspended = false;
    }
}
/**
 * Abstraction of a monitored resource. It hides the details of a particular kind of resource,
 * so that components, offshoots, databases and so on can all be monitored in the same way.
 * @param <T> The type of the resource object, for example an ACS component type
 * @see ComponentChecker
 */
public static interface ResourceChecker<T> {

    /**
     * Tries to connect to the monitored resource and checks its state if applicable.
     * The resource will be considered unavailable if this call does not return within a certain time.
     * @return name of an offending state or status if one is found, otherwise <code>null</code>.
     */
    String checkState();

    /**
     * @return the monitored resource object.
     */
    T getResource();

    /**
     * Returns a name that identifies the resource.
     * The name is used for log messages. It should be unique within a master component,
     * although currently no use is made of uniqueness.
     */
    String getResourceName();
}
/**
 * The <code>ResourceChecker</code> for ACS components. It calls
 * {@link ACSComponentOperations#componentState()} to determine responsiveness and state of the component resource.
 */
public static class ComponentChecker<T extends ACSComponent> implements ResourceChecker<T>
{
    private final T comp;

    /**
     * The component name, fetched once in the constructor. We keep it separately because later,
     * when there are problems, it may no longer be possible to obtain it remotely.
     */
    private String compName;

    /**
     * @param comp the component to be monitored; its name gets retrieved here.
     */
    ComponentChecker(T comp) {
        this.comp = comp;
        this.compName = comp.name(); // todo: timeout and exception
    }

    /**
     * @return <code>null</code> if the component is in state <code>COMPSTATE_OPERATIONAL</code>,
     *         otherwise the string representation of its current state.
     */
    public String checkState() {
        final ComponentStates currentState = comp.componentState();
        final boolean isOperational = ( currentState.value() == ComponentStates.COMPSTATE_OPERATIONAL.value() );
        return ( isOperational ? null : currentState.toString() );
    }

    public T getResource() {
        return comp;
    }

    public String getResourceName() {
        return compName;
    }
}
/**
 * A custom <code>ResourceChecker</code> for objects implementing
 * PingableResource interface.
 * <p>
 * @TODO: Since ACS 9.1 the ping() method has parameters, incl. one for recursion.
 * The current choice is fast=false, recursive=false, but maybe this should be adjusted.
 */
public static class PingableResourceChecker<T extends PingableResourceOperations> implements
        SubsysResourceMonitor.ResourceChecker<T> {

    private T resource;
    private String resourceName;

    /**
     * @param resource the pingable resource to be monitored
     * @param resourceName the name under which the resource will be known
     */
    public PingableResourceChecker(T resource, String resourceName) {
        this.resource = resource;
        this.resourceName = resourceName;
    }

    /**
     * Pings the resource.
     * @return <code>null</code> if the ping succeeded, otherwise an error message.
     */
    public String checkState() {
        if (resource.ping(false, false, -1)) {
            return null;
        }
        return "ping(false, false, -1) failed.";
    }

    public T getResource() {
        return resource;
    }

    public String getResourceName() {
        return resourceName;
    }
}
/**
 * Error handler that gets notified when a monitored resource becomes unavailable or degraded.
 * <p>
 * By implementing a custom error handler, a master component can attempt first to cure the situation, or go into
 * ERROR state by calling <code>doTransition(SubsystemStateEvent.SUBSYSEVENT_ERROR);</code>.
 */
public interface ResourceErrorHandler<T>
{
    /**
     * Called when the resource could not be reached at all because of a timeout or network/middleware communication errors.
     * The resource object is passed to allow using one handler for many resources.
     * <p>
     * The return value controls whether monitoring of this resource will be stopped:
     * <ol>
     * <li><code>true</code> means that the error handler decided that this resource is unreachable beyond repair,
     *     and that no further monitoring calls should be made. This can avoid potential problems with an increasing
     *     number of hanging calls and eventually stopping the respective threads.
     * <li><code>false</code> means that monitoring calls should continue.
     * </ol>
     */
    boolean resourceUnreachable(T resource);

    /**
     * Called when the monitored resource was found in a bad state, but still replied in time.
     *
     * @param resource
     *            The resource object is passed to allow using one handler for many resources.
     * @param stateName
     *            Name of the bad state the resource was found in. If the resource does not support named states,
     *            it may return any String that indicates the problem. For example, {@linkplain PingableResourceChecker}
     *            returns <code>"ping(false, false, -1) failed."</code> which is then used as the <code>stateName</code>.
     * @see ResourceChecker#checkState()
     */
    void badState(T resource, String stateName);

    /**
     * Notification that the monitored resource has recovered after a previous failure or timeout. This notification
     * can only work if monitoring has continued after the problem was detected, which is always the case for
     * <code>badState</code> problems, but depends on the return value of <code>resourceUnreachable</code> in case of
     * timeout problems.
     *
     * @since ACS 8.0.0 (has existed in sub-interface RecoverableResourceErrorHandler since 6.0.3)
     */
    void resourceRecovered(T resource);
}
}