/*
* RHQ Management Platform
* Copyright (C) 2005-2013 Red Hat, Inc.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
package org.rhq.core.pc.inventory;
import static org.rhq.core.domain.measurement.AvailabilityType.DOWN;
import static org.rhq.core.domain.measurement.AvailabilityType.UNKNOWN;
import static org.rhq.core.domain.measurement.AvailabilityType.UP;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.rhq.core.domain.measurement.Availability;
import org.rhq.core.domain.measurement.AvailabilityType;
import org.rhq.core.pluginapi.availability.AvailabilityFacet;
/**
 * Proxy class for executing availability checks. Checks are done using a
 * thread pool supplied by the {@link ResourceContainer}. If the resource availability does not
 * return within the synchronous timeout (one second by default), the next call to
 * {@link #getAvailability()} will return the calculated availability, if available.
 *
 * With the potential of having thousands, and even tens of thousands, of instances
 * of this proxy, we must ensure that we keep it as lean as possible to reduce
 * memory footprint of the agent. For example, we do not create a logger object for
 * every proxy. Instead, LOG is static. This should be OK for how this proxy is used.
 *
 * @author Elias Ross
 * @author Jay Shaughnessy
 * @author John Mazzitelli
 */
public class AvailabilityProxy implements AvailabilityFacet, Callable<AvailabilityType> {

    private static final Log LOG = LogFactory.getLog(AvailabilityProxy.class); // purposefully static, don't create one per proxy

    /**
     * How long to wait for a resource to return their availability *immediately* (in ms).
     * If a resource takes longer than this, then the number of timeouts is incremented, and then
     * the container will just assume availability will be returned asynchronously for this resource.
     */
    private static final int AVAIL_SYNC_TIMEOUT;

    /**
     * Number of consecutive avail sync timeouts before we assume the resource's avail checking can not meet the
     * sync timeout. At that point stop slowing things down waiting for the timeout and instead, for this resource,
     * rely only on the async results. In other words, stop trying to report live avail if live avail checking is
     * consistently too slow. Max = 127. We use a byte here to save space.
     */
    private static final byte AVAIL_SYNC_TIMEOUT_LIMIT;

    /**
     * How long to wait for an *async* future to return a resource availability (in ms).
     * If a resource takes longer than this during an async call (via a thread from the executor thread pool)
     * and another request comes in for the availability, then that async call will be canceled and a new
     * one will be resubmitted, restarting the clock. This just helps clean up any hung threads waiting
     * for an availability that is just taking too much time to complete.
     */
    private static final int AVAIL_ASYNC_TIMEOUT;

    static {
        // all of these are unlikely to be changed but are back-door configurable via system properties
        AVAIL_SYNC_TIMEOUT = readIntProperty("rhq.agent.plugins.availability-scan.sync-timeout", 1000);

        // Parsed as an int and clamped into [0, 127]. Parsing directly as a byte would reject values above 127
        // outright, silently falling back to the default instead of honoring the documented maximum.
        int syncAvailTimeoutLimit = readIntProperty("rhq.agent.plugins.availability-scan.sync-timeout-limit", 5);
        AVAIL_SYNC_TIMEOUT_LIMIT = (byte) Math.max(0, Math.min(syncAvailTimeoutLimit, Byte.MAX_VALUE));

        AVAIL_ASYNC_TIMEOUT = readIntProperty("rhq.agent.plugins.availability-scan.async-timeout", 60000);
    }

    /**
     * Reads an integer system property, falling back to the default if the property is unset,
     * not a valid integer, or cannot be read.
     *
     * @param name the system property name
     * @param defaultValue the value to use when the property is missing or unusable
     * @return the parsed property value or the default
     */
    private static int readIntProperty(String name, int defaultValue) {
        try {
            return Integer.parseInt(System.getProperty(name, String.valueOf(defaultValue)));
        } catch (RuntimeException e) {
            // NumberFormatException (bad value) or SecurityException (property not readable)
            return defaultValue;
        }
    }

    /** The outstanding asynchronous check, or null if the last check completed synchronously. */
    private Future<AvailabilityType> availabilityFuture = null;

    /** The thread that most recently started executing {@link #call()}; used only for diagnostics. */
    private volatile Thread current;

    /** Wall-clock time (ms) at which {@link #availabilityFuture} was last submitted. */
    private long lastSubmitTime = 0;

    private final ResourceContainer resourceContainer;

    /**
     * Number of consecutive avail sync timeouts for the resource. This value is reset if availability is
     * returned synchronously (within the timeout period), or when the reported availability changes to UP.
     * Once it reaches the sync timeout limit the resource no longer tries to report live avail until one of
     * those resets happens.
     */
    private byte availSyncConsecutiveTimeouts = 0;

    /**
     * Constructs a new proxy.
     *
     * @param resourceContainer the container whose resource component performs the actual availability check
     */
    public AvailabilityProxy(ResourceContainer resourceContainer) {
        this.resourceContainer = resourceContainer;
    }

    /**
     * Performs the actual availability check by invoking the resource component with the resource's
     * class loader installed as the thread context class loader.
     *
     * @return whatever the resource component's getAvailability() returns
     * @throws Exception if the resource component throws
     */
    @Override
    public AvailabilityType call() throws Exception {
        // Work with a local reference to the executing thread. The 'current' field may be overwritten by a
        // newer check running on a different pool thread (after this one was canceled but did not stop), so
        // it must not be used to save/restore the class loader.
        final Thread thread = Thread.currentThread();
        current = thread;
        ClassLoader originalContextClassLoader = thread.getContextClassLoader();
        try {
            thread.setContextClassLoader(this.resourceContainer.getResourceClassLoader());
            return this.resourceContainer.getResourceComponent().getAvailability();
        } finally {
            thread.setContextClassLoader(originalContextClassLoader);
        }
    }

    /**
     * Returns the current or most currently reported availability. If {@link AvailabilityType#UNKNOWN} is returned,
     * then the availability is being computed.
     * <p/>
     * This method is not designed to be called concurrently, so it is synchronized to ensure one avail check
     * completes before another is processed. This protects against live checks (or test code) interfering with
     * scheduled checks.
     *
     * @return the availability, or {@link AvailabilityType#UNKNOWN} if it is not yet known
     * @throws org.rhq.core.pc.inventory.TimeoutException
     *             if an async check exceeds the async timeout
     * @throws RuntimeException if the underlying availability check itself threw an exception
     */
    @Override
    synchronized public AvailabilityType getAvailability() {
        AvailabilityType avail = UNKNOWN;

        try {
            // If the avail check timed out, or if we are not attempting synchronous checks (due to
            // exceeding the consecutive timeout limit) then the future will exist.
            if (availabilityFuture != null) {
                if (availabilityFuture.isDone()) {
                    // hold onto and report the last known value if necessary
                    avail = processAvail(availabilityFuture.get());
                } else {
                    // We are still waiting on the previously submitted async avail check - let's just return
                    // the last one we got. Note that if the future is not done after a large amount of time,
                    // then it means this thread could somehow be hung or otherwise stuck and not returning. Not good.
                    // In this case, throw a detailed exception to the avail checker.
                    long elapsedTime = System.currentTimeMillis() - lastSubmitTime;
                    if (elapsedTime > getAsyncTimeout()) {
                        Throwable t = new Throwable();
                        // read the volatile field once so the null check and the use see the same thread
                        Thread running = current;
                        if (running != null) {
                            t.setStackTrace(running.getStackTrace());
                        }
                        String msg = "Availability check ran too long [" + elapsedTime + "ms], canceled for ["
                            + this.resourceContainer + "]; Stack trace includes the timed out thread's stack trace.";
                        availabilityFuture.cancel(true);

                        // try again, maybe the situation will resolve in time for the next check
                        availabilityFuture = this.resourceContainer.submitAvailabilityCheck(this);
                        lastSubmitTime = System.currentTimeMillis();

                        throw new TimeoutException(msg, t);
                    } else {
                        return getLastAvailabilityType();
                    }
                }
            }

            // request a thread to do an avail check
            availabilityFuture = this.resourceContainer.submitAvailabilityCheck(this);
            lastSubmitTime = System.currentTimeMillis();

            // if we have exceeded the timeout too many times in a row assume that this is a slow
            // resource and stop performing synchronous checks, which would likely fail to return fast enough anyway.
            if (availSyncConsecutiveTimeouts < getSyncTimeoutLimit()) {
                // attempt to get availability synchronously
                avail = processAvail(availabilityFuture.get(getSyncTimeout(), TimeUnit.MILLISECONDS));

                // success (failure will throw exception)
                availSyncConsecutiveTimeouts = 0;
                availabilityFuture = null;

            } else if (availSyncConsecutiveTimeouts == getSyncTimeoutLimit()) {
                // Log one time that we are disabling synchronous checks for this resource; bumping the counter
                // past the limit is what prevents logging again. Never increment past Byte.MAX_VALUE, otherwise
                // the counter would wrap to a negative value and synchronous checks would be re-enabled.
                if (availSyncConsecutiveTimeouts < Byte.MAX_VALUE) {
                    ++availSyncConsecutiveTimeouts;
                }
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Disabling synchronous availability collection for [" + resourceContainer + "]; ["
                        + getSyncTimeoutLimit() + "] consecutive timeouts exceeding [" + getSyncTimeout() + "ms]");
                }
            }
        } catch (InterruptedException e) {
            LOG.debug("InterruptedException; shut down is (likely) in progress.");
            availabilityFuture.cancel(true);
            availabilityFuture = null;
            Thread.currentThread().interrupt();
            return UNKNOWN;
        } catch (ExecutionException e) {
            availabilityFuture = null; // undefine, so in next run new (no longer failed) instance is scheduled
            // fall back to the wrapper itself in the unlikely event that no cause was recorded
            Throwable cause = (e.getCause() != null) ? e.getCause() : e;
            throw new RuntimeException("Availability check failed : " + cause.getMessage(), cause);
        } catch (java.util.concurrent.TimeoutException e) {
            // failed to get avail synchronously. next call to the future will return availability (we hope)
            // (only reachable while the counter is below the limit, so this increment can not overflow)
            ++availSyncConsecutiveTimeouts;
        }

        return avail;
    }

    /**
     * Ensure the return value of {@link AvailabilityFacet#getAvailability()} satisfies the method contract. Note
     * that {@link #getAvailability()} is also allowed to return {@link AvailabilityType#UNKNOWN}.
     * <p/>
     * As a side effect, the consecutive sync timeout counter is reset whenever the availability changes to UP.
     *
     * @param type the availability reported by the resource component, possibly null or otherwise invalid
     * @return the given type if it is UP, DOWN or MISSING; otherwise DOWN
     */
    private AvailabilityType processAvail(AvailabilityType type) {
        // A null must be checked explicitly: switching on a null enum reference throws NullPointerException,
        // which would leave a completed future in place and fail every subsequent availability request.
        boolean valid = false;
        if (type != null) {
            switch (type) {
            case UP:
            case DOWN:
            case MISSING:
                valid = true;
                break;
            default:
                break;
            }
        }

        AvailabilityType result = type;
        if (!valid) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("ResourceComponent [" + this.resourceContainer + "] getAvailability() returned " + type
                    + ". This is invalid and is being replaced with DOWN.");
            }
            result = DOWN;
        }

        // whenever changing to UP we reset the timeout counter. This is because DOWN resources often respond
        // slowly to getAvailability() calls (for example, waiting for a connection attempt to time out). When a
        // resource comes up we should give it a chance to respond quickly and provide live avail.
        AvailabilityType lastAvail = getLastAvailabilityType();
        if (result != lastAvail && result == UP) {
            if (availSyncConsecutiveTimeouts >= getSyncTimeoutLimit()) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Enabling synchronous availability collection for [" + resourceContainer
                        + "]; Availability has just changed from [" + lastAvail + "] to UP.");
                }
            }
            availSyncConsecutiveTimeouts = 0;
        }

        return result;
    }

    /**
     * @return the availability type last recorded on the resource container, or
     *         {@link AvailabilityType#UNKNOWN} if none has been recorded
     */
    private AvailabilityType getLastAvailabilityType() {
        Availability av = this.resourceContainer.getAvailability();
        if (av != null) {
            AvailabilityType avt = av.getAvailabilityType();
            return (avt != null) ? avt : AvailabilityType.UNKNOWN;
        } else {
            return AvailabilityType.UNKNOWN;
        }
    }

    /**
     * Override point. Typically for testing.
     * @return the async timeout in ms; subclasses may return something other than the system property setting.
     */
    protected long getAsyncTimeout() {
        return AVAIL_ASYNC_TIMEOUT;
    }

    /**
     * Override point. Typically for testing.
     * @return the sync timeout in ms; subclasses may return something other than the system property setting.
     */
    protected long getSyncTimeout() {
        return AVAIL_SYNC_TIMEOUT;
    }

    /**
     * Override point. Typically for testing.
     * @return the number of consecutive sync timeouts tolerated before synchronous checks are disabled;
     *         subclasses may return something other than the system property setting.
     */
    protected byte getSyncTimeoutLimit() {
        return AVAIL_SYNC_TIMEOUT_LIMIT;
    }

    /**
     * @return true if the consecutive sync timeout counter has reached the limit, meaning availability
     *         is currently only collected asynchronously for this resource
     */
    protected boolean isSyncDisabled() {
        return availSyncConsecutiveTimeouts >= getSyncTimeoutLimit();
    }

    /**
     * Debug string.
     */
    @Override
    public String toString() {
        return "AvailabilityProxy [resource=" + resourceContainer + ", lastSubmitTime="
            + new java.util.Date(lastSubmitTime) + ", availabilityFuture=" + availabilityFuture + ", current="
            + current + ", timeouts=" + availSyncConsecutiveTimeouts + "]";
    }
}