/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
/**
*
*/
package org.apache.geode;
import org.apache.geode.distributed.internal.DistributionConfig;
import org.apache.geode.internal.SystemFailureTestHook;
import org.apache.geode.internal.admin.remote.RemoteGfManagerAgent;
import org.apache.geode.internal.cache.GemFireCacheImpl;
import org.apache.geode.internal.i18n.LocalizedStrings;
/**
* Catches and responds to JVM failure
* <p>
* This class represents a catastrophic failure of the system, especially the Java virtual machine.
* Any class may, at any time, indicate that a system failure has occurred by calling
* {@link #initiateFailure(Error)} (or, less commonly, {@link #setFailure(Error)}).
* <p>
* In practice, the most common type of failure that is likely to be reported by an otherwise
* healthy JVM is {@link OutOfMemoryError}. However, GemFire will report any occurrence of
* {@link VirtualMachineError} as a JVM failure.
* <p>
* When a failure is reported, you must assume that the JVM has <em>broken its fundamental execution
* contract</em> with your application. No programming invariant can be assumed to be true, and your
* entire application must be regarded as corrupted.
* <h1>Failure Hooks</h1> GemFire uses this class to disable its distributed system (group
* communication) and any open caches. It also provides a hook for you to respond to after GemFire
* disables itself.
* <h1>Failure WatchDog</h1> When {@link #startThreads()} is called, a "watchdog" {@link Thread} is
* started that periodically checks to see if system corruption has been reported. When system
* corruption is detected, this thread proceeds to:
* <p>
* <ol>
* <li><em>Close GemFire</em> -- Group communication is ceased (this cache member recuses itself
* from the distributed system) and the cache is further poisoned (it is pointless to try to cleanly
* close it at this point.).
* <p>
* After this has successfully ended, we launch a</li>
* <li><em>failure action</em>, a user-defined Runnable {@link #setFailureAction(Runnable)}. By
* default, this Runnable only prints a corruption notice and the failure's stack trace to
* <code>System.err</code>. If you feel you need to perform an action before exiting
* the JVM, this hook gives you a means of attempting some action. Whatever you attempt should be
* extremely simple, since your Java execution environment has been corrupted.
* <p>
* GemStone recommends that you employ
* <a href="http://wrapper.tanukisoftware.org/doc/english/introduction.html"> Java Service
* Wrapper</a> to detect when your JVM exits and to perform appropriate failure and restart actions.
* </li>
* <li>Finally, if the application has granted the watchdog permission to exit the JVM (via
* {@link #setExitOK(boolean)}), the watchdog calls {@link System#exit(int)} with an argument of 1.
* If you have not granted this class permission to close the JVM, you are <em>strongly</em> advised
* to call it in your failure action (in the previous step).</li>
* </ol>
* <p>
* Each of these actions will be run exactly once in the above described order. However, if either
* step throws any type of error ({@link Throwable}), the watchdog will assume that the JVM is still
* under duress (esp. an {@link OutOfMemoryError}), will wait a bit, and then retry the failed
* action.
* <p>
* It bears repeating that you should be very cautious of any Runnables you ask this class to run.
* By definition the JVM is <em>very sick</em> when failure has been signalled.
* <p>
* <h1>Failure Proctor</h1> In addition to the failure watchdog, {@link #startThreads()} creates a
* second thread (the "proctor") that monitors free memory. It does this by examining
* {@link Runtime#freeMemory() free memory}, {@link Runtime#totalMemory() total memory} and
* {@link Runtime#maxMemory() maximum memory}. If the amount of available memory stays below a given
* {@link #setFailureMemoryThreshold(long) threshold} for more than {@link #MEMORY_MAX_WAIT}
* seconds, a failure is recorded and the watchdog is notified.
* <p>
* Note that the proctor can be effectively disabled by
* {@link SystemFailure#setFailureMemoryThreshold(long) setting} the failure memory threshold to a
* negative value.
* <p>
* The proctor is a second line of defense, attempting to detect OutOfMemoryError conditions in
* circumstances where nothing alerted the watchdog. For instance, a third-party jar might
* incorrectly handle this error and leave your virtual machine in a "stuck" state.
* <p>
* Note that the proctor does not relieve you of the obligation to follow the best practices in the
* next section.
* <h1>Best Practices</h1>
* <h2>Catch and Handle VirtualMachineError</h2> If you feel obliged to catch <em>either</em>
* {@link Error}, or {@link Throwable}, you <em>must</em> also check for {@link VirtualMachineError}
* like so:
* <p>
*
* <pre>
catch (VirtualMachineError err) {
SystemFailure.{@link #initiateFailure(Error) initiateFailure}(err);
// If this ever returns, rethrow the error. We're poisoned
// now, so don't let this thread continue.
throw err;
}
* </pre>
*
* <h2>Periodically Check For Errors</h2> Check for serious system errors at appropriate points in
* your algorithms. You may elect to use the {@link #checkFailure()} utility function, but you are
* not required to (you could just see if {@link SystemFailure#getFailure()} returns a non-null
* result).
* <p>
* A job processing loop is a good candidate, for instance, in
* org.apache.org.jgroups.protocols.UDP#run(), which implements {@link Thread#run}:
* <p>
*
* <pre>
for (;;) {
SystemFailure.{@link #checkFailure() checkFailure}();
if (mcast_recv_sock == null || mcast_recv_sock.isClosed()) break;
if (Thread.currentThread().isInterrupted()) break;
...
* </pre>
*
* <h2>Create Logging ThreadGroups</h2> If you create any Thread, a best practice is to catch severe
* errors and signal failure appropriately. One trick to do this is to create a ThreadGroup that
* handles uncaught exceptions by overriding
* {@link ThreadGroup#uncaughtException(Thread, Throwable)} and to declare your thread as a member
* of that {@link ThreadGroup}. This also has a significant side-benefit in that most uncaught
* exceptions can be detected:
* <p>
*
* <pre>
ThreadGroup tg = new ThreadGroup("Worker Threads") {
  public void uncaughtException(Thread t, Throwable e) {
    // Do this *before* any object allocation in case of
    // OutOfMemoryError (for instance)
    if (e instanceof VirtualMachineError) {
      SystemFailure.{@link #setFailure(Error) setFailure}((VirtualMachineError)e); // don't throw
    }
    String s = "Uncaught exception in thread " + t;
    system.getLogWriter().severe(s, e);
  }
};
Thread t = new Thread(tg, myRunnable, "My Thread");
t.start();
* </pre>
* <p>
* <h2>Catches of Error and Throwable Should Check for Failure</h2> Keep in mind that peculiar or
* flat-out <em>impossible</em> exceptions may ensue after a VirtualMachineError has been thrown
* <em>anywhere</em> in your virtual machine. Whenever you catch {@link Error} or {@link Throwable},
* you should also make sure that you aren't dealing with a corrupted JVM:
* <p>
*
* <pre>
catch (Throwable t) {
// Whenever you catch Error or Throwable, you must also
// catch VirtualMachineError (see above). However, there is
// _still_ a possibility that you are dealing with a cascading
// error condition, so you also need to check to see if the JVM
// is still usable:
SystemFailure.{@link #checkFailure() checkFailure}();
...
}
* </pre>
*
* @since GemFire 5.1
*/
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DM_GC",
justification = "This class performs System.gc as last ditch effort during out-of-memory condition.")
public final class SystemFailure {
/**
* Maximum time, in milliseconds, that stopWatchDog and stopProctor wait (via
* {@link Thread#join(long)}) for their thread to finish after interrupting it. Not final so that
* tests can change it.
*/
static int SHUTDOWN_WAIT = 1000;
/**
* Preallocated error messages. LocalizedStrings may use memory (in the form of an iterator) so we
* must get the translated messages in advance, while allocation is still known to work.
**/
static final String JVM_CORRUPTION =
LocalizedStrings.SystemFailure_JVM_CORRUPTION_HAS_BEEN_DETECTED.toLocalizedString();
// Logged by the watchdog immediately before it calls System.exit(1).
static final String CALLING_SYSTEM_EXIT =
LocalizedStrings.SystemFailure_SINCE_THIS_IS_A_DEDICATED_CACHE_SERVER_AND_THE_JVM_HAS_BEEN_CORRUPTED_THIS_PROCESS_WILL_NOW_TERMINATE_PERMISSION_TO_CALL_SYSTEM_EXIT_INT_WAS_GIVEN_IN_THE_FOLLOWING_CONTEXT
.toLocalizedString();
// Public so that other parts of the system can reuse the preallocated text; not referenced
// elsewhere in the visible part of this class.
public static final String DISTRIBUTION_HALTED_MESSAGE =
LocalizedStrings.SystemFailure_DISTRIBUTION_HALTED_DUE_TO_JVM_CORRUPTION.toLocalizedString();
public static final String DISTRIBUTED_SYSTEM_DISCONNECTED_MESSAGE =
LocalizedStrings.SystemFailure_DISTRIBUTED_SYSTEM_DISCONNECTED_DUE_TO_JVM_CORRUPTION
.toLocalizedString();
/**
* The underlying failure, or null if none has been reported.
* <p>
* This is usually an instance of {@link VirtualMachineError}, but it is not required to be such.
* It is volatile and written without a lock, so a later report may overwrite an earlier one.
*
* @see #getFailure()
* @see #initiateFailure(Error)
*/
protected static volatile Error failure = null;
/**
* User-defined runnable that the watchdog runs after GemFire has been closed.
* <p>
* The default prints {@link #JVM_CORRUPTION} and the failure's stack trace to System.err.
*
* @see #setFailureAction(Runnable)
*/
private static volatile Runnable failureAction = new Runnable() {
public void run() {
System.err.println(JVM_CORRUPTION);
// The watchdog only runs this action once a failure has been recorded.
failure.printStackTrace();
}
};
/**
* Whether the watchdog may call {@link System#exit(int)} once failure processing is done.
*
* @see #setExitOK(boolean)
*/
private static volatile boolean exitOK = false;
/**
* If we're going to exit the JVM, I want to be accountable for who told us it was OK. Holds a
* Throwable created at the moment permission was granted (its stack trace identifies the caller),
* or null when permission is not granted.
*/
private static volatile Throwable exitExcuse;
/**
 * Grants or revokes permission for the watchdog to call {@link System#exit(int)} once failure
 * processing has finished.
 * <p>
 * When permission is granted, a Throwable is captured so that the granting call site can be
 * logged if the JVM is later terminated; when it is revoked, that record is cleared. The setting
 * may be changed at any time while the system is running.
 *
 * @param newVal true if it is OK to exit the process
 * @return the value that was in effect before this call
 */
public static boolean setExitOK(boolean newVal) {
  final boolean previous = exitOK;
  exitOK = newVal;
  // Remember who allowed the exit, or forget it when permission is withdrawn.
  exitExcuse = exitOK ? new Throwable("SystemFailure exitOK set") : null;
  return previous;
}
// merge42180: Added this method while merging 42180. It should already have been here through
// different merges, or will come later.
/**
 * Tells whether the given Error is fatal to the JVM, meaning the JVM should be shut down. Code
 * should call {@link #initiateFailure(Error)} or {@link #setFailure(Error)} if this returns true.
 * <p>
 * Only {@link OutOfMemoryError} and {@link UnknownError} are treated as fatal.
 *
 * @param err the error to classify; null yields false
 * @return true if the error is an OutOfMemoryError or an UnknownError
 */
public static boolean isJVMFailureError(Error err) {
  // Not every VirtualMachineError is fatal to the JVM; in particular
  // StackOverflowError is not, so the check is by specific type.
  if (err instanceof OutOfMemoryError) {
    return true;
  }
  return err instanceof UnknownError;
}
/**
* Disallow instance creation: every member of this class is static.
*/
private SystemFailure() {
}
/**
* Synchronizes access to state variables, used to notify the watchdog when to run. The watchdog
* waits on this monitor and is awakened with notifyAll.
*
* @see #notifyWatchDog()
* @see #startProctor()
* @see #startWatchDog()
*/
private static final Object failureSync = new Object();
/**
* True if we have closed GemFire. Set by the watchdog after {@link #emergencyClose()} returns
* without throwing.
*
* @see #emergencyClose()
*/
private static volatile boolean gemfireCloseCompleted = false;
/**
* True if we have completed the user-defined failure action. Once set, the watchdog is not
* restarted.
*
* @see #setFailureAction(Runnable)
*/
private static volatile boolean failureActionCompleted = false;
/**
* This is a logging ThreadGroup, created only once.
*/
private final static ThreadGroup tg;
static {
tg = new ThreadGroup("SystemFailure Watchdog Threads") {
// If the watchdog is correctly written, this will never get executed.
// However, there's no reason for us not to eat our own dog food
// (har, har) -- see the javadoc above.
@Override
public void uncaughtException(Thread t, Throwable e) {
// Uhhh...if the watchdog is running, we *know* there's some
// sort of serious error, no need to check for it here.
System.err.println("Internal error in SystemFailure watchdog:" + e);
e.printStackTrace();
}
};
}
/**
* This is the amount of time, in seconds, the watchdog periodically awakens to see if the system
* has been corrupted.
* <p>
* The watchdog will be explicitly awakened by calls to {@link #setFailure(Error)} or
* {@link #initiateFailure(Error)}, but it will awaken of its own accord periodically to check for
* failure even if the above calls do not occur.
* <p>
* This can be set with the system property <code>gemfire.WATCHDOG_WAIT</code>. The default is 15
* sec.
*/
static public final int WATCHDOG_WAIT =
Integer.getInteger(DistributionConfig.GEMFIRE_PREFIX + "WATCHDOG_WAIT", 15).intValue();
/**
* This is the watchdog thread, or null if it has never been started.
*
* @guarded.By {@link #failureSync}
*/
private static Thread watchDog;
/**
* Set while the cache is closing or closed; both the watchdog and proctor loops exit when they
* observe it.
*
* @see #signalCacheCreate()
* @see #signalCacheClose()
*/
private static volatile boolean isCacheClosing = false;
/**
* Should be invoked when GemFire cache is being created. Clears the "cache closing" flag so that
* the watchdog and proctor loops no longer exit because of it. This method does not itself start
* any thread.
*/
public static void signalCacheCreate() {
isCacheClosing = false;
}
/**
 * Should be invoked when GemFire cache is closing or closed.
 * <p>
 * Raises the "cache closing" flag and then interrupts the proctor and the watchdog, if they have
 * been created, so that they wake up and notice the flag.
 */
public static void signalCacheClose() {
  isCacheClosing = true;
  final Thread proctorThread = proctor;
  if (proctorThread != null) {
    proctorThread.interrupt();
  }
  final Thread watchDogThread = watchDog;
  if (watchDogThread != null) {
    watchDogThread.interrupt();
  }
}
/**
 * Starts the watchdog thread unless it is already alive or failure processing has already
 * finished.
 * <p>
 * The thread is a daemon in the {@link #tg} group and executes {@link #runWatchDog()}.
 */
private static void startWatchDog() {
  if (failureActionCompleted) {
    // All failure handling has already run; never bring the dog back.
    return;
  }
  synchronized (failureSync) {
    final boolean alreadyRunning = watchDog != null && watchDog.isAlive();
    if (!alreadyRunning) {
      final Runnable body = new Runnable() {
        public void run() {
          runWatchDog();
        }
      };
      final Thread dog = new Thread(tg, body, "SystemFailure WatchDog");
      watchDog = dog;
      dog.setDaemon(true);
      dog.start();
    }
  }
}
/**
 * Asks the watchdog thread to stop and waits briefly for it to finish.
 * <p>
 * The stop flag is always raised. If the watchdog is alive it is first woken with notifyAll and
 * given 100 ms to exit; if it is still alive it is interrupted and given a further
 * {@link #SHUTDOWN_WAIT} ms. Interruption of the calling thread while waiting is ignored.
 */
private static void stopWatchDog() {
  final Thread dog;
  synchronized (failureSync) {
    stopping = true;
    if (watchDog == null || !watchDog.isAlive()) {
      // Nothing is running, so there is nothing to wait for.
      return;
    }
    failureSync.notifyAll();
    dog = watchDog;
  }
  // Polite attempt: the wake-up above should let the loop see the stop flag.
  try {
    dog.join(100);
  } catch (InterruptedException ignore) {
  }
  if (!dog.isAlive()) {
    return;
  }
  // Still running: interrupt it and wait a bounded time.
  dog.interrupt();
  try {
    dog.join(SHUTDOWN_WAIT);
  } catch (InterruptedException ignore) {
  }
}
/**
 * This is the run loop for the watchdog thread.
 * <p>
 * After preloading the classes needed for an emergency close, the loop waits on
 * {@link #failureSync} (for at most {@link #WATCHDOG_WAIT} seconds at a time) until a failure has
 * been recorded. It then, in order:
 * <ol>
 * <li>calls {@link #emergencyClose()},</li>
 * <li>runs the user's failure action, if one is set,</li>
 * <li>stops the proctor and, if permitted by {@link #setExitOK(boolean)}, calls
 * <code>System.exit(1)</code>.</li>
 * </ol>
 * If the first or second step throws, the loop goes back to waiting and retries that step later;
 * steps already completed are not repeated. The loop also ends when a stop has been requested,
 * when the cache is closing, or when it finds that the failure action has already completed.
 */
static protected void runWatchDog() {
  boolean warned = false;
  logFine(WATCHDOG_NAME, "Starting");
  try {
    basicLoadEmergencyClasses();
  } catch (ExceptionInInitializerError e) {
    // Uhhh...are we shutting down?
    boolean noSurprise = false;
    Throwable cause = e.getCause();
    if (cause instanceof IllegalStateException) {
      String msg = cause.getMessage();
      // getMessage() may be null; an unguarded indexOf would throw a
      // NullPointerException from inside this handler and kill the thread.
      if (msg != null && msg.indexOf("Shutdown in progress") >= 0) {
        noSurprise = true;
      }
    }
    if (!noSurprise) {
      logWarning(WATCHDOG_NAME, "Unable to load GemFire classes: ", e);
    }
    // In any event, we're toast
    return;
  } catch (CancelException e) {
    // ignore this because we are shutting down anyway
  } catch (Throwable t) {
    logWarning(WATCHDOG_NAME, "Unable to initialize watchdog", t);
    return;
  }
  for (;;) {
    if (stopping) {
      return;
    }
    try {
      if (isCacheClosing) {
        break;
      }
      // Sleep or get notified...
      synchronized (failureSync) {
        if (stopping) {
          return;
        }
        logFine(WATCHDOG_NAME, "Waiting for disaster");
        try {
          failureSync.wait(WATCHDOG_WAIT * 1000);
        } catch (InterruptedException e) {
          // Ignore; the flags checked below decide what happens next
        }
        if (stopping) {
          return;
        }
      }
      // Poke nose in the air, take a sniff...
      if (failureActionCompleted) {
        // early out, for testing: everything has already been done, so
        // actually leave instead of falling through to the steps below
        logInfo(WATCHDOG_NAME, "all actions completed; exiting");
        return;
      }
      if (failure == null) {
        // Tail wag. Go back to sleep.
        logFine(WATCHDOG_NAME, "no failure detected");
        continue;
      }
      // BOW WOW WOW WOW WOW! Corrupted system.
      if (!warned) {
        // logWarning reports whether the message got out; keep trying until it does
        warned = logWarning(WATCHDOG_NAME, "failure detected", failure);
      }
      // If any of the following fail, we will go back to sleep and
      // retry.
      if (!gemfireCloseCompleted) {
        logInfo(WATCHDOG_NAME, "closing GemFire");
        try {
          emergencyClose();
        } catch (Throwable t) {
          logWarning(WATCHDOG_NAME, "trouble closing GemFire", t);
          continue; // go back to sleep
        }
        gemfireCloseCompleted = true;
      }
      if (!failureActionCompleted) {
        // avoid potential race condition setting the runnable
        Runnable r = failureAction;
        if (r != null) {
          logInfo(WATCHDOG_NAME, "running user's runnable");
          try {
            r.run();
          } catch (Throwable t) {
            logWarning(WATCHDOG_NAME, "trouble running user's runnable", t);
            continue; // go back to sleep
          }
        }
        failureActionCompleted = true;
      }
      stopping = true;
      stopProctor();
      if (exitOK) {
        logWarning(WATCHDOG_NAME,
            // No "+" in this long message, we're out of memory!
            CALLING_SYSTEM_EXIT, exitExcuse);
        // ATTENTION: there are VERY FEW places in GemFire where it is
        // acceptable to call System.exit. This is one of those
        // places...
        System.exit(1);
      }
      // Our job here is done
      logInfo(WATCHDOG_NAME, "exiting");
      return;
    } catch (Throwable t) {
      // We *never* give up. NEVER EVER!
      logWarning(WATCHDOG_NAME, "thread encountered a problem: " + t, t);
    }
  } // for
}
/**
* The proctor thread: spies on system statistics looking for low memory threshold. Null if it has
* never been started.
*
* Well, if you're gonna have a watchdog, why not a watch CAT????
*
* @guarded.By {@link #failureSync}
* @see #minimumMemoryThreshold
*/
private static Thread proctor;
/**
* This mutex controls access to {@link #firstStarveTime} and {@link #minimumMemoryThreshold}.
* <p>
* I'm hoping that a fat lock is never created here, so that an object allocation isn't necessary
* to acquire this mutex. You'd have to have A LOT of contention on this mutex in order for a fat
* lock to be created, which indicates IMHO a serious problem in your applications.
*/
private static final Object memorySync = new Object();
/**
* This is the minimum amount of free memory, in bytes, that the proctor will tolerate before
* declaring a system failure. The initial value comes from a system property and defaults to
* 1048576.
*
* @see #setFailureMemoryThreshold(long)
* @guarded.By {@link #memorySync}
*/
static long minimumMemoryThreshold =
Long.getLong(DistributionConfig.GEMFIRE_PREFIX + "SystemFailure.chronic_memory_threshold",
1048576).longValue();
/**
* This is the interval, in seconds, that the proctor thread will awaken and poll system free
* memory.
*
* The default is 1 sec. This can be set using the system property
* <code>gemfire.SystemFailure.MEMORY_POLL_INTERVAL</code>.
*
* @see #setFailureMemoryThreshold(long)
*/
static final public long MEMORY_POLL_INTERVAL =
Long.getLong(DistributionConfig.GEMFIRE_PREFIX + "SystemFailure.MEMORY_POLL_INTERVAL", 1)
.longValue();
/**
* This is the maximum amount of time, in seconds, that the proctor thread will tolerate seeing
* free memory stay below {@link #setFailureMemoryThreshold(long)}, after which point it will
* declare a system failure.
*
* The default is 15 sec. This can be set using the system property
* <code>gemfire.SystemFailure.MEMORY_MAX_WAIT</code>.
*
* @see #setFailureMemoryThreshold(long)
*/
static final public long MEMORY_MAX_WAIT = Long
.getLong(DistributionConfig.GEMFIRE_PREFIX + "SystemFailure.MEMORY_MAX_WAIT", 15).longValue();
/**
* Flag that determines whether or not we monitor memory on our own. If this flag is set, the
* proctor checks free memory, requests a GC when free memory first drops below the threshold, and
* records its own OutOfMemoryError as the system failure if free memory stays below the threshold
* for {@link #MEMORY_MAX_WAIT} seconds.
*
* The default is false, so this monitoring is turned off. This monitoring has been found to be
* unreliable in non-Sun VMs when the VM is under stress or behaves in unpredictable ways.
*
* @since GemFire 6.5
*/
static final public boolean MONITOR_MEMORY =
Boolean.getBoolean(DistributionConfig.GEMFIRE_PREFIX + "SystemFailure.MONITOR_MEMORY");
/**
 * Starts the proctor thread unless it is already alive.
 * <p>
 * If a failure has already been recorded there is nothing left to monitor, so the watchdog is
 * notified instead and no thread is created. Otherwise a daemon thread in the {@link #tg} group
 * is started to execute {@link #runProctor()}.
 *
 * @see #proctor
 */
private static void startProctor() {
  if (failure != null) {
    // Monitoring is pointless now; just make sure the dog is awake.
    notifyWatchDog();
    return;
  }
  synchronized (failureSync) {
    final boolean alreadyRunning = proctor != null && proctor.isAlive();
    if (!alreadyRunning) {
      final Runnable body = new Runnable() {
        public void run() {
          runProctor();
        }
      };
      final Thread cat = new Thread(tg, body, "SystemFailure Proctor");
      proctor = cat;
      cat.setDaemon(true);
      cat.start();
    }
  }
}
/**
 * Asks the proctor thread to stop and waits up to {@link #SHUTDOWN_WAIT} ms for it to finish.
 * <p>
 * The stop flag is always raised. If the proctor is alive it is interrupted so that it wakes from
 * its sleep. Interruption of the calling thread while waiting is ignored.
 */
private static void stopProctor() {
  final Thread cat;
  synchronized (failureSync) {
    stopping = true;
    cat = proctor;
  }
  if (cat == null || !cat.isAlive()) {
    return;
  }
  cat.interrupt();
  try {
    cat.join(SHUTDOWN_WAIT);
  } catch (InterruptedException ignore) {
  }
}
/**
* Symbolic representation of an invalid starve time, meaning "memory is not currently low".
*/
static private final long NEVER_STARVED = Long.MAX_VALUE;
/**
* The time (from {@link System#currentTimeMillis()}) at which the proctor first saw free memory
* below the threshold in the current low-memory episode, or {@link #NEVER_STARVED}.
*
* @guarded.By {@link #memorySync}
*/
static private long firstStarveTime = NEVER_STARVED;
/**
* This is the previous measure of total memory. If it changes, we reset the proctor's starve
* statistic.
*/
static private long lastTotalMemory = 0;
/**
 * This is the run loop for the proctor thread (formerly known as the "watchcat").
 * <p>
 * Every {@link #MEMORY_POLL_INTERVAL} seconds the loop wakes up. It exits if a stop was
 * requested, the cache is closing, the failure action has already completed, or a failure has
 * already been recorded (in which case the watchdog is notified first). Unless
 * {@link #MONITOR_MEMORY} is set, nothing else happens.
 * <p>
 * When memory monitoring is on and the heap has stopped growing, free memory is compared with the
 * current threshold. The first time it is found too low, the time is noted and a GC is requested.
 * If it is still too low {@link #MEMORY_MAX_WAIT} seconds later, a preallocated
 * {@link OutOfMemoryError} is recorded via {@link #setFailure(Error)} and the loop ends.
 */
static protected void runProctor() {
  // Note that the javadocs say this can return Long.MAX_VALUE.
  // If it does, the proctor will never do its job...
  final long maxMemory = Runtime.getRuntime().maxMemory();
  // The threshold is publicly modifiable, so read it under its mutex.
  final long startThreshold;
  synchronized (memorySync) {
    startThreshold = minimumMemoryThreshold;
  }
  // Allocate this error in advance, since it's too late once
  // it's been detected! The duration reported is MEMORY_MAX_WAIT because
  // that is the interval the loop below actually enforces. The threshold
  // in the message is the one in effect when the proctor started.
  final OutOfMemoryError oome = new OutOfMemoryError(
      LocalizedStrings.SystemFailure_0_MEMORY_HAS_REMAINED_CHRONICALLY_BELOW_1_BYTES_OUT_OF_A_MAXIMUM_OF_2_FOR_3_SEC
          .toLocalizedString(new Object[] {PROCTOR_NAME, Long.valueOf(startThreshold),
              Long.valueOf(maxMemory), Long.valueOf(MEMORY_MAX_WAIT)}));
  // Catenation, but should be OK when starting up
  logFine(PROCTOR_NAME, "Starting, threshold = " + startThreshold + "; max = " + maxMemory);
  for (;;) {
    if (isCacheClosing) {
      break;
    }
    if (stopping) {
      return;
    }
    try {
      // *** catnap...
      try {
        Thread.sleep(MEMORY_POLL_INTERVAL * 1000);
      } catch (InterruptedException e) {
        // ignore; the flags checked below decide what happens next
      }
      if (stopping) {
        return;
      }
      // *** Twitch ear, take a bath...
      if (failureActionCompleted) {
        // it's all over, we're late
        return;
      }
      if (failure != null) {
        notifyWatchDog(); // wake the dog, just in case
        logFine(PROCTOR_NAME, "Failure has been reported, exiting");
        return;
      }
      if (!MONITOR_MEMORY) {
        continue;
      }
      // *** Sit up, stretch...
      long totalMemory = Runtime.getRuntime().totalMemory();
      if (totalMemory < maxMemory) {
        // We haven't finished growing the heap, so no worries...yet
        if (DEBUG) {
          // This message has catenation, we don't want this in
          // production code :-)
          logFine(PROCTOR_NAME,
              "totalMemory (" + totalMemory + ") < maxMemory (" + maxMemory + ")");
        }
        synchronized (memorySync) {
          firstStarveTime = NEVER_STARVED;
        }
        continue;
      }
      if (lastTotalMemory < totalMemory) {
        // Don't get too impatient if the heap just now grew
        lastTotalMemory = totalMemory; // now we're maxed
        synchronized (memorySync) {
          firstStarveTime = NEVER_STARVED; // reset the clock
        }
        continue;
      }
      lastTotalMemory = totalMemory; // make a note of this
      // *** Hey, is that the food bowl?
      // At this point, freeMemory really indicates how much
      // trouble we're in.
      long freeMemory = Runtime.getRuntime().freeMemory();
      if (freeMemory == 0) {
        /*
         * This is to workaround X bug #41821 in JRockit. Often, Jrockit returns 0 from
         * Runtime.getRuntime().freeMemory() Allocating this one object and calling again seems to
         * workaround the problem.
         */
        new Object();
        freeMemory = Runtime.getRuntime().freeMemory();
      }
      // Grab the threshold and starve time once, under mutex, because
      // it's publicly modifiable.
      long curThreshold;
      long lastStarveTime;
      synchronized (memorySync) {
        curThreshold = minimumMemoryThreshold;
        lastStarveTime = firstStarveTime;
      }
      if (freeMemory >= curThreshold /* enough memory */
          || curThreshold == 0 /* disabled */) {
        // Memory is FINE, reset everything
        if (DEBUG) {
          // This message has catenation, we don't want this in
          // production code :-)
          logFine(PROCTOR_NAME, "Current free memory is: " + freeMemory);
        }
        if (lastStarveTime != NEVER_STARVED) {
          logFine(PROCTOR_NAME, "...low memory has self-corrected.");
        }
        synchronized (memorySync) {
          firstStarveTime = NEVER_STARVED;
        }
        continue;
      }
      // Memory is low
      // *** Leap to feet, nose down, tail switching...
      long now = System.currentTimeMillis();
      if (lastStarveTime == NEVER_STARVED) {
        // first sighting
        if (DEBUG) {
          // Catenation in this message, don't put in production
          logFine(PROCTOR_NAME,
              "Noting current memory " + freeMemory + " is less than threshold " + curThreshold);
        } else {
          logWarning(PROCTOR_NAME,
              "Noting that current memory available is less than the currently designated threshold",
              null);
        }
        synchronized (memorySync) {
          firstStarveTime = now;
        }
        System.gc(); // at least TRY...
        continue;
      }
      // *** squirm, wait for the right moment...wait...wait...
      if (now - lastStarveTime < MEMORY_MAX_WAIT * 1000) {
        // Very recent; problem may correct itself.
        if (DEBUG) {
          // catenation
          logFine(PROCTOR_NAME, "...memory is still below threshold: " + freeMemory);
        } else {
          logWarning(PROCTOR_NAME,
              "Noting that current memory available is still below currently designated threshold",
              null);
        }
        continue;
      }
      // *** Meow! Meow! MEOWWWW!!!!!
      // Like any smart cat, let the Dog do all the work.
      logWarning(PROCTOR_NAME, "Memory is chronically low; setting failure!", null);
      SystemFailure.setFailure(oome);
      notifyWatchDog();
      return; // we're done!
    } catch (Throwable t) {
      logWarning(PROCTOR_NAME, "thread encountered a problem", t);
      // We *never* give up. NEVER EVER!
    }
  } // for
}
/**
* Enables some fine logging in the proctor. The guarded messages use string concatenation, which
* is why they are kept out of production runs.
*/
static private final boolean DEBUG = false;
/**
* If true, we track the progress of emergencyClose on System.err
*/
static public final boolean TRACE_CLOSE = false;
// Names used as the prefix of log messages written by the two threads.
static protected final String WATCHDOG_NAME = "SystemFailure Watchdog";
static protected final String PROCTOR_NAME = "SystemFailure Proctor";
/**
* Set once the emergency classes have been requested, to break any potential circularity in
* {@link #loadEmergencyClasses()}.
*/
private static volatile boolean emergencyClassesLoaded = false;
/**
* Since it requires object memory to unpack a jar file, make sure this JVM has loaded the classes
* necessary for closure <em>before</em> it becomes necessary to use them.
* <p>
* Note that just touching the class in order to load it is usually sufficient, so all an
* implementation needs to do is to reference the same classes used in {@link #emergencyClose()}.
* Just make sure to do it while you still have memory to succeed!
* <p>
* In this class the method only calls startThreads(); the actual preloading is done by the
* watchdog thread when it starts running.
*/
public static void loadEmergencyClasses() {
// This method was called to basically load this class
// and invoke its static initializers. Now that we don't
// use statics to start the threads all we need to do is
// call startThreads. The watchdog thread will call basicLoadEmergencyClasses.
startThreads();
}
/**
 * Asks each subsystem used by {@link #emergencyClose()} to preload its emergency classes.
 * <p>
 * Does nothing if the load has already been requested. The flag is raised before the subsystems
 * are called, so a re-entrant call returns immediately.
 */
private static void basicLoadEmergencyClasses() {
  if (!emergencyClassesLoaded) {
    emergencyClassesLoaded = true;
    SystemFailureTestHook.loadEmergencyClasses(); // bug 50516
    GemFireCacheImpl.loadEmergencyClasses();
    RemoteGfManagerAgent.loadEmergencyClasses();
  }
}
/**
* Attempt to close any and all GemFire resources.
*
* The contract of this method is that it should not acquire any synchronization mutexes nor
* create any objects.
* <p>
* The former is because the system is in an undefined state and attempting to acquire the mutex
* may cause a hang.
* <p>
* The latter is because the likelihood is that we are invoking this method due to memory
* exhaustion, so any attempt to create an object will also cause a hang.
* <p>
* This method is not meant to be called directly (but, well, I guess it could). It is public to
* document the contract that is implemented by <code>emergencyClose</code> in other parts of the
* system.
* <p>
* The cache is closed first, then the admin agents, and finally a garbage collection is
* requested.
*/
public static void emergencyClose() {
// Make the cache (more) useless and inaccessible...
if (TRACE_CLOSE) {
System.err.println("SystemFailure: closing GemFireCache");
}
GemFireCacheImpl.emergencyClose();
// Arcane strange DS's exist in this class:
if (TRACE_CLOSE) {
System.err.println("SystemFailure: closing admins");
}
RemoteGfManagerAgent.emergencyClose();
// If memory was the problem, make an explicit attempt at
// this point to clean up.
System.gc(); // NOTE(review): unclear whether this can itself fail when out of memory
if (TRACE_CLOSE) {
System.err.println("SystemFailure: end of emergencyClose");
}
}
/**
 * Throws the recorded system failure.
 * <p>
 * This method does not return normally once a failure has been recorded; if none has been
 * recorded it simply returns.
 * <p>
 * Unfortunately, attempting to create a new Throwable at this point may cause the thread to hang
 * (instead of generating another OutOfMemoryError), so we have to make do with whatever Error we
 * have, instead of wrapping it with one pertinent to the current context. See bug 38394.
 *
 * @throws Error the recorded failure itself, when there is one
 */
static private void throwFailure() throws InternalGemFireError, Error {
  final Error recorded = failure;
  if (recorded != null) {
    throw recorded;
  }
}
/**
* Notifies the watchdog thread (assumes that {@link #failure} has been set).
* <p>
* The watchdog is started first if it is not already running, and then every thread waiting on
* {@link #failureSync} is awakened.
*/
private static void notifyWatchDog() {
startWatchDog(); // just in case
synchronized (failureSync) {
failureSync.notifyAll();
}
}
/**
 * Utility function to check for failures. Returns silently when no failure has been recorded;
 * otherwise notifies the watchdog and throws the recorded failure.
 *
 * @see #initiateFailure(Error)
 * @throws InternalGemFireError declared for compatibility
 * @throws Error the recorded failure, if the system has been corrupted
 */
public static void checkFailure() throws InternalGemFireError, Error {
  if (failure != null) {
    notifyWatchDog();
    throwFailure();
  }
}
/**
* Signals that a system failure has occurred and then throws the recorded failure.
* <p>
* The failure is recorded through {@link #setFailure(Error)}, so the Error that is thrown is
* whatever is recorded at that moment. If {@link #setFailure(Error)} declines to record the
* error and no failure is on record, this method returns normally.
*
* @param f the failure to set
* @throws IllegalArgumentException if f is null
* @throws InternalGemFireError declared for compatibility
* @throws Error the recorded failure
*/
public static void initiateFailure(Error f) throws InternalGemFireError, Error {
SystemFailure.setFailure(f);
throwFailure();
}
/**
 * Records the underlying system failure and wakes the watchdog.
 * <p>
 * This method does not itself throw the error, and should only be used in circumstances where
 * execution needs to continue, such as when re-implementing
 * {@link ThreadGroup#uncaughtException(Thread, Throwable)}.
 * <p>
 * Errors that the test hook reports as expected are ignored. A failure recorded earlier may be
 * overwritten by this call.
 *
 * @param failure the system failure
 * @throws IllegalArgumentException if you attempt to set the failure to null
 */
public static void setFailure(Error failure) {
  if (failure == null) {
    throw new IllegalArgumentException(
        LocalizedStrings.SystemFailure_YOU_ARE_NOT_PERMITTED_TO_UNSET_A_SYSTEM_FAILURE
            .toLocalizedString());
  }
  if (!SystemFailureTestHook.errorIsExpected(failure)) {
    // Plain volatile write, no lock: this keeps the path free of object
    // creation (OutOfMemoryError) and extra stack frames
    // (StackOverflowError). There is a slight chance that the
    // very first error may get overwritten, but this avoids the
    // potential of object creation via a fat lock
    SystemFailure.failure = failure;
    notifyWatchDog();
  }
}
/**
* Returns the catastrophic system failure, if any.
* <p>
* This is usually (though not necessarily) an instance of {@link VirtualMachineError}.
* <p>
* A return value of null indicates that no system failure has yet been detected.
* <p>
* Object synchronization can implicitly require object creation (fat locks in JRockit for
* instance), so the underlying value is not synchronized (it is a volatile). This means the
* return value from this call is not necessarily the <em>first</em> failure reported by the JVM.
* <p>
* Note that even if it <em>were</em> synchronized, it would only be a proximal indicator near the
* time that the JVM crashed, and may not actually reflect the underlying root cause that
* generated the failure. For instance, if your JVM is running short of memory, this Throwable is
* probably an innocent victim and <em>not</em> the actual allocation (or series of allocations)
* that caused your JVM to exhaust memory.
* <p>
* If this function returns a non-null value, keep in mind that the JVM is very limited. In
* particular, any attempt to allocate objects may fail if the original failure was an
* OutOfMemoryError.
*
* @return the failure, or null if none has been recorded
*/
public static Error getFailure() {
return failure;
}
/**
 * Replaces the user-defined action that the watchdog runs once a failure has been detected.
 * <p>
 * The action runs <em>after</em> the GemFire cache has been shut down. If it throws, the watchdog
 * goes back to sleep and tries it again later, indefinitely, until it completes. The action may
 * be swapped at any time while the system is running; passing null means no action is run.
 * <p>
 * The default action prints the failure stack trace to System.err.
 *
 * @see #initiateFailure(Error)
 * @param action the Runnable to use
 * @return the action that was installed before this call
 */
public static Runnable setFailureAction(Runnable action) {
  final Runnable previous = failureAction;
  failureAction = action;
  return previous;
}
/**
 * Changes the memory threshold below which a system failure is reported.
 * <p>
 * The threshold may be changed while the system is running. Its default is 1048576 bytes, and
 * it can also be supplied through the system property
 * <code>gemfire.SystemFailure.chronic_memory_threshold</code>.
 *
 * @param newVal the new threshold, in bytes
 * @return the threshold that was in effect before this call
 * @see Runtime#freeMemory()
 */
public static long setFailureMemoryThreshold(long newVal) {
  final long previous;
  synchronized (memorySync) {
    previous = minimumMemoryThreshold;
    minimumMemoryThreshold = newVal;
    // Forget any starvation recorded so far: it was measured against the old threshold.
    firstStarveTime = NEVER_STARVED;
  }
  // Just in case the proctor is not running yet.
  startProctor();
  return previous;
}
// /**
// * For use by GemStone Quality Assurance Only
// *
// * @deprecated TODO remove this
// */
// public static void reset() {
// System.gc();
// logWarning("DJP", "do not commit SystemFailure#reset", null);
// failure = null;
// failureAction = new Runnable() {
// public void run() {
// System.err.println("(SystemFailure) JVM corruption has been detected!");
// failure.printStackTrace();
// }
// };
// gemfireCloseCompleted = false;
// failureActionCompleted = false;
// synchronized (failureSync) {
// if (watchDog != null) {
// watchDog.interrupt();
// }
// watchDog = null;
// if (watchCat != null) {
// watchCat.interrupt();
// }
// watchCat = null;
// }
//
// startWatchDog();
// startWatchCat();
// }
/**
 * Writes a single log line of the form <code>name: [kind] s</code> to System.err, followed by
 * the stack trace of {@code t} when one is supplied.
 * <p>
 * The message is emitted piecewise with separate print calls instead of being concatenated
 * first; per the original author's note, this code path is believed not to allocate any
 * objects (not verified).
 *
 * @param kind the severity label printed between the square brackets
 * @param name the prefix printed before the severity label
 * @param s the message text
 * @param t a throwable whose stack trace should follow the message, or {@code null} for none
 * @return true if everything was printed, false if any Throwable was raised while printing
 */
private static boolean logStdErr(String kind, String name, String s, Throwable t) {
  boolean printed;
  try {
    System.err.print(name);
    System.err.print(": [");
    System.err.print(kind);
    System.err.print("] ");
    System.err.println(s);
    if (t != null) {
      t.printStackTrace();
    }
    printed = true;
  } catch (Throwable ignored) {
    // Out of luck: nothing more can be done if even stderr output fails.
    printed = false;
  }
  return printed;
}
/**
 * Emits a warning-level message.
 * <p>
 * Logging can require allocation of objects, so the message is handed straight to
 * {@code logStdErr}, which writes to System.err and reports a failure to print through its
 * return value instead of throwing.
 *
 * @param name prefix printed in front of the message
 * @param s string to print
 * @param t the call stack, if any
 * @return true if the warning got printed
 */
protected static boolean logWarning(String name, String s, Throwable t) {
  final boolean printed = logStdErr("warning", name, s, t);
  return printed;
}
/**
 * Emits an info-level message.
 * <p>
 * Logging can require allocation of objects, so the message is handed straight to
 * {@code logStdErr}, which writes to System.err; whether the message was actually printed is
 * not reported to the caller.
 *
 * @param name prefix printed in front of the message
 * @param s string to print
 */
protected static void logInfo(String name, String s) {
  // The boolean outcome of logStdErr is intentionally discarded.
  logStdErr("info", name, s, null);
}
/**
 * Emits a fine-level message, but only when {@code DEBUG} is enabled; otherwise does nothing.
 * <p>
 * Logging can require allocation of objects, so the message is handed straight to
 * {@code logStdErr}, which writes to System.err; whether the message was actually printed is
 * not reported to the caller.
 *
 * @param name prefix printed in front of the message
 * @param s string to print
 */
protected static void logFine(String name, String s) {
  if (!DEBUG) {
    return;
  }
  logStdErr("fine", name, s, null);
}
/**
 * Set to {@code true} by {@link #stopThreads()} and reset to {@code false} by
 * {@link #startThreads()}. Declared volatile so that an update made by one thread is visible to
 * others without locking. NOTE(review): presumably polled by the watchdog and proctor threads
 * to decide when to exit -- confirm against their run loops.
 */
private static volatile boolean stopping;
/**
 * Launches the watchdog and proctor threads. Invoked when a Cache is created.
 * <p>
 * The {@code stopping} flag is cleared before either thread is started.
 */
public static void startThreads() {
  SystemFailure.stopping = false;
  startWatchDog();
  startProctor();
}
/**
 * Shuts down the threads that implement this service. Invoked when a Cache is closed.
 * <p>
 * The {@code stopping} flag is raised first; the proctor is then stopped before the watchdog.
 * This method was introduced as the fix for bug 45409.
 */
public static void stopThreads() {
  SystemFailure.stopping = true;
  stopProctor();
  stopWatchDog();
}
/**
 * Test-only accessor for the current value of the {@code watchDog} field.
 *
 * @return the value of {@code watchDog} at the time of the call
 */
static Thread getWatchDogForTest() {
  return SystemFailure.watchDog;
}
/**
 * Test-only accessor for the current value of the {@code proctor} field.
 *
 * @return the value of {@code proctor} at the time of the call
 */
static Thread getProctorForTest() {
  return SystemFailure.proctor;
}
}