/* * RHQ Management Platform * Copyright (C) 2005-2008 Red Hat, Inc. * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.rhq.enterprise.agent; import java.lang.management.ManagementFactory; import java.lang.management.MemoryMXBean; import java.lang.management.MemoryPoolMXBean; import java.lang.management.MemoryUsage; import java.util.ArrayList; import java.util.List; import mazz.i18n.Logger; import org.rhq.enterprise.agent.AgentRestartCounter.AgentRestartReason; import org.rhq.enterprise.agent.i18n.AgentI18NFactory; import org.rhq.enterprise.agent.i18n.AgentI18NResourceKeys; /** * This is a thread that will periodically check the health of the VM * (e.g. check the memory usage within the VM to detect if * memory is critically low), and if the health is poor it will put the * agent into hibernate mode, which will essentially shutdown the agent, * let it pause for some amount of time, then restart the agent. This * will hopefully clear up the poor VM condition. * * @author John Mazzitelli */ public class VMHealthCheckThread extends Thread { private static final Logger LOG = AgentI18NFactory.getLogger(VMHealthCheckThread.class); /** * The agent that will be hibernated if the VM is critically sick. */ private final AgentMain agent; /** * The amount of time in milliseconds that this thread will sleep in between checks */ private final long interval; /** * If the amount of used heap memory is larger than this percentage of max heap memory * then the VM will be considered critically low on heap. */ private final float heapThreshold; /** * If the amount of used non-heap memory is larger than this percentage of max non-heap memory, * then the VM will be considered critically low on heap. */ private final float nonheapThreshold; /** * If <code>true</code>, the thread will explicitly ask for garbage collection to occur when * memory is critical. If <code>false</code>, the thread will merely report when memory is critical, * but it will not attempt to correct the situation itself - it will assume the garabage collector * will trigger at the appropriate time. */ private final boolean performGC; /** * These are names used to identify MemoryPoolMXBeans that are to be monitored. */ private final List<String> memoryPoolsToMonitor; public VMHealthCheckThread(AgentMain agent) { super("RHQ VM Health Check Thread"); setDaemon(false); this.agent = agent; AgentConfiguration config = agent.getConfiguration(); if (config != null) { this.interval = config.getVMHealthCheckIntervalMsecs(); this.heapThreshold = config.getVMHealthCheckLowHeapMemThreshold(); this.nonheapThreshold = config.getVMHealthCheckLowNonHeapMemThreshold(); } else { // this should never happen, but I'm paranoid this.interval = 5000L; this.heapThreshold = 0.90f; this.nonheapThreshold = 0.90f; } // TODO: put these in agent configuration this.memoryPoolsToMonitor = new ArrayList<String>(); String memoryPoolNames = System.getProperty("rhq.agent.vm-health-check.mem-pools-to-check", "perm gen"); for (String memoryPoolName : memoryPoolNames.split(",")) { this.memoryPoolsToMonitor.add(memoryPoolName.toLowerCase()); // lowercase so our checks are case-insensitive } String gcProp = System.getProperty("rhq.agent.vm-health-check.perform-gc", "true"); this.performGC = Boolean.parseBoolean(gcProp); return; } /** * Tells this thread to stop checking. This will block and wait for the thread to die. */ public void stopChecking() { interrupt(); try { join(interval); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } /** * Returns true if we are out of memory. */ boolean isOutOfMemory() { MemoryMXBean memoryMxBean = ManagementFactory.getMemoryMXBean(); List<MemoryPoolMXBean> memoryPoolMxBeans = getMemoryPoolMXBeansToMonitor(); return checkMemory(memoryMxBean) || checkPoolMemories(memoryPoolMxBeans, memoryMxBean); } @Override public void run() { LOG.debug(AgentI18NResourceKeys.VM_HEALTH_CHECK_THREAD_STARTED, this.interval); try { // perform an initial sleep to prevent us from trying to stop while agent is still starting synchronized (this) { wait(this.interval); } while (!isInterrupted()) { try { if (isOutOfMemory()) { LOG.fatal(AgentI18NResourceKeys.VM_HEALTH_CHECK_SEES_MEM_PROBLEM); restartAgent(60000L); continue; } // TODO: if our memory is good, we might have to check and make sure we are // the only thread running. Under an odd and rare circumstance (if // restartAgent fails to completely start the agent but did manage to // start another VM check thread and failed to "re-shutdown" the agent) // there will end up being more than one of these threads running. // We'll need to make sure we kill all threads but one. // go to sleep before we check again synchronized (this) { wait(this.interval); } } catch (VirtualMachineError vme) { // We're too late - OOM probably happening now. // Try to do as little as possible here (no logging, no creating objects) // and immediately try to shutdown our agent and restart it. restartAgent(0L); } } } catch (InterruptedException e) { // exit } catch (Throwable t) { LOG.error(AgentI18NResourceKeys.VM_HEALTH_CHECK_THREAD_EXCEPTION, t); } LOG.debug(AgentI18NResourceKeys.VM_HEALTH_CHECK_THREAD_STOPPED); } /** * This will {@link AgentMain#shutdown()} the agent, pause for the given number of milliseconds, then * {@link AgentMain#start()} the agent again. * * @param pause number of milliseconds before restarting the agent after shutting down */ private void restartAgent(long pause) throws Exception { // immediately attempt to shutdown the agent which should free up alot of VM resources (memory/threads) try { this.agent.shutdown(); } catch (Throwable t) { // this is bad, we can't even shutdown the agent. // but this thread is our only hope to recover, so do not stop the thread now // let it continue and see if we can recover the next time Thread.interrupted(); // clear the interrupted status to ensure our thread doesn't abort Thread.sleep(30000L); // give our thread time to breath - do avoid fast infinite looping that might occur return; } // If we are told to wait before restarting, do so here. We want to wait because its possible // some external influence (downed server or downed managed resource) is causing our agent // to misbehave. In that case, we'll want to wait a bit to give time for that external resource // to correct itself and thus allow the agent to get back to normal itself. if (pause > 0) { Thread.sleep(pause); } // now that the agent is shutdown and we've paused a bit, let's try to restart it try { this.agent.start(); } catch (Throwable t) { // uh-oh, we can't start the agent for some reason; our thread is our last and only hope to recover // first try to shutdown again, in case start() got half way there but couldn't finish try { this.agent.shutdown(); // TODO: purging spool: agentConfig.getDataDirectory() + agentConfig.getClientSenderCommandSpoolFileName() } catch (Throwable ignore) { // at this point, we may (or may not) have two VM check threads running, what should we do? } // do not stop the thread - let it continue and see if we can recover the next time Thread.interrupted(); // clear the interrupted status to ensure our thread doesn't abort return; } // At this point, we have "rebooted" the agent - our memory usage should be back to normal. this.agent.getAgentRestartCounter().restartedAgent(AgentRestartReason.VM_HEALTH_CHECK); // This thread is done interrupt(); } /** * Checks the VM's memory subsystem and if it detects the VM is critically * low on memory, <code>true</code> will be returned. * * @param bean the platform MBean that contains the memory statistics * * @return <code>true</code> if the VM is critically low on memory */ private boolean checkMemory(MemoryMXBean bean) { boolean heapCritical = false; boolean nonheapCritical = false; try { heapCritical = isCriticallyLow(bean.getHeapMemoryUsage(), this.heapThreshold, "VM heap"); nonheapCritical = isCriticallyLow(bean.getNonHeapMemoryUsage(), this.nonheapThreshold, "VM nonheap"); if (heapCritical || nonheapCritical) { // uh-oh, we are low on memory, before we say we are truly critical, try to GC try { if (this.performGC) { LOG.warn(AgentI18NResourceKeys.VM_HEALTH_CHECK_THREAD_GC); bean.gc(); } // let see what our memory usage is now heapCritical = isCriticallyLow(bean.getHeapMemoryUsage(), this.heapThreshold, "VM heap"); nonheapCritical = isCriticallyLow(bean.getNonHeapMemoryUsage(), this.nonheapThreshold, "VM nonheap"); } catch (Throwable t) { // something bad is happening, let's return true and see if we can recover return true; } } } catch (Throwable t) { // this should never happen unless something odd occurred. // let's return true only if we have previously detected critically low memory } return heapCritical || nonheapCritical; } /** * Checks the given pools' memories and if it detects the pool is critically * low on memory, <code>true</code> will be returned. * * @param memoryPoolMxBeans the MBeans that contain the memory statistics * @param memoryMxBean the memory MX bean, used to perform GC if we need to * * @return <code>true</code> if one of the pools is critically low on memory */ private boolean checkPoolMemories(List<MemoryPoolMXBean> memoryPoolMxBeans, MemoryMXBean memoryMxBean) { boolean critical = false; boolean allValid = true; try { for (MemoryPoolMXBean bean : memoryPoolMxBeans) { if (bean.isValid()) { critical = isCriticallyLow(bean.getUsage(), this.heapThreshold, bean.getName()); if (critical) { // uh-oh, we are low on memory, before we say we are truly critical, try to GC try { if (this.performGC) { LOG.warn(AgentI18NResourceKeys.VM_HEALTH_CHECK_THREAD_GC); memoryMxBean.gc(); } // let see what our memory usage is now critical = isCriticallyLow(bean.getUsage(), this.heapThreshold, bean.getName()); } catch (Throwable t) { // something bad is happening, let's return true and see if we can recover return true; } } } else { allValid = false; } } } catch (Throwable t) { // this should never happen unless something odd occurred. // let's return true only if we have previously detected critically low memory } // we aren't critical, but for some reason, one of our MBeans aren't valid anymore, re-obtain them if (!critical && !allValid) { memoryPoolMxBeans.clear(); memoryPoolMxBeans.addAll(getMemoryPoolMXBeansToMonitor()); } return critical; } /** * Returns <code>true</code> if the given memory usage indicates that * memory is critically low. * * @param memoryUsage * @param d the percentage of used memory to max available memory that is * the threshold to be considered critical. e.g. If this is 0.9, that means * if the used memory is 90% or higher of the max, then there is * a critical shortest of free memory and true will be returned * @param type the type of memory * * @return <code>true</code> if the amount of used memory is over the threshold */ private boolean isCriticallyLow(MemoryUsage memoryUsage, float thresholdPercentage, String type) { final long used = memoryUsage.getUsed(); final long max = memoryUsage.getMax(); if ((max > -1) && (used > (max * thresholdPercentage))) { LOG.warn(AgentI18NResourceKeys.VM_HEALTH_CHECK_THREAD_MEM_LOW, type, thresholdPercentage, memoryUsage); return true; } return false; } /** * Gets a list of all the memory pool MBeans that are to be monitored. * * @return the list of MBeans that need to be monitored */ private List<MemoryPoolMXBean> getMemoryPoolMXBeansToMonitor() { final List<MemoryPoolMXBean> memoryPoolMxBeansToMonitor = new ArrayList<MemoryPoolMXBean>(); if (!this.memoryPoolsToMonitor.isEmpty()) { final List<MemoryPoolMXBean> allMemoryPoolMxBeans = ManagementFactory.getMemoryPoolMXBeans(); for (MemoryPoolMXBean memoryPoolMXBean : allMemoryPoolMxBeans) { if (this.memoryPoolsToMonitor.contains(memoryPoolMXBean.getName().toLowerCase())) { memoryPoolMxBeansToMonitor.add(memoryPoolMXBean); } } } return memoryPoolMxBeansToMonitor; } }