/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.util; import java.lang.management.GarbageCollectorMXBean; import java.lang.management.ManagementFactory; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.base.Stopwatch; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; /** * Class which sets up a simple thread which runs in a loop sleeping * for a short interval of time. If the sleep takes significantly longer * than its target time, it implies that the JVM or host machine has * paused processing, which may cause other problems. If such a pause is * detected, the thread logs a message. */ @InterfaceAudience.Private public class JvmPauseMonitor { private static final Log LOG = LogFactory.getLog( JvmPauseMonitor.class); /** The target sleep time */ private static final long SLEEP_INTERVAL_MS = 500; /** log WARN if we detect a pause longer than this threshold */ private final long warnThresholdMs; private static final String WARN_THRESHOLD_KEY = "jvm.pause.warn-threshold.ms"; private static final long WARN_THRESHOLD_DEFAULT = 10000; /** log INFO if we detect a pause longer than this threshold */ private final long infoThresholdMs; private static final String INFO_THRESHOLD_KEY = "jvm.pause.info-threshold.ms"; private static final long INFO_THRESHOLD_DEFAULT = 1000; private Thread monitorThread; private volatile boolean shouldRun = true; public JvmPauseMonitor(Configuration conf) { this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT); this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT); } public void start() { Preconditions.checkState(monitorThread == null, "Already started"); monitorThread = new Daemon(new Monitor()); monitorThread.start(); } public void stop() { shouldRun = false; monitorThread.interrupt(); try { monitorThread.join(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } private String formatMessage(long extraSleepTime, Map<String, GcTimes> gcTimesAfterSleep, Map<String, GcTimes> gcTimesBeforeSleep) { Set<String> gcBeanNames = Sets.intersection( gcTimesAfterSleep.keySet(), gcTimesBeforeSleep.keySet()); List<String> gcDiffs = Lists.newArrayList(); for (String name : gcBeanNames) { GcTimes diff = gcTimesAfterSleep.get(name).subtract( gcTimesBeforeSleep.get(name)); if (diff.gcCount != 0) { gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString()); } } String ret = "Detected pause in JVM or host machine (eg GC): " + "pause of approximately " + extraSleepTime + "ms\n"; if (gcDiffs.isEmpty()) { ret += "No GCs detected"; } else { ret += Joiner.on("\n").join(gcDiffs); } return ret; } private Map<String, GcTimes> getGcTimes() { Map<String, GcTimes> map = Maps.newHashMap(); List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans(); for (GarbageCollectorMXBean gcBean : gcBeans) { map.put(gcBean.getName(), new GcTimes(gcBean)); } return map; } private static class GcTimes { private GcTimes(GarbageCollectorMXBean gcBean) { gcCount = gcBean.getCollectionCount(); gcTimeMillis = gcBean.getCollectionTime(); } private GcTimes(long count, long time) { this.gcCount = count; this.gcTimeMillis = time; } private GcTimes subtract(GcTimes other) { return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis); } @Override public String toString() { return "count=" + gcCount + " time=" + gcTimeMillis + "ms"; } private long gcCount; private long gcTimeMillis; } private class Monitor implements Runnable { @Override public void run() { Stopwatch sw = new Stopwatch(); Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes(); while (shouldRun) { sw.reset().start(); try { Thread.sleep(SLEEP_INTERVAL_MS); } catch (InterruptedException ie) { return; } long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS; Map<String, GcTimes> gcTimesAfterSleep = getGcTimes(); if (extraSleepTime > warnThresholdMs) { LOG.warn(formatMessage( extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep)); } else if (extraSleepTime > infoThresholdMs) { LOG.info(formatMessage( extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep)); } gcTimesBeforeSleep = gcTimesAfterSleep; } } } /** * Simple 'main' to facilitate manual testing of the pause monitor. * * This main function just leaks memory into a list. Running this class * with a 1GB heap will very quickly go into "GC hell" and result in * log messages about the GC pauses. */ public static void main(String []args) throws Exception { new JvmPauseMonitor(new Configuration()).start(); List<String> list = Lists.newArrayList(); int i = 0; while (true) { list.add(String.valueOf(i++)); } } }