/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.geode.internal.cache;
import org.apache.geode.cache.DiskAccessException;
import org.apache.geode.distributed.internal.DistributionConfig;
import org.apache.geode.distributed.internal.InternalDistributedSystem;
import org.apache.geode.internal.i18n.LocalizedStrings;
import org.apache.geode.internal.logging.LogService;
import org.apache.geode.internal.logging.LoggingThreadGroup;
import org.apache.geode.internal.logging.log4j.LocalizedMessage;
import org.apache.geode.internal.logging.log4j.LogMarker;
import org.apache.logging.log4j.Logger;
import java.io.File;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.*;
public class DiskStoreMonitor {
private static final Logger logger = LogService.getLogger();
// Master kill-switch: when this system property is set, no executor is created and no
// usage checks ever run (see the constructor).
private static final boolean DISABLE_MONITOR =
Boolean.getBoolean(DistributionConfig.GEMFIRE_PREFIX + "DISK_USAGE_DISABLE_MONITORING");
// private static final boolean AUTO_RECONNECT =
// Boolean.getBoolean("gemfire.DISK_USAGE_ENABLE_AUTO_RECONNECT");
// Delay between successive usage polls, in milliseconds (default 10s).
private static final int USAGE_CHECK_INTERVAL = Integer
.getInteger(DistributionConfig.GEMFIRE_PREFIX + "DISK_USAGE_POLLING_INTERVAL_MILLIS", 10000);
// Usage percentage at which the log directory triggers a warning. Note: the property is
// read as an Integer (whole percent, default 99) and widened to float.
private static final float LOG_WARNING_THRESHOLD_PCT =
Integer.getInteger(DistributionConfig.GEMFIRE_PREFIX + "DISK_USAGE_LOG_WARNING_PERCENT", 99);
/**
 * Health classification for a monitored directory, ordered from healthy to unusable.
 */
enum DiskState {
  NORMAL, WARN, CRITICAL;

  /**
   * Classifies a disk based on its current usage against the configured thresholds.
   * A threshold of zero (or less) disables that level entirely; in particular the
   * below-minimum-space condition is only honored while the critical level is enabled.
   *
   * @param actual current usage percentage
   * @param warn warning threshold percentage, {@code <= 0} disables
   * @param critical critical threshold percentage, {@code <= 0} disables
   * @param belowMinimum true if free space has dropped under the required minimum
   * @return the state matching the supplied readings
   */
  public static DiskState select(double actual, double warn, double critical,
      boolean belowMinimum) {
    boolean criticalEnabled = critical > 0;
    if (criticalEnabled && (belowMinimum || actual > critical)) {
      return CRITICAL;
    }
    if (warn > 0 && actual > warn) {
      return WARN;
    }
    return NORMAL;
  }
}
/**
 * Validates that a warning threshold is a legal percentage.
 *
 * @param val the candidate warning percentage
 * @throws IllegalArgumentException if {@code val} is outside the range [0, 100]
 */
public static void checkWarning(float val) {
  boolean outOfRange = val < 0 || val > 100;
  if (outOfRange) {
    throw new IllegalArgumentException(
        LocalizedStrings.DiskWriteAttributesFactory_DISK_USAGE_WARNING_INVALID_0
            .toLocalizedString(Float.valueOf(val)));
  }
}
/**
 * Validates that a critical threshold is a legal percentage.
 *
 * @param val the candidate critical percentage
 * @throws IllegalArgumentException if {@code val} is outside the range [0, 100]
 */
public static void checkCritical(float val) {
  boolean outOfRange = val < 0 || val > 100;
  if (outOfRange) {
    throw new IllegalArgumentException(
        LocalizedStrings.DiskWriteAttributesFactory_DISK_USAGE_CRITICAL_INVALID_0
            .toLocalizedString(Float.valueOf(val)));
  }
}
// Single-threaded scheduler driving the periodic checks; null when monitoring is disabled.
private final ScheduledExecutorService exec;
// Per disk store, the set of directory usages being watched.
private final Map<DiskStoreImpl, Set<DirectoryHolderUsage>> disks;
// The log directory is always monitored, independent of any disk store.
private final LogUsage logDisk;
// // this is set when we go into auto_reconnect mode
// private volatile DirectoryHolderUsage criticalDisk;
// Test hook: when set, invoked on every disk state transition (see DirectoryHolderUsage).
volatile DiskStateAction _testAction;
// Callback interface used by tests to observe state transitions.
interface DiskStateAction {
void handleDiskStateChange(DiskState state);
}
/**
 * Creates the monitor and, unless monitoring is disabled via system property, starts a
 * single daemon thread that polls disk usage every {@code USAGE_CHECK_INTERVAL} ms.
 */
public DiskStoreMonitor() {
  disks = new ConcurrentHashMap<DiskStoreImpl, Set<DirectoryHolderUsage>>();
  logDisk = new LogUsage(getLogDir());

  if (logger.isTraceEnabled(LogMarker.DISK_STORE_MONITOR)) {
    logger.trace(LogMarker.DISK_STORE_MONITOR, "Disk monitoring is {}",
        (DISABLE_MONITOR ? "disabled" : "enabled"));
    logger.trace(LogMarker.DISK_STORE_MONITOR, "Log directory usage warning is set to {}%",
        LOG_WARNING_THRESHOLD_PCT);
    logger.trace(LogMarker.DISK_STORE_MONITOR, "Scheduling disk usage checks every {} ms",
        USAGE_CHECK_INTERVAL);
  }

  if (DISABLE_MONITOR) {
    exec = null;
  } else {
    final ThreadGroup monitorGroup = LoggingThreadGroup.createThreadGroup(
        LocalizedStrings.DiskStoreMonitor_ThreadGroup.toLocalizedString(), logger);
    exec = Executors.newScheduledThreadPool(1, new ThreadFactory() {
      @Override
      public Thread newThread(Runnable task) {
        Thread monitorThread = new Thread(monitorGroup, task, "DiskStoreMonitor");
        monitorThread.setDaemon(true);
        return monitorThread;
      }
    });
    // The log directory is watched even when no disk stores are registered.
    exec.scheduleWithFixedDelay(new Runnable() {
      @Override
      public void run() {
        try {
          checkUsage();
        } catch (Exception e) {
          logger.error(LocalizedMessage.create(LocalizedStrings.DiskStoreMonitor_ERR), e);
        }
      }
    }, 0, USAGE_CHECK_INTERVAL, TimeUnit.MILLISECONDS);
  }
}
/**
 * Registers a disk store so that every one of its directories is included in the
 * periodic usage checks. Replaces any previous registration for the same store.
 *
 * @param ds the disk store to monitor
 */
public void addDiskStore(DiskStoreImpl ds) {
  if (logger.isTraceEnabled(LogMarker.DISK_STORE_MONITOR)) {
    logger.trace(LogMarker.DISK_STORE_MONITOR, "Now monitoring disk store {}", ds.getName());
  }
  Set<DirectoryHolderUsage> usages = new HashSet<DirectoryHolderUsage>();
  for (DirectoryHolder holder : ds.getDirectoryHolders()) {
    usages.add(new DirectoryHolderUsage(ds, holder));
  }
  disks.put(ds, usages);
}
/**
 * Stops monitoring the given disk store's directories. Safe to call for a store that
 * was never registered.
 *
 * @param ds the disk store to stop monitoring
 */
public void removeDiskStore(DiskStoreImpl ds) {
  if (logger.isTraceEnabled(LogMarker.DISK_STORE_MONITOR)) {
    logger.trace(LogMarker.DISK_STORE_MONITOR, "No longer monitoring disk store {}",
        ds.getName());
  }
  disks.remove(ds);
}
/**
 * Reports whether a specific directory of a disk store is currently in the NORMAL state.
 * Unknown stores or directories are optimistically treated as healthy.
 *
 * @param ds the disk store owning the directory
 * @param dir the directory to query
 * @return true unless the directory is known and in WARN or CRITICAL state
 */
public boolean isNormal(DiskStoreImpl ds, DirectoryHolder dir) {
  Set<DirectoryHolderUsage> usages = disks.get(ds);
  if (usages == null) {
    return true;
  }
  for (DirectoryHolderUsage usage : usages) {
    if (usage.dir == dir) {
      return usage.getState() == DiskState.NORMAL;
    }
  }
  // Directory not tracked for this store: assume healthy.
  return true;
}
/**
 * Shuts the monitor down: cancels the polling task (if monitoring was enabled) and
 * forgets all registered disk stores.
 */
public void close() {
  if (exec != null) {
    exec.shutdownNow();
  }
  disks.clear();
}
/**
 * One polling pass: evaluates every directory of every registered disk store against
 * that store's thresholds, then checks the log directory. Once a directory of a store
 * goes CRITICAL, the remaining directories of that same store are skipped for this
 * pass (the store is being taken down anyway).
 */
private void checkUsage() {
  for (Entry<DiskStoreImpl, Set<DirectoryHolderUsage>> entry : disks.entrySet()) {
    DiskStoreImpl store = entry.getKey();
    for (DiskUsage usage : entry.getValue()) {
      DiskState result =
          usage.update(store.getDiskUsageWarningPercentage(),
              store.getDiskUsageCriticalPercentage());
      if (result == DiskState.CRITICAL) {
        break;
      }
    }
  }
  // The log directory warns at a fixed threshold and never goes critical below 100%.
  logDisk.update(LOG_WARNING_THRESHOLD_PCT, 100);
}
/**
 * Determines the directory containing the log file from the cache's distributed system
 * configuration. Falls back to the current working directory when no cache, system,
 * config, log file, or parent directory is available.
 *
 * @return the log directory, never null
 */
private File getLogDir() {
  GemFireCacheImpl cache = GemFireCacheImpl.getInstance();
  if (cache != null) {
    InternalDistributedSystem system = cache.getDistributedSystem();
    if (system != null) {
      DistributionConfig config = system.getConfig();
      if (config != null) {
        File logFile = config.getLogFile();
        if (logFile != null) {
          File parent = logFile.getParentFile();
          if (parent != null) {
            return parent;
          }
        }
      }
    }
  }
  // Nothing configured: assume the current directory.
  return new File(".");
}
/**
 * Tracks the usage state of one directory. Subclasses supply the directory, the minimum
 * free space, statistics recording, and the reaction to state transitions.
 */
abstract class DiskUsage {
  // Last computed state; reads and writes are synchronized on this instance.
  private DiskState state;

  DiskUsage() {
    state = DiskState.NORMAL;
  }

  /** Returns the most recently computed state for this directory. */
  public synchronized DiskState getState() {
    return state;
  }

  /**
   * Probes the filesystem, recomputes the state against the supplied thresholds, and
   * invokes {@link #handleStateChange} if the state changed.
   *
   * @param warning warning threshold percentage, {@code <= 0} disables
   * @param critical critical threshold percentage, {@code <= 0} disables
   * @return the state after this check
   */
  public DiskState update(float warning, float critical) {
    DiskState previous;
    synchronized (this) {
      previous = state;
    }

    // Skip the filesystem probe entirely when both thresholds are disabled.
    if (!(warning > 0 || critical > 0)) {
      return previous;
    }

    File directory = dir();
    if (!directory.exists()) {
      if (logger.isTraceEnabled(LogMarker.DISK_STORE_MONITOR)) {
        logger.trace(LogMarker.DISK_STORE_MONITOR, "Skipping check of non-existent directory {}",
            directory.getAbsolutePath());
      }
      return previous;
    }

    long minMegabytes = getMinimumSpace();
    if (logger.isTraceEnabled(LogMarker.DISK_STORE_MONITOR)) {
      logger.trace(LogMarker.DISK_STORE_MONITOR,
          "Checking usage for directory {}, minimum free space is {} MB",
          directory.getAbsolutePath(), minMegabytes);
    }

    // Time the filesystem calls so subclasses can record how long the probe took.
    long start = System.nanoTime();
    long usableBytes = directory.getUsableSpace();
    long totalBytes = directory.getTotalSpace();
    long elapsed = System.nanoTime() - start;

    double usedPct = 100.0 * (totalBytes - usableBytes) / totalBytes;
    recordStats(totalBytes, usableBytes, elapsed);

    String pct = Math.round(usedPct) + "%";
    if (logger.isTraceEnabled(LogMarker.DISK_STORE_MONITOR)) {
      logger.trace(LogMarker.DISK_STORE_MONITOR,
          "Directory {} has {} bytes free out of {} ({} usage)", directory.getAbsolutePath(),
          usableBytes, totalBytes, pct);
    }

    boolean belowMin = usableBytes < 1024 * 1024 * minMegabytes;
    DiskState next = DiskState.select(usedPct, warning, critical, belowMin);
    if (next == previous) {
      return next;
    }

    synchronized (this) {
      state = next;
    }
    // The transition callback runs outside the lock.
    handleStateChange(next, pct);
    return next;
  }

  /** The directory whose usage is being checked. */
  protected abstract File dir();

  /** Minimum free space required, in megabytes. */
  protected abstract long getMinimumSpace();

  /** Records one volume check in the owner's statistics. */
  protected abstract void recordStats(long total, long free, long elapsed);

  /** Invoked whenever the computed state differs from the previous one. */
  protected abstract void handleStateChange(DiskState next, String pct);
}
/**
 * Usage tracker for the log directory. Warns when usage crosses the configured
 * threshold; there is no critical action beyond logging (the caller passes 100 as
 * the critical threshold, so CRITICAL is also reported as a warning).
 */
class LogUsage extends DiskUsage {
  private final File dir;

  public LogUsage(File dir) {
    this.dir = dir;
  }

  // Fix: annotation was missing on this override of DiskUsage.handleStateChange.
  @Override
  protected void handleStateChange(DiskState next, String pct) {
    Object[] args = new Object[] {dir.getAbsolutePath(), pct};
    switch (next) {
      case NORMAL:
        logger.info(LogMarker.DISK_STORE_MONITOR,
            LocalizedMessage.create(LocalizedStrings.DiskStoreMonitor_LOG_DISK_NORMAL, args));
        break;
      case WARN:
      case CRITICAL:
        // Both elevated states produce the same warning; the log dir is never shut down.
        logger.warn(LogMarker.DISK_STORE_MONITOR,
            LocalizedMessage.create(LocalizedStrings.DiskStoreMonitor_LOG_DISK_WARNING, args));
        break;
    }
  }

  @Override
  protected long getMinimumSpace() {
    return DiskStoreImpl.MIN_DISK_SPACE_FOR_LOGS;
  }

  @Override
  protected File dir() {
    return dir;
  }

  @Override
  protected void recordStats(long total, long free, long elapsed) {
    // No statistics are kept for the log directory.
  }
}
/**
 * Usage tracker for one directory of a disk store. A transition to CRITICAL closes the
 * disk store by feeding it a {@link DiskAccessException}.
 */
class DirectoryHolderUsage extends DiskUsage {
  private final DiskStoreImpl disk;
  private final DirectoryHolder dir;

  public DirectoryHolderUsage(DiskStoreImpl disk, DirectoryHolder dir) {
    this.disk = disk;
    this.dir = dir;
  }

  // Fixes: annotation was missing on this override; the critical-usage message was
  // built unconditionally even though only the CRITICAL branch uses it; an empty
  // try block (left over from removed auto-reconnect code) wrapped the finally that
  // closed the store.
  @Override
  protected void handleStateChange(DiskState next, String pct) {
    if (_testAction != null) {
      logger.info(LogMarker.DISK_STORE_MONITOR, "Invoking test handler for state change to {}",
          next);
      _testAction.handleDiskStateChange(next);
    }
    Object[] args = new Object[] {dir.getDir(), disk.getName(), pct};
    switch (next) {
      case NORMAL:
        logger.warn(LogMarker.DISK_STORE_MONITOR,
            LocalizedMessage.create(LocalizedStrings.DiskStoreMonitor_DISK_NORMAL, args));
        break;
      case WARN:
        logger.warn(LogMarker.DISK_STORE_MONITOR,
            LocalizedMessage.create(LocalizedStrings.DiskStoreMonitor_DISK_WARNING, args));
        break;
      case CRITICAL:
        logger.error(LogMarker.DISK_STORE_MONITOR,
            LocalizedMessage.create(LocalizedStrings.DiskStoreMonitor_DISK_CRITICAL, args));
        String msg = "Critical disk usage threshold exceeded for volume "
            + dir.getDir().getAbsolutePath() + ": " + pct + " full";
        // Pull the plug on the disk store.
        disk.handleDiskAccessException(new DiskAccessException(msg, disk));
        break;
    }
  }

  @Override
  protected File dir() {
    return dir.getDir();
  }

  @Override
  protected long getMinimumSpace() {
    // Enough headroom for the logs plus one full oplog.
    return DiskStoreImpl.MIN_DISK_SPACE_FOR_LOGS + disk.getMaxOplogSize();
  }

  @Override
  protected void recordStats(long total, long free, long elapsed) {
    dir.getDiskDirectoryStats().addVolumeCheck(total, free, elapsed);
  }
}
}