/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb; import org.apache.zookeeper_voltpatches.KeeperException; import org.voltcore.logging.VoltLogger; import org.voltcore.utils.CoreUtils; import org.voltdb.AuthSystem.AuthUser; import org.voltdb.client.BatchTimeoutOverrideType; import org.voltdb.client.ClientResponse; import org.voltdb.client.SyncCallback; import org.voltdb.compiler.deploymentfile.DrRoleType; import org.voltdb.compiler.deploymentfile.ResourceMonitorType; import org.voltdb.compiler.deploymentfile.SystemSettingsType; import org.voltdb.snmp.FaultFacility; import org.voltdb.snmp.SnmpTrapSender; import org.voltdb.snmp.ThresholdType; import org.voltdb.utils.MiscUtils; import org.voltdb.utils.PlatformProperties; import org.voltdb.utils.SystemStatsCollector; import org.voltdb.utils.SystemStatsCollector.Datum; import java.util.HashMap; import java.util.Map; import java.util.concurrent.ExecutionException; /** * Used to periodically check if the server's resource utilization is above the configured limits * and pause the server. */ public class HealthMonitor implements Runnable, Promotable { private static final VoltLogger m_logger = new VoltLogger("HOST"); private String m_rssLimitStr; private long m_rssLimit; private int m_resourceCheckInterval; private DiskResourceChecker m_diskLimitConfig; private boolean m_snmpMemoryTrapSent = false; private SnmpTrapSender m_snmpTrapSender; private String m_snmpRssLimitStr; private long m_snmpRssLimit; private ThresholdType m_snmpRssCriteria; private Map<Byte,Boolean> m_snmpDRTrapSent = new HashMap<>(); private boolean m_isLeader; public HealthMonitor(SystemSettingsType systemSettings, SnmpTrapSender snmpTrapSender) { if (systemSettings == null || systemSettings.getResourcemonitor() == null) { return; } ResourceMonitorType config = systemSettings.getResourcemonitor(); m_resourceCheckInterval = config.getFrequency(); if (config.getMemorylimit() != null) { m_rssLimitStr = config.getMemorylimit().getSize().trim(); // configured value is in GB. Convert it to bytes double dblLimit = getMemoryLimitSize(m_rssLimitStr); m_rssLimit = Double.valueOf(dblLimit).longValue(); } m_diskLimitConfig = new DiskResourceChecker(systemSettings.getResourcemonitor().getDisklimit(), snmpTrapSender); // for snmp trap m_snmpTrapSender = snmpTrapSender; if (config.getMemorylimit() != null) { m_snmpRssLimitStr = config.getMemorylimit().getAlert().trim(); // configured value is in GB. Convert it to bytes double dblLimit = getMemoryLimitSize(m_snmpRssLimitStr); m_snmpRssLimit = Double.valueOf(dblLimit).longValue(); m_snmpRssCriteria = m_snmpRssLimitStr.endsWith("%") ? ThresholdType.PERCENT : ThresholdType.LIMIT; } } public boolean hasResourceLimitsConfigured() { return ((m_rssLimit > 0 || m_snmpRssLimit > 0 || (m_diskLimitConfig!=null && m_diskLimitConfig.hasLimitsConfigured())) && m_resourceCheckInterval > 0); } public int getResourceCheckInterval() { return m_resourceCheckInterval; } public void logResourceLimitConfigurationInfo() { if (hasResourceLimitsConfigured()) { m_logger.info("Resource limit monitoring configured to run every " + m_resourceCheckInterval + " seconds"); if (m_rssLimit > 0) { m_logger.info("RSS limit: " + getRssLimitLogString(m_rssLimit, m_rssLimitStr)); } if (MiscUtils.isPro() && m_snmpRssLimit > 0) { m_logger.info("RSS SNMP notification limit: " + getRssLimitLogString(m_snmpRssLimit, m_snmpRssLimitStr)); } if (m_diskLimitConfig!=null) { m_diskLimitConfig.logConfiguredLimits(); } } else { m_logger.info("No resource usage limit monitoring configured"); } } private String getRssLimitLogString(long rssLimit, String rssLimitStr) { String rssWithUnit = getValueWithUnit(rssLimit); return (rssLimitStr.endsWith("%") ? rssLimitStr + " (" + rssWithUnit + ")" : rssWithUnit); } @Override public void run() { if (getClusterOperationMode() != OperationMode.RUNNING) { return; } // check DRRole stats if it's responsible if (m_isLeader) { checkDRRole(); } if (isOverMemoryLimit() || m_diskLimitConfig.isOverLimitConfiguration()) { SyncCallback cb = new SyncCallback(); if (getConnectionHadler().callProcedure(getInternalUser(), true, BatchTimeoutOverrideType.NO_TIMEOUT, cb, "@Pause")) { try { cb.waitForResponse(); } catch (InterruptedException e) { m_logger.error("Interrupted while pausing cluster for resource overusage", e); return; } ClientResponse r = cb.getResponse(); if (r.getStatus() != ClientResponse.SUCCESS) { m_logger.error("Unable to pause cluster for resource overusage: " + r.getStatusString()); } } else { m_logger.error("Unable to pause cluster for resource overusage: failed to invoke @Pause"); } } } private OperationMode getClusterOperationMode() { return VoltDB.instance().getMode(); } private InternalConnectionHandler getConnectionHadler() { return VoltDB.instance().getClientInterface().getInternalConnectionHandler(); } private AuthUser getInternalUser() { return VoltDB.instance().getCatalogContext().authSystem.getInternalAdminUser(); } private void checkDRRole() { SyncCallback cb = new SyncCallback(); if (getConnectionHadler().callProcedure(getInternalUser(), false, BatchTimeoutOverrideType.NO_TIMEOUT, cb, "@Statistics", "DRROLE",0)) { try { cb.waitForResponse(); } catch (InterruptedException e) { m_logger.error("Interrupted while retrieving cluster for DRROLE STATS", e); return; } ClientResponse r = cb.getResponse(); if (r.getStatus() != ClientResponse.SUCCESS) { // timeout could happen if hostdown if (m_logger.isDebugEnabled()) { m_logger.debug("Unable to retrieve DRROLE STATS: " + r.getStatusString()); } return; } VoltTable result = r.getResults()[0]; while (result.advanceRow()) { DrRoleType drRole = DrRoleType.fromValue(result.getString(DRRoleStats.CN_ROLE).toLowerCase()); DRRoleStats.State state = DRRoleStats.State.valueOf(result.getString(DRRoleStats.CN_STATE)); byte remoteCluster = (byte) result.getLong(DRRoleStats.CN_REMOTE_CLUSTER_ID); if (m_logger.isDebugEnabled()) { m_logger.debug("DRROLE stats: Role:" + drRole + " State:" + state + " Remote Cluster ID:" + remoteCluster); } if (drRole == DrRoleType.NONE) { continue; } if (DRRoleStats.State.STOPPED == state) { if (!m_snmpDRTrapSent.getOrDefault(remoteCluster, false)) { m_snmpTrapSender.statistics(FaultFacility.DR, String.format("Database Replication ROLE: %s break with Remote Cluster %d.", drRole, remoteCluster)); m_snmpDRTrapSent.put(remoteCluster, true); } } else { // reset if (m_snmpDRTrapSent.getOrDefault(remoteCluster, false)) { m_snmpDRTrapSent.put(remoteCluster, false); } } } } else { m_logger.error("Unable to retrieve DRROLE STATS:: failed to invoke @Statistics DRROLE, 0."); } } private boolean isOverMemoryLimit() { if (m_rssLimit<=0 && m_snmpRssLimit<=0) { return false; } Datum datum = SystemStatsCollector.getRecentSample(); if (datum == null) { // this will be null if stats has not run yet m_logger.warn("No stats are available from stats collector. Skipping resource check."); return false; } if (m_logger.isDebugEnabled()) { m_logger.debug("RSS=" + datum.rss + " Configured rss limit=" + m_rssLimit + " Configured SNMP rss limit=" + m_snmpRssLimit); } if (MiscUtils.isPro()) { if (m_snmpRssLimit > 0 && datum.rss >= m_snmpRssLimit) { if (!m_snmpMemoryTrapSent) { m_snmpTrapSender.resource(m_snmpRssCriteria, FaultFacility.MEMORY, m_snmpRssLimit, datum.rss, String.format("SNMP resource limit exceeded. RSS limit %s on %s. Current RSS size %s.", getRssLimitLogString(m_snmpRssLimit, m_snmpRssLimitStr), CoreUtils.getHostnameOrAddress(), getValueWithUnit(datum.rss))); m_snmpMemoryTrapSent = true; } } else { if (m_snmpRssLimit > 0 && m_snmpMemoryTrapSent) { m_snmpTrapSender.resourceClear(m_snmpRssCriteria, FaultFacility.MEMORY, m_snmpRssLimit, datum.rss, String.format("SNMP resource limit cleared. RSS limit %s on %s. Current RSS size %s.", getRssLimitLogString(m_snmpRssLimit, m_snmpRssLimitStr), CoreUtils.getHostnameOrAddress(), getValueWithUnit(datum.rss))); m_snmpMemoryTrapSent = false; } } } if (m_rssLimit > 0 && datum.rss >= m_rssLimit) { m_logger.error(String.format( "Resource limit exceeded. RSS limit %s on %s. Setting database to read-only. " + "Use \"voltadmin resume\" command once resource constraint is corrected.", getRssLimitLogString(m_rssLimit,m_rssLimitStr), CoreUtils.getHostnameOrAddress())); m_logger.error(String.format("Resource limit exceeded. Current RSS size %s.", getValueWithUnit(datum.rss))); return true; } else { return false; } } public static String getValueWithUnit(long value) { if (value >= 1073741824L) { return String.format("%.2f GB", (value/1073741824.0)); } else if (value >= 1048576) { return String.format("%.2f MB", (value/1048576.0)); } else { return value + " bytes"; } } // package-private for junit double getMemoryLimitSize(String sizeStr) { if (sizeStr==null || sizeStr.length()==0) { return 0; } try { if (sizeStr.charAt(sizeStr.length()-1)=='%') { // size as a percentage of total available memory int perc = Integer.parseInt(sizeStr.substring(0, sizeStr.length()-1)); if (perc<0 || perc > 99) { throw new IllegalArgumentException("Invalid memory limit percentage: " + sizeStr); } return PlatformProperties.getPlatformProperties().ramInMegabytes*1048576L*perc/100.0; } else { // size in GB double size = Double.parseDouble(sizeStr)*1073741824L; if (size<0) { throw new IllegalArgumentException("Invalid memory limit value: " + sizeStr); } return size; } } catch(NumberFormatException e) { throw new IllegalArgumentException("Invalid memory limit value " + sizeStr + ". Memory limit must be configued as a percentage of total available memory or as GB value"); } } @Override public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException { m_isLeader = true; } }