/* * Licensed to the Apache Software Foundation (ASF) under one or more contributor license * agreements. See the NOTICE file distributed with this work for additional information regarding * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. You may obtain a * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. */ package org.apache.geode.admin.internal; import org.apache.geode.CancelException; import org.apache.geode.admin.*; import org.apache.geode.cache.CacheFactory; import org.apache.geode.distributed.internal.*; import org.apache.geode.internal.*; import org.apache.geode.internal.i18n.LocalizedStrings; import org.apache.geode.internal.cache.CachePerfStats; import org.apache.geode.internal.cache.GemFireCacheImpl; import org.apache.geode.internal.statistics.GemFireStatSampler; import org.apache.geode.internal.statistics.platform.ProcessStats; import java.util.*; /** * Contains the logic for evaluating the health of a GemFire distributed system member according to * the thresholds provided in a {@link MemberHealthConfig}. * * @see VMStats * @see ProcessStats * @see DMStats * * * @since GemFire 3.5 */ /** * */ class MemberHealthEvaluator extends AbstractHealthEvaluator { /** The config from which we get the evaluation criteria */ private MemberHealthConfig config; /** The description of the member being evaluated */ private String description; // /** Statistics about this VM (may be null) */ // private VMStatsContract vmStats; /** Statistics about this process (may be null) */ private ProcessStats processStats; /** Statistics about the distribution manager */ private DMStats dmStats; /** The previous value of the reply timeouts stat */ private long prevReplyTimeouts; ////////////////////// Constructors ////////////////////// /** * Creates a new <code>MemberHealthEvaluator</code> */ MemberHealthEvaluator(GemFireHealthConfig config, DM dm) { super(config, dm); this.config = config; InternalDistributedSystem system = dm.getSystem(); GemFireStatSampler sampler = system.getStatSampler(); if (sampler != null) { // Sampling is enabled // this.vmStats = sampler.getVMStats(); this.processStats = sampler.getProcessStats(); } this.dmStats = dm.getStats(); StringBuffer sb = new StringBuffer(); sb.append("Application VM member "); sb.append(dm.getId()); int pid = OSProcess.getId(); if (pid != 0) { sb.append(" with pid "); sb.append(pid); } this.description = sb.toString(); } //////////////////// Instance Methods //////////////////// @Override protected String getDescription() { return this.description; } /** * Checks to make sure that the {@linkplain ProcessStats#getProcessSize VM's process size} is less * than the {@linkplain MemberHealthConfig#getMaxVMProcessSize threshold}. If not, the status is * "okay" health. */ void checkVMProcessSize(List status) { // There is no need to check isFirstEvaluation() if (this.processStats == null) { return; } long vmSize = this.processStats.getProcessSize(); long threshold = this.config.getMaxVMProcessSize(); if (vmSize > threshold) { String s = LocalizedStrings.MemberHealthEvaluator_THE_SIZE_OF_THIS_VM_0_MEGABYTES_EXCEEDS_THE_THRESHOLD_1_MEGABYTES .toLocalizedString(new Object[] {Long.valueOf(vmSize), Long.valueOf(threshold)}); status.add(okayHealth(s)); } } /** * Checks to make sure that the size of the distribution manager's * {@linkplain DMStats#getOverflowQueueSize() overflow} message queue does not exceed the * {@linkplain MemberHealthConfig#getMaxMessageQueueSize threshold}. If not, the status is "okay" * health. */ void checkMessageQueueSize(List status) { long threshold = this.config.getMaxMessageQueueSize(); long overflowSize = this.dmStats.getOverflowQueueSize(); if (overflowSize > threshold) { String s = LocalizedStrings.MemberHealthEvaluator_THE_SIZE_OF_THE_OVERFLOW_QUEUE_0_EXCEEDS_THE_THRESHOLD_1 .toLocalizedString( new Object[] {Long.valueOf(overflowSize), Long.valueOf(threshold)}); status.add(okayHealth(s)); } } /** * Checks to make sure that the number of {@linkplain DMStats#getReplyTimeouts reply timeouts} * does not exceed the {@linkplain MemberHealthConfig#getMaxReplyTimeouts threshold}. If not, the * status is "okay" health. */ void checkReplyTimeouts(List status) { if (isFirstEvaluation()) { return; } long threshold = this.config.getMaxReplyTimeouts(); long deltaReplyTimeouts = this.dmStats.getReplyTimeouts() - prevReplyTimeouts; if (deltaReplyTimeouts > threshold) { String s = LocalizedStrings.MemberHealthEvaluator_THE_NUMBER_OF_MESSAGE_REPLY_TIMEOUTS_0_EXCEEDS_THE_THRESHOLD_1 .toLocalizedString( new Object[] {Long.valueOf(deltaReplyTimeouts), Long.valueOf(threshold)}); status.add(okayHealth(s)); } } /** * See if the multicast retransmission ratio is okay */ void checkRetransmissionRatio(List status) { double threshold = this.config.getMaxRetransmissionRatio(); int mcastMessages = this.dmStats.getMcastWrites(); if (mcastMessages > 100000) { // avoid initial state & int overflow // the ratio we actually use here is (retransmit requests) / (mcast datagram writes) // a single retransmit request may include multiple missed messages double ratio = (this.dmStats.getMcastRetransmits() * 1.0) / (this.dmStats.getMcastWrites() * 1.0); if (ratio > threshold) { String s = "The number of message retransmissions (" + ratio + ") exceeds the threshold (" + threshold + ")"; status.add(okayHealth(s)); } } } /** * The function keeps updating the health of the cache based on roles required by the regions and * their reliablity policies. * */ void checkCacheRequiredRolesMeet(List status) { // will have to call here okeyHealth() or poorHealth() // GemFireCache cache = (GemFireCache)CacheFactory.getAnyInstance(); // CachePerfStats cPStats= null; try { GemFireCacheImpl cache = (GemFireCacheImpl) CacheFactory.getAnyInstance(); CachePerfStats cPStats = null; cPStats = cache.getCachePerfStats(); if (cPStats.getReliableRegionsMissingFullAccess() > 0) { // health is okay. int numRegions = cPStats.getReliableRegionsMissingFullAccess(); status.add(okayHealth( LocalizedStrings.MemberHealthEvaluator_THERE_ARE_0_REGIONS_MISSING_REQUIRED_ROLES_BUT_ARE_CONFIGURED_FOR_FULL_ACCESS .toLocalizedString(Integer.valueOf(numRegions)))); } else if (cPStats.getReliableRegionsMissingLimitedAccess() > 0) { // health is poor int numRegions = cPStats.getReliableRegionsMissingLimitedAccess(); status.add(poorHealth( LocalizedStrings.MemberHealthEvaluator_THERE_ARE_0_REGIONS_MISSING_REQUIRED_ROLES_AND_CONFIGURED_WITH_LIMITED_ACCESS .toLocalizedString(Integer.valueOf(numRegions)))); } else if (cPStats.getReliableRegionsMissingNoAccess() > 0) { // health is poor int numRegions = cPStats.getReliableRegionsMissingNoAccess(); status.add(poorHealth( LocalizedStrings.MemberHealthEvaluator_THERE_ARE_0_REGIONS_MISSING_REQUIRED_ROLES_AND_CONFIGURED_WITHOUT_ACCESS .toLocalizedString(Integer.valueOf(numRegions)))); } // else{ // health is good/okay // status.add(okayHealth("All regions have there required roles meet")); // } } catch (CancelException ignore) { } } /** * Updates the previous values of statistics */ private void updatePrevious() { this.prevReplyTimeouts = this.dmStats.getReplyTimeouts(); } @Override protected void check(List status) { checkVMProcessSize(status); checkMessageQueueSize(status); checkReplyTimeouts(status); // will have to add another call to check for roles // missing and reliablity attributed. checkCacheRequiredRolesMeet(status); updatePrevious(); } @Override void close() { } }