/*
* Copyright (c) 2015 EMC Corporation
* All Rights Reserved
*/
package com.emc.storageos.systemservices.impl.util;
import com.emc.storageos.coordinator.client.model.Site;
import com.emc.storageos.coordinator.client.model.SiteNetworkState;
import com.emc.storageos.coordinator.client.model.SiteNetworkState.NetworkHealth;
import com.emc.storageos.coordinator.client.model.SiteState;
import com.emc.storageos.coordinator.client.service.CoordinatorClient;
import com.emc.storageos.coordinator.client.service.DrUtil;
import com.emc.storageos.services.util.AlertsLogger;
import com.emc.storageos.services.util.Waiter;
import java.text.DecimalFormat;
import java.text.MessageFormat;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
/**
* A thread started in syssvc to monitor network health between the active and standby sites.
* Network health is determined by checking socket connection latency to SOCKET_TEST_PORT.
* If latency is at most 150ms then network health is "Good"; if it is greater, network health is "Slow".
* If the testPing times out or fails to connect, then ping is -1 and network health is "Broken".
*/
public class DrSiteNetworkMonitor extends DrHealthMonitor {
    private static final Logger _log = LoggerFactory.getLogger(DrSiteNetworkMonitor.class);

    /** Port used for the socket connection latency probe. */
    private static final int SOCKET_TEST_PORT = 443;
    /** Latency (in ms) above which a site's network is reported as SLOW. */
    private static final int NETWORK_SLOW_THRESHOLD = 150;
    /** Timeout value passed to each latency probe (10 * 1000). */
    private static final int NETWORK_TIMEOUT = 10 * 1000;
    /** Number of probes issued per site when the first probe succeeds. */
    private static final int PING_SAMPLES = 3;

    private AlertsLogger _alertLog = AlertsLogger.getAlertsLogger();

    @Autowired
    private MailHandler mailHandler;

    @Autowired
    private DrUtil drUtil;

    @Autowired
    private CoordinatorClient coordinatorClient;

    public DrSiteNetworkMonitor() {
    }

    /**
     * Runs one monitoring pass. The check is only performed when this site is
     * eligible (see {@link #shouldStartOnCurrentSite()}) and this node is the leader node.
     */
    @Override
    public void tick() {
        if (shouldStartOnCurrentSite() && drUtil.isLeaderNode()) {
            checkPing();
        }
    }

    /**
     * Whether we should bring up network monitor. Only active site (or degraded), or paused standby site
     * need run network monitor.
     *
     * @return true if we should start it
     */
    private boolean shouldStartOnCurrentSite() {
        if (drUtil.isActiveSite()) {
            return true;
        }

        SiteState state = drUtil.getLocalSite().getState();
        if (state == SiteState.STANDBY_PAUSED || state == SiteState.ACTIVE_DEGRADED) {
            return true;
        }

        _log.debug("This site is not active site or standby paused, no need to do network monitor");
        return false;
    }

    /**
     * Probes every remote site, persists the measured latency and derived health for each of them,
     * and raises alerts when a site's network is slow or broken.
     */
    private void checkPing() {
        resetLocalSiteNetworkState();

        for (Site site : drUtil.listSites()) {
            if (drUtil.isLocalSite(site)) {
                continue; // skip local site
            }

            SiteNetworkState siteNetworkState = drUtil.getSiteNetworkState(site.getUuid());
            // Remember the health before this pass so that the mail alert fires only on a transition to BROKEN.
            NetworkHealth previousState = siteNetworkState.getNetworkHealth();

            double ping = measureLatency(site.getVipEndPoint());
            _log.info("Ping: {}", ping);

            siteNetworkState.setNetworkLatencyInMs(ping);
            siteNetworkState.setNetworkHealth(classifyAndLog(site, ping));
            coordinatorClient.setTargetInfo(site.getUuid(), siteNetworkState);

            notifyIfNewlyBroken(site, previousState, siteNetworkState.getNetworkHealth());
        }
    }

    /**
     * Ensures the local site's own network state is recorded as GOOD with zero latency,
     * writing to the coordinator only when the stored value differs.
     */
    private void resetLocalSiteNetworkState() {
        Site localSite = drUtil.getLocalSite();
        SiteNetworkState localNetworkState = drUtil.getSiteNetworkState(localSite.getUuid());

        boolean alreadyGood = NetworkHealth.GOOD.equals(localNetworkState.getNetworkHealth())
                && localNetworkState.getNetworkLatencyInMs() == 0;
        if (alreadyGood) {
            return;
        }

        localNetworkState.setNetworkLatencyInMs(0);
        localNetworkState.setNetworkHealth(NetworkHealth.GOOD);
        coordinatorClient.setTargetInfo(localSite.getUuid(), localNetworkState);
    }

    /**
     * Measures the connection latency to the given host.
     *
     * If the first probe fails, no further probes are issued and -1 is returned. Otherwise
     * {@code PING_SAMPLES} probes are issued in total and the average of the successful ones is
     * returned, rounded to 3 decimal places. Failed follow-up probes are excluded from the average
     * so that the -1 failure marker does not distort the reported latency.
     *
     * @param host the endpoint to probe
     * @return average latency of the successful probes, or -1 if the first probe failed
     */
    private double measureLatency(String host) {
        double first = drUtil.testPing(host, SOCKET_TEST_PORT, NETWORK_TIMEOUT);
        if (first < 0) {
            return -1;
        }

        double total = first;
        int successful = 1;
        for (int i = 1; i < PING_SAMPLES; i++) {
            double sample = drUtil.testPing(host, SOCKET_TEST_PORT, NETWORK_TIMEOUT);
            if (sample >= 0) {
                total += sample;
                successful++;
            }
        }

        // Round arithmetically: formatting and re-parsing the number breaks in locales
        // whose decimal separator is not '.'.
        return Math.round(total / successful * 1000.0) / 1000.0;
    }

    /**
     * Maps a measured latency to a network health value, logging and raising an alert
     * for the SLOW and BROKEN cases.
     *
     * @param site the remote site that was probed
     * @param ping measured latency; a negative value means the probe failed
     * @return BROKEN for a negative latency, SLOW above {@code NETWORK_SLOW_THRESHOLD}, GOOD otherwise
     */
    private NetworkHealth classifyAndLog(Site site, double ping) {
        if (ping < 0) {
            _log.error("Network for standby {} is broken", site.getName());
            _alertLog.error(String.format("Network for standby %s is Broken:" +
                    "Latency was reported as %s ms", site.getName(), ping));
            return NetworkHealth.BROKEN;
        }

        if (ping > NETWORK_SLOW_THRESHOLD) {
            _log.warn("Network for standby {} is slow", site.getName());
            _alertLog.warn(String.format("Network for standby %s is Slow:" +
                    "Latency was reported as %f ms", site.getName(), ping));
            return NetworkHealth.SLOW;
        }

        return NetworkHealth.GOOD;
    }

    /**
     * On the active site, records a system event and sends a mail alert when a site's network
     * has just transitioned to BROKEN. Nothing is sent while the site is being added or resumed.
     *
     * @param site the remote site that was probed
     * @param previousState the site's network health before this pass
     * @param currentState the site's network health after this pass
     */
    private void notifyIfNewlyBroken(Site site, NetworkHealth previousState, NetworkHealth currentState) {
        if (!drUtil.isActiveSite()) {
            return;
        }

        SiteState state = site.getState();
        if (SiteState.STANDBY_ADDING == state || SiteState.STANDBY_RESUMING == state) {
            _log.info("Skip mail alert during add-standby or resume-standby for {}", site.getUuid());
            return;
        }

        boolean wasBroken = NetworkHealth.BROKEN.equals(previousState);
        boolean isBroken = NetworkHealth.BROKEN.equals(currentState);
        if (!wasBroken && isBroken) {
            // Add to systemevent log
            _alertLog.error(MessageFormat.format("Network connection to site {0} has been broken.", site.getName()));
            // send email alert
            mailHandler.sendSiteNetworkBrokenMail(site);
        }
    }
}