/*
* Copyright (c) 2015 EMC Corporation
* All Rights Reserved
*/
package com.emc.storageos.systemservices.impl.healthmonitor;
import java.net.URI;
import java.util.*;
import com.emc.storageos.db.client.DbClient;
import com.emc.storageos.db.client.constraint.AlternateIdConstraint;
import com.emc.storageos.db.client.constraint.NamedElementQueryResultList;
import com.emc.storageos.db.client.constraint.impl.AlternateIdConstraintImpl;
import com.emc.storageos.db.client.impl.DataObjectType;
import com.emc.storageos.db.client.impl.TypeMap;
import com.emc.storageos.db.client.model.UserPreferences;
import com.emc.storageos.security.mail.MailHelper;
import com.emc.storageos.services.util.AlertsLogger;
import com.emc.storageos.security.dbInfo.DbInfoUtils;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import com.emc.storageos.coordinator.client.model.Constants;
import com.emc.storageos.coordinator.client.model.DbOfflineEventInfo;
import com.emc.storageos.coordinator.client.model.Site;
import com.emc.storageos.coordinator.client.service.DrUtil;
import com.emc.storageos.coordinator.client.service.InterProcessLockHolder;
import com.emc.storageos.coordinator.common.Configuration;
import com.emc.storageos.services.util.TimeUtils;
import com.emc.storageos.systemservices.impl.jobs.common.JobConstants;
import com.emc.storageos.systemservices.impl.upgrade.CoordinatorClientExt;
/**
 * DbDowntimeTracker tracks the downtime of dbsvc and geodbsvc.
 * It monitors dbsvc and geodbsvc online/offline events and records the downtime in ZK.
 * Once a node's accumulated downtime reaches whole days, an alert is logged and a
 * mail is sent to the root user (at most once per additional offline day).
 */
public class DbDowntimeTracker {
    private static final Logger log = LoggerFactory.getLogger(DbDowntimeTracker.class);
    private final AlertsLogger _alertLog = AlertsLogger.getAlertsLogger();
    private final List<String> serviceNames = Arrays.asList(Constants.DBSVC_NAME, Constants.GEODBSVC_NAME);
    private static final String DB_TRACKER_LOCK = "dbDowntimeTracker";
    // Tracker check service status every 15 mins by default
    private static final long TRACKER_CHECK_INTERVAL = JobConstants.LAG_BETWEEN_RUNS_ALERTS * TimeUtils.SECONDS;
    // An update is skipped when the previous one was recorded less than this long ago
    private static final long NO_NEED_UPDATE_LIMIT = 5 * TimeUtils.MINUTES;

    // Alert texts; arguments are (serviceName, nodeId, offline days)
    private static final String POWER_ON_ALERT_FORMAT =
            "DataBase service(%s) of node(%s) has been unavailable for %s days, "
                    + "please power on the node in timely manner";
    private static final String NODE_RECOVERY_ALERT_FORMAT =
            "DataBase service(%s) of node(%s) has been unavailable for %s days, "
                    + "node recovery would be needed to recovery it back";

    @Autowired
    private CoordinatorClientExt coordinator;
    @Autowired
    private DbClient dbClient;
    // Created lazily by getMailHelper()
    private MailHelper mailHelper;

    public DbDowntimeTracker() {
    }

    /**
     * Monitor dbsvc and geodbsvc online/offline event and record downtime in ZK.
     * Does nothing on a standby site. The check of all sites runs while holding the
     * tracker lock; any failure is logged as a warning and not propagated.
     */
    public void run() {
        DrUtil drUtil = new DrUtil(coordinator.getCoordinatorClient());
        if (drUtil.isStandby()) {
            log.info("Current site is standby, no need to monitor dbsvc and geodbsvc status");
            return;
        }

        log.info("Monitoring dbsvc and geodbsvc status");
        try (AutoCloseable lock = getTrackerLock()) {
            for (Site site : drUtil.listSites()) {
                updateSiteDbsvcStatus(site);
            }
        } catch (Exception e) {
            log.warn("Failed to monitor db status", e);
        }
    }

    /**
     * Check every tracked service (dbsvc and geodbsvc) of the given site and update
     * its tracker info.
     *
     * @param site the site to check
     */
    private void updateSiteDbsvcStatus(Site site) {
        String siteId = site.getUuid();
        log.info("Start to check db/geodb status for site {}", siteId);
        for (String serviceName : serviceNames) {
            log.info("Check status for {} begin, site id: {}", serviceName, siteId);
            List<String> availableNodes = coordinator.getServiceAvailableNodes(siteId, serviceName);
            updateTrackerInfo(site, serviceName, availableNodes);
            log.info("Check status for {} finish, site id: {}", serviceName, siteId);
        }
    }

    /**
     * Create the lock holder for the tracker lock; closing the returned object is
     * left to the caller (see the try-with-resources in {@link #run()}).
     */
    private AutoCloseable getTrackerLock() throws Exception {
        return new InterProcessLockHolder(coordinator.getCoordinatorClient(), DB_TRACKER_LOCK, log);
    }

    /**
     * Update db offline event info in ZK.
     *
     * @param site the site whose tracker info is updated
     * @param serviceName dbsvc or geodbsvc
     * @param activeNodes ids of the nodes on which the service is currently available
     */
    private void updateTrackerInfo(Site site, String serviceName, List<String> activeNodes) {
        String siteId = site.getUuid();
        log.info("Querying db tracker info from zk");
        Configuration config = coordinator.getCoordinatorClient().queryConfiguration(siteId,
                Constants.DB_DOWNTIME_TRACKER_CONFIG, serviceName);
        DbOfflineEventInfo dbOfflineEventInfo = new DbOfflineEventInfo(config);
        log.debug("DbofflineEnventInfo is {}", dbOfflineEventInfo.getEventInfo());

        long currentTimeStamp = TimeUtils.getCurrentTime();
        Long lastUpdateTimestamp = dbOfflineEventInfo.getLastUpdateTimestamp();
        long interval = 0L;
        if (lastUpdateTimestamp != null) {
            // Cap the elapsed time at one check interval: this value is what gets added
            // to a node's offline time below, so a long gap between two runs is counted
            // as at most TRACKER_CHECK_INTERVAL of downtime.
            interval = Math.min(currentTimeStamp - lastUpdateTimestamp, TRACKER_CHECK_INTERVAL);
        }
        // interval == 0 also covers "no previous update recorded", which must not be skipped
        if (interval != 0L && interval < NO_NEED_UPDATE_LIMIT) {
            log.info("Have already updated within a few minutes, skipping this update");
            return;
        }

        dbOfflineEventInfo.setLastUpdateTimestamp(currentTimeStamp);
        log.info(String.format("Db tracker last check time: %d, current check time: %d, site: %s",
                lastUpdateTimestamp, currentTimeStamp, siteId));

        int nodeCount = site.getNodeCount();
        for (int i = 1; i <= nodeCount; i++) {
            String nodeId = "vipr" + i;
            if (activeNodes.contains(nodeId)) {
                dbOfflineEventInfo.setLastActiveTimestamp(nodeId, currentTimeStamp);
                log.info(String.format("Service(%s) of node(%s) last active timestamp has been updated to %s",
                        serviceName, nodeId, currentTimeStamp));
                if (dbOfflineEventInfo.getOfflineTimeInMS(nodeId) != null) {
                    // The node is back: clear its accumulated downtime and alert marker
                    dbOfflineEventInfo.setOfflineTimeInMS(nodeId, null);
                    dbOfflineEventInfo.setKeyOfflineAlertInDay(nodeId, null);
                    log.info("Service({}) of node({}) is recovered", serviceName, nodeId);
                }
            } else {
                Long lastOfflineInMS = dbOfflineEventInfo.getOfflineTimeInMS(nodeId);
                long newOfflineTime = ((lastOfflineInMS == null) ? 0L : lastOfflineInMS) + interval;
                dbOfflineEventInfo.setOfflineTimeInMS(nodeId, newOfflineTime);
                alertStatusCheck(nodeId, serviceName, dbOfflineEventInfo, newOfflineTime / TimeUtils.DAYS);
                log.info(String.format("Service(%s) of node(%s) has been unavailable for %s mins",
                        serviceName, nodeId, newOfflineTime / TimeUtils.MINUTES));
            }
        }

        config = dbOfflineEventInfo.toConfiguration(serviceName);
        coordinator.getCoordinatorClient().persistServiceConfiguration(siteId, config);
        log.info("Persist db tracker info to zk successfully");
    }

    /**
     * Raise an offline alert for a node when its offline time has reached a number of
     * whole days for which no alert has been recorded yet, then record that number.
     *
     * @param nodeId node id of the dbsvc offline
     * @param serviceName dbsvc or geodbsvc
     * @param dbOfflineEventInfo tracker info holding the last alerted day count of the node
     * @param offLineTimeInDay whole days the service has been offline
     */
    private void alertStatusCheck(String nodeId, String serviceName, DbOfflineEventInfo dbOfflineEventInfo,
            long offLineTimeInDay) {
        if (offLineTimeInDay < 1) {
            return;
        }
        Long alertDays = dbOfflineEventInfo.getOfflineAlertInDay(nodeId);
        if (alertDays != null && offLineTimeInDay <= alertDays) {
            // Already alerted for this number of offline days
            return;
        }
        // The first alert for a node (no alert recorded yet) always uses the plain
        // "power on" notice; the recovery notice is only used for follow-up alerts
        // once the outage exceeds the maximum service outage time.
        boolean nodeRecoveryRequired = alertDays != null
                && offLineTimeInDay > DbInfoUtils.MAX_SERVICE_OUTAGE_TIME / TimeUtils.DAYS;
        raiseOfflineAlert(nodeId, serviceName, offLineTimeInDay, nodeRecoveryRequired);
        dbOfflineEventInfo.setKeyOfflineAlertInDay(nodeId, offLineTimeInDay);
    }

    /**
     * Write the offline alert to the alerts log and send the matching alert mail.
     * A failure to send the mail is logged and otherwise ignored.
     */
    private void raiseOfflineAlert(String nodeId, String serviceName, long offLineTimeInDay,
            boolean nodeRecoveryRequired) {
        String format = nodeRecoveryRequired ? NODE_RECOVERY_ALERT_FORMAT : POWER_ON_ALERT_FORMAT;
        _alertLog.warn(String.format(format, serviceName, nodeId, offLineTimeInDay));
        try {
            sendDbsvcOfflineMail(nodeId, serviceName, offLineTimeInDay, nodeRecoveryRequired);
        } catch (Exception e) {
            log.error("Failed to sending mail for db offline alert", e);
        }
    }

    /**
     * Send alert mail that dbsvc offline more than 1 day.
     * The mail goes to the address stored for the root user; when no address is
     * available a warning is logged and no mail is sent.
     *
     * @param nodeId node id of the dbsvc offline
     * @param serviceName dbsvc or geodbsvc
     * @param offlineDays days of offline
     * @param nodeRecoveryRequired if need to node recovery, true will send the mail with recovery link
     */
    public void sendDbsvcOfflineMail(String nodeId, String serviceName, long offlineDays, boolean nodeRecoveryRequired) {
        String to = getMailAddressOfUser("root");
        if (to == null || to.isEmpty()) {
            log.warn("Can't send mail alert, no email address for root user");
            return;
        }

        Map<String, String> params = Maps.newHashMap();
        params.put("nodeId", nodeId);
        params.put("serviceName", serviceName);
        params.put("offlineDays", Long.toString(offlineDays));

        String templateName;
        if (nodeRecoveryRequired) {
            params.put("url", coordinator.getPropertyInfo().getProperty("network_vip"));
            templateName = "DbsvcOfflineFivedaysEmail.html";
        } else {
            templateName = "DbsvcOfflineEmail.html";
        }

        // Argument order matches the placeholders: service name first, then node id
        String title = String.format("ATTENTION - DataBase service(%s) of %s has been down for %s days",
                serviceName, nodeId, offlineDays);
        String content = MailHelper.parseTemplate(params, MailHelper.readTemplate(templateName));
        getMailHelper().sendMailMessage(to, title, content);
    }

    /**
     * Return the mail helper, creating it on first use.
     */
    private MailHelper getMailHelper() {
        if (mailHelper == null) {
            mailHelper = new MailHelper(coordinator.getCoordinatorClient());
        }
        return mailHelper;
    }

    /**
     * Get user's mail address from UserPreference CF.
     *
     * @param userName user id to look up
     * @return the email stored in the user's preferences, or null when the user has
     *         no preferences object
     * @throws IllegalStateException if more than one preferences object exists for the user
     */
    private String getMailAddressOfUser(String userName) {
        DataObjectType doType = TypeMap.getDoType(UserPreferences.class);
        AlternateIdConstraint constraint = new AlternateIdConstraintImpl(
                doType.getColumnField(UserPreferences.USER_ID), userName);
        NamedElementQueryResultList queryResults = new NamedElementQueryResultList();
        this.dbClient.queryByConstraint(constraint, queryResults);

        List<URI> userPrefsIds = new ArrayList<>();
        for (NamedElementQueryResultList.NamedElement namedElement : queryResults) {
            userPrefsIds.add(namedElement.getId());
        }
        if (userPrefsIds.isEmpty()) {
            return null;
        }

        final List<UserPreferences> userPrefs = new ArrayList<>();
        Iterator<UserPreferences> iter = this.dbClient.queryIterativeObjects(UserPreferences.class, userPrefsIds);
        while (iter.hasNext()) {
            userPrefs.add(iter.next());
        }

        if (userPrefs.size() > 1) {
            throw new IllegalStateException("There should only be 1 user preferences object for a user");
        }
        if (userPrefs.isEmpty()) {
            // if there isn't a user prefs object in the DB yet then we haven't saved one for this user yet.
            return null;
        }
        return userPrefs.get(0).getEmail();
    }
}