/*
* Copyright (c) 2016 EMC Corporation
* All Rights Reserved
*/
package com.emc.storageos.systemservices.impl.util;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.emc.storageos.coordinator.client.model.Site;
import com.emc.storageos.coordinator.client.model.SiteState;
import com.emc.storageos.coordinator.client.service.CoordinatorClient;
import com.emc.storageos.coordinator.client.service.DistributedDoubleBarrier;
import com.emc.storageos.coordinator.client.service.DrUtil;
import com.emc.storageos.coordinator.common.impl.ZkPath;
import com.emc.storageos.systemservices.impl.upgrade.CoordinatorClientExt;
import com.emc.storageos.systemservices.impl.upgrade.LocalRepository;
/**
* A DR monitor working in Standby site to monitor zookeeper status in DR environment. Zookeeper in Standby site
* works in 'observer' mode (see https://zookeeper.apache.org/doc/r3.3.3/zookeeperObservers.html). Normally it becomes
* read-only after losing connection to Active site. But we still want it to work so that other services
* like dbsvc, UI etc. can be up and running on Standby site. So we introduce this monitor to address the following
* situations -
*
* 1) If Standby site loses connection to Active site, the leader monitor (the node that is holding the VIP) reconfigures
* the zookeeper cluster to 'participant' mode (read-writable).
* 2) If Active site comes back, all nodes reconfigure themselves and reconnect to Active site as 'observer'.
*
*/
public class DrZkHealthMonitor extends DrHealthMonitor {
    private static final Logger log = LoggerFactory.getLogger(DrZkHealthMonitor.class);

    // Suffix appended to "<ZkPath.SITES>/<siteId>" to build the double-barrier path (see start()).
    private static final String DR_SWITCH_TO_ZK_OBSERVER_BARRIER = "/config/disasterRecoverySwitchToZkObserver";
    private static final int DR_SWITCH_BARRIER_TIMEOUT = 180; // barrier timeout in seconds

    private CoordinatorClientExt coordinatorExt;
    private DistributedDoubleBarrier switchToZkObserverBarrier;
    private String initZkMode; // ZK mode during syssvc startup
    private DrUtil drUtil;

    public DrZkHealthMonitor() {
    }

    /**
     * Creates the double barrier used when switching back to observer mode, then delegates
     * to the parent monitor's {@code start()}.
     *
     * The barrier is sized with the node count reported by the coordinator, and its path is
     * scoped to the local site id.
     */
    @Override
    public void start() {
        CoordinatorClient coordinator = getCoordinator().getCoordinatorClient();
        String barrierPath = String.format("%s/%s%s", ZkPath.SITES, coordinator.getSiteId(),
                DR_SWITCH_TO_ZK_OBSERVER_BARRIER);
        switchToZkObserverBarrier = coordinator.getDistributedDoubleBarrier(barrierPath, coordinatorExt.getNodeCount());
        super.start();
    }

    /**
     * One monitoring pass.
     *
     * 1) Reads the local ZK mode (remembering the first value seen in {@code initZkMode}).
     * 2) If this node holds the VIP, checks the ZK modes of all nodes in the site and reconfigures
     *    them to participant mode where needed.
     * 3) If the local ZK mode is unknown ({@code null}) or a participant mode, tries to switch back
     *    to observer mode when the active site is healthy.
     *
     * Any exception is logged and swallowed so that a failing pass does not propagate to the caller.
     */
    @Override
    public void tick() {
        try {
            String myNodeId = coordinatorExt.getMyNodeId();
            String localZkMode = drUtil.getCoordinatorMode(myNodeId);
            if (initZkMode == null) {
                initZkMode = localZkMode;
            }

            log.info("Local zookeeper mode: {} ", localZkMode);

            if (coordinatorExt.isVirtualIPHolder()) {
                log.info("Local node has vip, monitor other node zk states");
                checkAndReconfigSiteZKModes();
            }

            /*
             * If local ZK (in the standby site) is running on its own independently (leader, follower or standby mode)
             * or it could not startup at all (state == null),
             * we will try to switch local ZK to observer mode if the active site is running well.
             */
            if (localZkMode == null || drUtil.isParticipantNode(localZkMode)) {
                if (localZkMode != null && drUtil.isLeaderNode(localZkMode)) {
                    // local ZK reports leader mode, update the local site state accordingly
                    checkAndUpdateLocalSiteState();
                }

                // check if active site is back
                if (coordinatorExt.isActiveSiteHealthy()) {
                    log.info("Active site is back. Reconfig coordinatorsvc to observer mode");
                    reconnectZKToActiveSite();
                } else {
                    log.info("Active site is unavailable. Keep coordinatorsvc in current state {}", localZkMode);
                }
            }
        } catch (Exception e) {
            log.error("Exception while monitoring node state: ", e);
        }
    }

    /**
     * Update the standby site state when the active site is lost.
     * if SYNCED or INCR_SYNCING, change it to PAUSED and reschedule the DR site network monitor.
     * if SYNCING/RESUMING/ADDING, record the current state as the last state and change it to ERROR
     * since it will never finish without the active site.
     * Any other state is left untouched.
     */
    private void checkAndUpdateLocalSiteState() {
        Site localSite = drUtil.getLocalSite();
        SiteState state = localSite.getState();

        if (SiteState.STANDBY_SYNCED.equals(state) || SiteState.STANDBY_INCR_SYNCING.equals(state)) {
            log.info("Updating local site from {} to STANDBY_PAUSED since active is unreachable",
                    state);
            localSite.setState(SiteState.STANDBY_PAUSED);
            coordinatorExt.getCoordinatorClient().persistServiceConfiguration(localSite.toConfiguration());
            coordinatorExt.rescheduleDrSiteNetworkMonitor();
        } else if (SiteState.STANDBY_SYNCING.equals(state) ||
                SiteState.STANDBY_RESUMING.equals(state) ||
                SiteState.STANDBY_ADDING.equals(state)) {
            log.info("Updating local site from {} to STANDBY_ERROR since active is unreachable",
                    state);
            localSite.setLastState(state);
            localSite.setState(SiteState.STANDBY_ERROR);
            coordinatorExt.getCoordinatorClient().persistServiceConfiguration(localSite.toConfiguration());
        }
    }

    /**
     * Make sure that all local site nodes are in a correct zk mode.
     *
     * Nodes whose mode cannot be read (null) are treated as offline and skipped. Every other node is
     * counted as online and classified as read-only, observer, or (by elimination) participant.
     */
    private void checkAndReconfigSiteZKModes() {
        List<String> readOnlyNodes = new ArrayList<>();
        List<String> observerNodes = new ArrayList<>();
        int numOnline = 0;

        for (String node : coordinatorExt.getAllNodeIds()) {
            String nodeState = drUtil.getCoordinatorMode(node);
            if (nodeState == null) {
                log.debug("State for {}: null", node);
                continue;
            } else if (DrUtil.ZOOKEEPER_MODE_READONLY.equals(nodeState)) {
                // Found another node in read only
                readOnlyNodes.add(node);
            } else if (DrUtil.ZOOKEEPER_MODE_OBSERVER.equals(nodeState)) {
                // Found another node in observer
                observerNodes.add(node);
            }
            log.debug("State for {}: {}", node, nodeState);
            numOnline++;
        }

        int numParticipants = numOnline - readOnlyNodes.size() - observerNodes.size();
        int quorum = coordinatorExt.getNodeCount() / 2 + 1;

        log.debug("Observer nodes: {}", observerNodes.size());
        log.debug("Read Only nodes: {}", readOnlyNodes.size());
        log.debug("Participant nodes: {}", numParticipants);
        log.debug("nodes Online: {}", numOnline);

        // if there is a participant we need to reconfigure or it will be stuck there
        // if there are only participants no need to reconfigure
        // if there are only read only nodes and we have quorum we need to reconfigure
        if (0 < numParticipants && numParticipants < numOnline) {
            // Build a separate list for logging only: the two lists handed to reconfigZKToWritable
            // must stay disjoint, otherwise read-only nodes would be reconfigured and restarted twice.
            List<String> nodesToReconfig = new ArrayList<>(observerNodes);
            nodesToReconfig.addAll(readOnlyNodes);
            log.info("Nodes must have consistent zk mode. Reconfiguring all nodes to participant: {}",
                    nodesToReconfig);
            reconfigZKToWritable(observerNodes, readOnlyNodes);
        } else if (readOnlyNodes.size() == numOnline && numOnline >= quorum) {
            log.info("A quorum of nodes are read-only, Reconfiguring nodes to participant: {}", readOnlyNodes);
            reconfigZKToWritable(observerNodes, readOnlyNodes);
        }
    }

    /**
     * Reconnect to zookeeper in active site.
     *
     * Enters the switch-to-observer double barrier (waiting up to DR_SWITCH_BARRIER_TIMEOUT seconds).
     * If the barrier is entered, the local coordinator is reconfigured to observer mode, the barrier
     * is left (even when the reconfiguration throws), and the coordinator is restarted. Failures are
     * logged; nothing is thrown to the caller.
     */
    private void reconnectZKToActiveSite() {
        LocalRepository localRepository = LocalRepository.getInstance();
        try {
            boolean allEntered = switchToZkObserverBarrier.enter(DR_SWITCH_BARRIER_TIMEOUT, TimeUnit.SECONDS);
            if (allEntered) {
                try {
                    localRepository.reconfigCoordinator("observer");
                } finally {
                    // always leave, so a failed reconfig does not keep this node inside the barrier
                    leaveZKDoubleBarrier(switchToZkObserverBarrier, DR_SWITCH_TO_ZK_OBSERVER_BARRIER);
                }
                localRepository.restartCoordinator("observer");
            } else {
                log.warn("All nodes unable to enter barrier {}. Try again later", DR_SWITCH_TO_ZK_OBSERVER_BARRIER);
            }
        } catch (Exception ex) {
            // The exception is passed as the trailing argument so the stack trace is logged;
            // no '{}' placeholder is needed (it would be printed literally).
            log.warn("Unexpected errors during switching back to zk observer. Try again later.", ex);
        }
    }

    /**
     * Leave the given double barrier, waiting up to DR_SWITCH_BARRIER_TIMEOUT seconds.
     * A timeout or an exception is logged as a warning and otherwise ignored.
     *
     * @param barrier barrier to leave
     * @param path barrier path, used for logging only
     */
    private void leaveZKDoubleBarrier(DistributedDoubleBarrier barrier, String path) {
        try {
            log.info("Leaving the barrier {}", path);
            boolean leaved = barrier.leave(DR_SWITCH_BARRIER_TIMEOUT, TimeUnit.SECONDS);
            if (!leaved) {
                log.warn("Unable to leave barrier for {}", path);
            }
        } catch (Exception ex) {
            log.warn("Unexpected errors during leaving barrier", ex);
        }
    }

    /**
     * Reconfigure ZooKeeper to participant mode within the local site.
     *
     * Remote nodes from both lists are reconfigured and restarted first; the local node, if it
     * appears in either list, is reconfigured last. If any step throws, the remaining steps are
     * skipped and the error is logged; nothing is thrown to the caller.
     *
     * @param observerNodes to be reconfigured
     * @param readOnlyNodes to be reconfigured
     */
    public void reconfigZKToWritable(List<String> observerNodes, List<String> readOnlyNodes) {
        log.info("Standby is running in read-only mode due to connection loss with active site. " +
                "Reconfig coordinatorsvc of all nodes to writable");
        try {
            // Evaluate both calls unconditionally: each one has side effects on the remote nodes.
            boolean localInObservers = reconfigRemoteNodesToParticipant(observerNodes);
            boolean localInReadOnly = reconfigRemoteNodesToParticipant(readOnlyNodes);

            // reconfigure local node last
            if (localInObservers || localInReadOnly) {
                coordinatorExt.reconfigZKToWritable();
            }
        } catch (Exception ex) {
            log.warn("Unexpected errors during switching zk to participant. Try again later.", ex);
        }
    }

    /**
     * Reconfigure and restart the coordinator on every node in the list except the local one.
     *
     * @param nodes node ids to switch to participant mode
     * @return true if the local node id was found in the list (and therefore skipped)
     */
    private boolean reconfigRemoteNodesToParticipant(List<String> nodes) throws Exception {
        String myNodeId = coordinatorExt.getMyNodeId();
        LocalRepository localRepository = LocalRepository.getInstance();
        boolean containsLocalNode = false;

        for (String node : nodes) {
            // The local node cannot reboot itself before others
            if (node.equals(myNodeId)) {
                containsLocalNode = true;
                continue;
            }
            localRepository.remoteReconfigCoordinator(node, "participant");
            localRepository.remoteRestartCoordinator(node, "participant");
        }
        return containsLocalNode;
    }

    /**
     * @return the extended coordinator client used by this monitor
     */
    public CoordinatorClientExt getCoordinator() {
        return coordinatorExt;
    }

    /**
     * @param coordinatorClientExt the extended coordinator client to use
     */
    public void setCoordinator(CoordinatorClientExt coordinatorClientExt) {
        this.coordinatorExt = coordinatorClientExt;
    }

    /**
     * @return the DR utility used to query site and ZK mode information
     */
    public DrUtil getDrUtil() {
        return drUtil;
    }

    /**
     * @param drUtil the DR utility to use
     */
    public void setDrUtil(DrUtil drUtil) {
        this.drUtil = drUtil;
    }
}