/*
 * Copyright (c) 2015 EMC Corporation
 * All Rights Reserved
 */
package com.emc.storageos.systemservices.impl.recovery;

import java.net.URI;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import java.util.Date;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import com.emc.storageos.coordinator.client.model.*;
import com.emc.storageos.db.common.DbConfigConstants;
import com.emc.storageos.services.util.*;
import com.emc.vipr.model.sys.recovery.DbOfflineStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.apache.curator.framework.recipes.locks.InterProcessLock;
import org.apache.curator.framework.recipes.leader.LeaderSelector;

import com.emc.vipr.model.sys.ClusterInfo;
import com.emc.vipr.model.sys.recovery.RecoveryStatus;
import com.emc.vipr.model.sys.recovery.RecoveryConstants;
import com.emc.storageos.model.property.PropertyConstants;
import com.emc.storageos.coordinator.client.service.DrUtil;
import com.emc.storageos.coordinator.client.service.NodeListener;
import com.emc.storageos.coordinator.client.service.impl.LeaderSelectorListenerImpl;
import com.emc.storageos.coordinator.common.impl.ZkPath;
import com.emc.storageos.coordinator.common.Configuration;
import com.emc.storageos.coordinator.common.impl.ConfigurationImpl;
import com.emc.storageos.db.client.DbClient;
import com.emc.storageos.systemservices.impl.upgrade.CoordinatorClientExt;
import com.emc.storageos.systemservices.impl.upgrade.LocalRepository;
import com.emc.storageos.systemservices.impl.client.SysClientFactory;
import com.emc.storageos.systemservices.exceptions.SysClientException;
import com.emc.storageos.management.jmx.recovery.DbManagerOps;
import com.emc.storageos.svcs.errorhandling.resources.APIException;

/**
 * Recovery Manager drives the whole lifecycle of node recovery. It maintains a state machine in ZK.
 * See RecoveryStatus.Status for a detailed description of the status transitions.
 */
public class RecoveryManager implements Runnable {
    private static final Logger log = LoggerFactory.getLogger(RecoveryManager.class);

    private List<String> serviceNames = Arrays.asList(Constants.DBSVC_NAME, Constants.GEODBSVC_NAME);
    private final AtomicBoolean isLeader = new AtomicBoolean(false);
    private List<String> aliveNodes = new ArrayList<String>();
    private List<String> corruptedNodes = new ArrayList<String>();
    private int nodeCount;
    private NamedThreadPoolExecutor recoveryExecutor;
    private NamedThreadPoolExecutor multicastExecutor;
    private boolean waitOnRecoveryTriggering = false;
    private LeaderSelector leaderSelector;
    private static final long REDEPLOY_MULTICAST_TIMEOUT = 120 * 60 * 1000; // 2 hours

    @Autowired
    private CoordinatorClientExt coordinator;

    @Autowired
    private DbClient dbClient;

    @Autowired
    private LocalRepository localRepository;

    public RecoveryManager() {
    }

    /**
     * Initialize recovery manager
     */
    public void init() {
        startRecoveryLeaderSelector();
        addRecoveryStatusListener();
    }

    /**
     * Main loop of the recovery manager. Executes node recovery while this node holds the leadership.
     */
    @Override
    public void run() {
        while (isLeader.get()) {
            try {
                checkRecoveryStatus();
                checkClusterStatus();
                runNodeRecovery();
            } catch (Exception e) {
                log.warn("Internal error of recovery manager: {}", e.getMessage());
            }
        }
    }
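
    /*
     * For orientation, the ZK-backed state machine driven by the loop above is
     * roughly (names from RecoveryStatus.Status; transitions inferred from the
     * methods below):
     *
     *   INIT --> PREPARING --> REPAIRING --> SYNCING --> DONE
     *                 \             \            \
     *                  +-------------+------------+--> FAILED / CANCELLED
     *
     * Only the elected recovery leader advances the state; non-leaders simply
     * watch the status ZNode via RecoveryStatusListener.
     */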

    /**
     * Check the recovery status saved in ZK:
     * a. No recovery required (DONE/FAILED/NULL): nothing to do, simply wait
     * b. In progress (PREPARING/REPAIRING/SYNCING): fail a new request if one is already in progress
     * c. Triggering (INIT): the current node should take charge of node recovery
     */
    private void checkRecoveryStatus() throws Exception {
        while (true) {
            InterProcessLock lock = null;
            try {
                lock = getRecoveryLock();
                RecoveryStatus status = queryNodeRecoveryStatus();
                if (isRecovering(status)) {
                    log.warn("This is a stale recovery request due to recovery leader change");
                    return;
                } else if (isTriggering(status)) {
                    log.info("The recovery status is triggering so run recovery directly");
                    return;
                }
                setWaitingRecoveryTriggeringFlag(true);
            } catch (Exception e) {
                markRecoveryFailed(RecoveryStatus.ErrorCode.INTERNAL_ERROR);
                throw e;
            } finally {
                releaseLock(lock);
            }
            log.info("Wait to be triggered");
            waitOnRecoveryTriggering();
        }
    }

    private boolean getWaitingRecoveryTriggeringFlag() {
        return waitOnRecoveryTriggering;
    }

    private void setWaitingRecoveryTriggeringFlag(boolean waiting) {
        waitOnRecoveryTriggering = waiting;
        log.info("Setting waiting flag to {}", waiting);
    }

    /**
     * Check if the cluster is triggering recovery
     */
    private boolean isTriggering(RecoveryStatus status) {
        return status.getStatus() == RecoveryStatus.Status.INIT;
    }

    /**
     * Check if the cluster is recovering
     */
    private boolean isRecovering(RecoveryStatus status) {
        boolean recovering = (status.getStatus() == RecoveryStatus.Status.PREPARING
                || status.getStatus() == RecoveryStatus.Status.REPAIRING
                || status.getStatus() == RecoveryStatus.Status.SYNCING);
        return recovering;
    }

    /**
     * Check if the cluster is in the minority-nodes-corrupted scenario
     */
    private void checkClusterStatus() throws Exception {
        if (isVMwareVapp()) {
            initNodeListByCheckOfflineTime();
            purgeDataForVappRecovery(corruptedNodes);
        } else {
            initNodeListByCheckDbStatus();
        }

        if (corruptedNodes.contains(coordinator.getMyNodeId())) {
            String errMsg = "Closing RecoveryManager leadership as this node is corrupted";
            log.info(errMsg);
            closeRecoveryLeaderSelector();
            throw new Exception(errMsg);
        } else {
            log.info("Proceeding with RecoveryManager leadership as this node is healthy");
        }
        validateNodesStatus();
    }

    /**
     * Init the alive node list and corrupted node list by checking db and geodb status
     */
    private void initNodeListByCheckDbStatus() throws Exception {
        aliveNodes.clear();
        corruptedNodes.clear();
        for (String serviceName : serviceNames) {
            try (DbManagerOps dbManagerOps = new DbManagerOps(serviceName)) {
                Map<String, Boolean> statusMap = dbManagerOps.getNodeStates();
                for (Map.Entry<String, Boolean> statusEntry : statusMap.entrySet()) {
                    log.info("status map entry: {}-{}", statusEntry.getKey(), statusEntry.getValue());
                    String nodeId = statusEntry.getKey();
                    if (statusEntry.getValue().equals(Boolean.TRUE)) {
                        if (!aliveNodes.contains(nodeId)) {
                            aliveNodes.add(nodeId);
                        }
                    } else {
                        if (!corruptedNodes.contains(nodeId)) {
                            corruptedNodes.add(nodeId);
                        }
                        if (aliveNodes.contains(nodeId)) {
                            aliveNodes.remove(nodeId);
                        }
                    }
                }
            }
        }
        log.info("Alive nodes:{}, corrupted nodes: {}", aliveNodes, corruptedNodes);
    }

    /**
     * Validate that the cluster is in the minority-nodes-corrupted scenario
     */
    private void validateNodesStatus() {
        nodeCount = coordinator.getNodeCount();
        if (aliveNodes.size() == nodeCount) {
            markRecoveryCancelled();
            log.warn("All nodes are alive, no need to do recovery");
            throw new IllegalStateException("No need to do recovery");
        } else if (aliveNodes.size() < (nodeCount / 2 + 1)) {
            markRecoveryCancelled();
            log.warn("This procedure doesn't support the majority-nodes-corrupted scenario");
            throw new IllegalStateException("Majority nodes are corrupted");
        }
    }
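
    /*
     * Note on the quorum check above: with integer division, (nodeCount / 2 + 1)
     * is the strict majority, e.g. 2 of 3 or 3 of 5 nodes. Recovery only proceeds
     * when a majority of nodes is still alive, since db repair rebuilds the
     * corrupted minority from the healthy replicas.
     */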

    /**
     * Start cluster recovery in the minority-nodes-corrupted scenario:
     * a. PREPARING: start a multicast thread and let the user redeploy the corrupted nodes
     * b. REPAIRING: run db node repair among the alive nodes to ensure consistency
     * c. SYNCING: wake the redeployed nodes from hibernate status and sync data
     * d. DONE: dbsvc and geodbsvc are started on all nodes
     * e. FAILED: an error occurred during node recovery
     */
    private synchronized void runNodeRecovery() throws Exception {
        InterProcessLock lock = null;
        try {
            log.info("Node recovery begins");
            lock = getRecoveryLock();

            setRecoveryStatus(RecoveryStatus.Status.PREPARING);
            if (!isVMwareVapp()) {
                startMulticastService();
            }

            setRecoveryStatus(RecoveryStatus.Status.REPAIRING);
            runDbRepair();
            if (isVMwareVapp()) {
                restartServices();
            }

            setRecoveryStatus(RecoveryStatus.Status.SYNCING);
            waitDbsvcStarted();
            validateAutoBootFlag();

            markRecoverySuccessful();
            log.info("Node recovery finished successfully");
        } catch (Exception ex) {
            markRecoveryFailed(RecoveryStatus.ErrorCode.INTERNAL_ERROR);
            log.error("Node recovery failed:", ex);
            throw ex;
        } finally {
            releaseLock(lock);
        }
    }

    /**
     * Start multicast service for node redeployment
     * TODO - remove this after the Hyper-V installer is discarded in jedi.
     */
    private void startMulticastService() throws Exception {
        multicastExecutor = new NamedThreadPoolExecutor("Redeploy multicast thread", 1);
        Runnable multicast = new Runnable() {
            @Override
            public void run() {
                try {
                    log.info("Start to multicast cluster configuration for node redeploy.");
                    String version = coordinator.getTargetInfo(RepositoryInfo.class).getCurrentVersion().toString();
                    com.emc.storageos.services.util.Configuration config = PlatformUtils.getLocalConfiguration();
                    config.setScenario(PropertyConstants.REDEPLOY_MODE);
                    config.setAliveNodes(aliveNodes);
                    MulticastUtil.doBroadcast(version, config, REDEPLOY_MULTICAST_TIMEOUT);
                    log.info("Finished multicast of cluster configuration for node redeploy.");
                } catch (Exception e) {
                    log.warn("Multicast failed", e);
                }
            }
        };
        multicastExecutor.execute(multicast);
    }

    /**
     * Remove the corrupted nodes and then run db node repair among the alive nodes
     */
    private void runDbRepair() {
        try {
            for (String svcName : serviceNames) {
                try (DbManagerOps dbManagerOps = new DbManagerOps(svcName)) {
                    dbManagerOps.removeNodes(corruptedNodes);
                    dbManagerOps.startNodeRepairAndWaitFinish(true, false);
                }
            }
        } catch (Exception e) {
            log.error("Node repair failed", e);
            markRecoveryFailed(RecoveryStatus.ErrorCode.REPAIR_FAILED);
            throw APIException.internalServerErrors.nodeRepairFailed();
        }
    }

    /**
     * Wait until dbsvc and geodbsvc are started on all nodes
     */
    private void waitDbsvcStarted() throws Exception {
        log.info("Waiting for dbsvc and geodbsvc to start..");
        waitHibernateNodeStarted();
        validateClusterStatus();
        if (isVMwareVapp()) {
            removeOfflineInfo();
        }
    }
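
    /**
     * Clear the recorded db offline time of the recovered nodes in the downtime
     * tracker config, so the tracker no longer treats them as having exceeded
     * the outage threshold. (Called on the vApp recovery path only.)
     */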
    private void removeOfflineInfo() {
        Configuration config = coordinator.getCoordinatorClient().queryConfiguration(
                coordinator.getCoordinatorClient().getSiteId(),
                Constants.DB_DOWNTIME_TRACKER_CONFIG, Constants.DBSVC_NAME);
        DbOfflineEventInfo dbOfflineEventInfo = new DbOfflineEventInfo(config);

        for (int i = 1; i <= nodeCount; i++) {
            String nodeId = "vipr" + i;
            if (corruptedNodes.contains(nodeId)) {
                if (dbOfflineeventInfoGetter(dbOfflineEventInfo, nodeId)) {
                    dbOfflineEventInfo.setOfflineTimeInMS(nodeId, null);
                    log.info("Removed offline time info of {}", nodeId);
                }
            }
        }
        config = dbOfflineEventInfo.toConfiguration(Constants.DBSVC_NAME);
        coordinator.getCoordinatorClient().persistServiceConfiguration(coordinator.getCoordinatorClient().getSiteId(), config);
        log.info("Cleaned offline time and persisted db tracker info to zk successfully");
    }

    private static boolean dbOfflineeventInfoGetter(DbOfflineEventInfo info, String nodeId) {
        return info.getOfflineTimeInMS(nodeId) != null;
    }

    private void informHibernateNodeToReconfigure() {
        DrUtil drUtil = new DrUtil(coordinator.getCoordinatorClient());
        if (drUtil.isMultisite()) {
            InterProcessLock lock = null;
            try {
                lock = drUtil.getDROperationLock();
                long vdcConfigVersion = DrUtil.newVdcConfigVersion();
                log.info("Has multiple sites, informing the hibernating nodes to reconfigure..");
                drUtil.updateVdcTargetVersion(coordinator.getCoordinatorClient().getSiteId(), SiteInfo.DR_OP_NODE_RECOVERY,
                        vdcConfigVersion);
            } catch (Exception e) {
                log.error("Failed to inform the hibernating nodes to reconfigure", e);
            } finally {
                try {
                    if (lock != null) {
                        lock.release();
                    }
                } catch (Exception ignore) {
                    log.error("Release lock failed during node recovery", ignore);
                }
            }
        }
    }

    /**
     * Wait until dbsvc and geodbsvc on the redeployed nodes are started
     */
    private void waitHibernateNodeStarted() throws Exception {
        long expireTime = System.currentTimeMillis() + RecoveryConstants.RECOVERY_CHECK_TIMEOUT;
        while (true) {
            informHibernateNodeToReconfigure();
            List<String> hibernateNodes = getHibernateNodes();
            if (hibernateNodes.isEmpty()) {
                log.info("Db node rebuild finished");
                break;
            }
            Thread.sleep(RecoveryConstants.RECOVERY_CHECK_INTERVAL);
            if (System.currentTimeMillis() >= expireTime) {
                log.error("Hibernating nodes ({}) failed to start within the stipulated time ({})",
                        hibernateNodes, RecoveryConstants.RECOVERY_CHECK_TIMEOUT);
                markRecoveryFailed(RecoveryStatus.ErrorCode.SYNC_FAILED);
                throw APIException.internalServerErrors.nodeRebuildFailed();
            }
        }
    }

    /**
     * Double-check dbsvc status on all nodes, retrying before declaring failure
     */
    private void validateClusterStatus() throws Exception {
        List<String> unavailableNodes = getUnavailableNodes();
        for (int i = 0; i < RecoveryConstants.RECOVERY_RETRY_COUNT && !unavailableNodes.isEmpty(); i++) {
            Thread.sleep(RecoveryConstants.RECOVERY_CHECK_INTERVAL);
            unavailableNodes = getUnavailableNodes();
        }
        if (!unavailableNodes.isEmpty()) {
            log.error("Healthy nodes ({}) became unavailable during node recovery", unavailableNodes);
            markRecoveryFailed(RecoveryStatus.ErrorCode.NEW_NODE_FAILURE);
            throw APIException.internalServerErrors.newNodeFailureInNodeRecovery(unavailableNodes.toString());
        }
        log.info("Dbsvc on all nodes are available");
    }

    /**
     * Get the hibernating nodes by checking whether each node exists in the Cassandra node list
     */
    private List<String> getHibernateNodes() {
        List<String> hibernateNodes = new ArrayList<String>();
        for (int i = 1; i <= nodeCount; i++) {
            String nodeId = "vipr" + i;
            if (aliveNodes.contains(nodeId)) {
                log.debug("No need to check {} which is not a redeployed node", nodeId);
                continue;
            }
            if (isNodeHibernating(nodeId)) {
                hibernateNodes.add(nodeId);
                continue;
            }
            if (!isNodeAvailable(nodeId)) {
                hibernateNodes.add(nodeId);
            }
        }
        log.debug("Hibernating nodes: {}", hibernateNodes);
        return hibernateNodes;
    }

    private boolean isNodeHibernating(String nodeId) {
        for (String serviceName : serviceNames) {
            try (DbManagerOps dbManagerOps = new DbManagerOps(serviceName)) {
                Map<String, Boolean> statusMap = dbManagerOps.getNodeStates();
                if (!statusMap.keySet().contains(nodeId)) {
                    log.debug("Node({}) is still hibernating", nodeId);
                    return true;
                }
            } catch (Exception e) {
                log.warn("Failed to check hibernating state via {}", serviceName);
            }
        }
        log.debug("Node({}) is no longer hibernating", nodeId);
        return false;
    }
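
    // Node ids follow the platform's fixed "vipr<N>" naming convention
    // (vipr1..vipr<nodeCount>); the loops above and the availability checks
    // below rely on it, as does the node-id to service-id mapping in poweroff().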
    /**
     * Get unavailable nodes by checking the dbsvc and geodbsvc beacons
     */
    private List<String> getUnavailableNodes() {
        List<String> unavailableNodes = new ArrayList<String>();
        for (int i = 1; i <= nodeCount; i++) {
            String nodeId = "vipr" + i;
            if (!isNodeAvailable(nodeId)) {
                unavailableNodes.add(nodeId);
            }
        }
        log.debug("Unavailable nodes: {}", unavailableNodes);
        return unavailableNodes;
    }

    private boolean isNodeAvailable(String nodeId) {
        for (String serviceName : serviceNames) {
            List<String> availableNodes = coordinator.getServiceAvailableNodes(serviceName);
            if (!availableNodes.contains(nodeId)) {
                log.debug("Service({}) on node({}) is unavailable", serviceName, nodeId);
                return false;
            }
        }
        return true;
    }

    /**
     * Trigger node recovery by updating the recovery status to 'INIT'
     */
    public void triggerNodeRecovery() {
        InterProcessLock lock = null;
        try {
            lock = getRecoveryLock();
            validateNodeRecoveryStatus();
            validateClusterState();

            RecoveryStatus status = new RecoveryStatus();
            status.setStatus(RecoveryStatus.Status.INIT);
            status.setStartTime(new Date());
            persistNodeRecoveryStatus(status);
        } finally {
            releaseLock(lock);
        }
    }

    /**
     * Check if the platform is supported
     */
    private void validatePlatform() {
        if (isVMwareVapp()) {
            log.warn("Platform(vApp) is unsupported for node recovery");
            throw new UnsupportedOperationException("Platform(vApp) is unsupported for node recovery");
        }
    }

    private boolean isVMwareVapp() {
        return PlatformUtils.isVMwareVapp();
    }

    /**
     * Check if node recovery has already been triggered
     */
    private void validateNodeRecoveryStatus() {
        RecoveryStatus status = queryNodeRecoveryStatus();
        if (isTriggering(status) || isRecovering(status)) {
            log.warn("Node recovery has already been triggered");
            throw new IllegalStateException("Node recovery has already been triggered");
        }
    }

    /**
     * Check if the cluster needs node recovery
     */
    private void validateClusterState() {
        ClusterInfo.ClusterState state = null;
        if (!isVMwareVapp()) {
            state = coordinator.getCoordinatorClient().getControlNodesState();
            log.info("Current control nodes' state: {}", state);
            if (state == ClusterInfo.ClusterState.STABLE) {
                log.warn("Cluster is stable and there is no need to do node recovery");
                throw new IllegalStateException("Cluster is stable and there is no need to do node recovery");
            }
        } else {
            initNodeListByCheckOfflineTime();
            if (aliveNodes.size() == coordinator.getNodeCount()) {
                log.warn("All nodes in the vApp are available and there is no need to do node recovery");
                throw new IllegalStateException("All nodes in the vApp are available and there is no need to do node recovery");
            }
        }

        // Disable node recovery when a standby site is in an unexpected state, as db repair would fail in those scenarios.
        DrUtil drUtil = new DrUtil(coordinator.getCoordinatorClient());
        if (drUtil.isMultisite()) {
            List<Site> allStandbySites = drUtil.listStandbySites();
            for (Site site : allStandbySites) {
                if (!site.getState().equals(SiteState.STANDBY_SYNCED)
                        && !site.getState().equals(SiteState.STANDBY_PAUSED)
                        && !site.getState().equals(SiteState.STANDBY_DEGRADED)) {
                    log.error("Node recovery is not allowed as standby site({}) status is unexpected({})",
                            site.getName(), site.getState());
                    throw new IllegalStateException("Node recovery is not allowed as standby site status is unexpected");
                }
            }
        }
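
        // Only SYNCED, PAUSED and DEGRADED standbys are accepted above; any other
        // state (e.g. a standby still being added or resumed) presumably indicates
        // an in-flight DR operation that node recovery's db repair could conflict with.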
        // Disable node recovery when a connected vdc's cluster state is DEGRADED, as geo db repair would fail then.
        if (drUtil.isMultivdc()) {
            List<String> allOtherVdcs = drUtil.getOtherVdcIds();
            for (String vdc : allOtherVdcs) {
                state = coordinator.getCoordinatorClient().getControlNodesState(vdc);
                if (state == ClusterInfo.ClusterState.DEGRADED) {
                    log.error("Node recovery is not allowed as a connected vdc({}) status is degraded", vdc);
                    throw new IllegalStateException("Node recovery is not allowed as a connected vdc status is degraded");
                }
            }
        }
    }

    /**
     * Update node recovery status in ZK
     */
    private void setRecoveryStatus(RecoveryStatus.Status status) {
        if (!isLeader.get()) {
            log.warn("This node is not the recovery leader");
            throw new IllegalStateException("This node is not the recovery leader");
        }
        RecoveryStatus recoveryStatus = queryNodeRecoveryStatus();
        recoveryStatus.setStatus(status);
        persistNodeRecoveryStatus(recoveryStatus);
    }

    /**
     * Update node recovery status in ZK and mark the end time
     */
    private void setRecoveryStatusWithEndTimeMarked(RecoveryStatus.Status status) {
        if (!isLeader.get()) {
            log.warn("This node is not the recovery leader");
            throw new IllegalStateException("This node is not the recovery leader");
        }
        RecoveryStatus recoveryStatus = queryNodeRecoveryStatus();
        recoveryStatus.setStatus(status);
        recoveryStatus.setEndTime(new Date());
        persistNodeRecoveryStatus(recoveryStatus);
    }

    /**
     * Set node recovery status to 'CANCELLED'
     */
    private void markRecoveryCancelled() {
        InterProcessLock lock = null;
        try {
            lock = getRecoveryLock();
            setRecoveryStatusWithEndTimeMarked(RecoveryStatus.Status.CANCELLED);
        } finally {
            releaseLock(lock);
        }
    }

    /**
     * Mark recovery status as successful and set the end time
     */
    private void markRecoverySuccessful() {
        setRecoveryStatusWithEndTimeMarked(RecoveryStatus.Status.DONE);
    }

    /**
     * Mark recovery status as failed and set the error code and end time
     */
    private void markRecoveryFailed(RecoveryStatus.ErrorCode errorCode) {
        if (!isLeader.get()) {
            log.warn("This node is not the recovery leader");
            throw new IllegalStateException("This node is not the recovery leader");
        }
        RecoveryStatus recoveryStatus = queryNodeRecoveryStatus();
        if (recoveryStatus.getErrorCode() != null) {
            log.debug("Recovery has already been marked as failed.");
            return;
        }
        recoveryStatus.setErrorCode(errorCode);
        recoveryStatus.setEndTime(new Date());
        recoveryStatus.setStatus(RecoveryStatus.Status.FAILED);
        persistNodeRecoveryStatus(recoveryStatus);
        poweroff(getHibernateNodes());
    }

    /**
     * Persist recovery status to ZK
     */
    private void persistNodeRecoveryStatus(RecoveryStatus status) {
        log.info("Set node recovery status: {}", status);
        if (status == null) {
            return;
        }
        ConfigurationImpl cfg = new ConfigurationImpl();
        cfg.setKind(Constants.NODE_RECOVERY_STATUS);
        cfg.setId(Constants.GLOBAL_ID);
        cfg.setConfig(RecoveryConstants.RECOVERY_STATUS, status.getStatus().toString());
        if (status.getStartTime() != null) {
            cfg.setConfig(RecoveryConstants.RECOVERY_STARTTIME, String.valueOf(status.getStartTime().getTime()));
        }
        if (status.getEndTime() != null) {
            cfg.setConfig(RecoveryConstants.RECOVERY_ENDTIME, String.valueOf(status.getEndTime().getTime()));
        }
        if (status.getErrorCode() != null) {
            cfg.setConfig(RecoveryConstants.RECOVERY_ERRCODE, status.getErrorCode().toString());
        }
        coordinator.getCoordinatorClient().persistServiceConfiguration(cfg);
        log.debug("Persisted node recovery status({}) to zk successfully", status);
    }
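
    /*
     * The persisted record above is a flat key/value Configuration identified by
     * kind=Constants.NODE_RECOVERY_STATUS and id=Constants.GLOBAL_ID, with one
     * entry per RecoveryConstants.RECOVERY_* key; start/end times are stored as
     * epoch milliseconds, which is what queryNodeRecoveryStatus() parses back.
     */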
    /**
     * Query recovery status from ZK
     */
    public RecoveryStatus queryNodeRecoveryStatus() {
        RecoveryStatus status = new RecoveryStatus();
        Configuration cfg = coordinator.getCoordinatorClient().queryConfiguration(Constants.NODE_RECOVERY_STATUS,
                Constants.GLOBAL_ID);
        if (cfg != null) {
            String statusStr = cfg.getConfig(RecoveryConstants.RECOVERY_STATUS);
            status.setStatus(RecoveryStatus.Status.valueOf(statusStr));
            String startTimeStr = cfg.getConfig(RecoveryConstants.RECOVERY_STARTTIME);
            if (startTimeStr != null && startTimeStr.length() > 0) {
                status.setStartTime(new Date(Long.parseLong(startTimeStr)));
            }
            String endTimeStr = cfg.getConfig(RecoveryConstants.RECOVERY_ENDTIME);
            if (endTimeStr != null && endTimeStr.length() > 0) {
                status.setEndTime(new Date(Long.parseLong(endTimeStr)));
            }
            String errorCodeStr = cfg.getConfig(RecoveryConstants.RECOVERY_ERRCODE);
            if (errorCodeStr != null && errorCodeStr.length() > 0) {
                status.setErrorCode(RecoveryStatus.ErrorCode.valueOf(errorCodeStr));
            }
        }
        log.info("Recovery status is: {}", status);
        return status;
    }

    /**
     * Ensure that not every node has autoboot=true; if all do, set it to false on one node
     */
    private void validateAutoBootFlag() {
        String siteId = coordinator.getCoordinatorClient().getSiteId();
        List<Configuration> configs = coordinator.getCoordinatorClient().queryAllConfiguration(siteId, Constants.DB_CONFIG);
        if (!isAllAutoBootTrue(configs)) {
            log.info("Auto boot flag check passed");
            return;
        }
        log.info("Auto boot flag is true on all nodes; changing it to false on one node");
        for (int i = 0; i < configs.size(); i++) {
            Configuration config = configs.get(i);
            if (config.getId() == null || config.getId().equals(Constants.GLOBAL_ID)) {
                continue;
            }
            config.setConfig(DbConfigConstants.AUTOBOOT, "false");
            coordinator.getCoordinatorClient().persistServiceConfiguration(siteId, config);
            log.info("Persisted autoboot=false on {} to zk successfully", config.getId());
            break;
        }
    }

    private boolean isAllAutoBootTrue(List<Configuration> configs) {
        for (int i = 0; i < configs.size(); i++) {
            Configuration config = configs.get(i);
            // Skip the "global" item and "version" folders; only check db configurations.
            if (config.getId() == null || config.getId().equals(Constants.GLOBAL_ID)) {
                continue;
            }
            if (!Boolean.parseBoolean(config.getConfig(DbConfigConstants.AUTOBOOT))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Get the recovery lock that protects updates of the recovery status
     */
    private InterProcessLock getRecoveryLock() {
        InterProcessLock lock = null;
        log.info("Try to acquire recovery lock");
        try {
            lock = coordinator.getCoordinatorClient().getLock(RecoveryConstants.RECOVERY_LOCK);
            boolean acquired = lock.acquire(RecoveryConstants.RECOVERY_LOCK_TIMEOUT, TimeUnit.MILLISECONDS);
            if (!acquired) {
                throw new IllegalStateException("Unable to get recovery lock");
            }
        } catch (Exception e) {
            log.error("Getting recovery lock failed", e);
            throw APIException.internalServerErrors.getLockFailed();
        }
        log.info("Got recovery lock");
        return lock;
    }

    /**
     * Release recovery lock
     */
    private void releaseLock(InterProcessLock lock) {
        if (lock == null) {
            log.info("The recovery lock is null, no need to release");
            return;
        }
        try {
            lock.release();
            log.info("Released recovery lock successfully");
        } catch (Exception ignore) {
            log.warn("Releasing recovery lock failed", ignore);
        }
    }
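
    /*
     * Lock discipline: every path that persists a recovery status transition
     * (triggerNodeRecovery, runNodeRecovery, markRecoveryCancelled and the
     * initial status check) acquires this curator InterProcessLock first, so
     * transitions are serialized cluster-wide; markRecoveryFailed relies on
     * its callers already holding the lock.
     */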
    /**
     * Power off the specified nodes
     *
     * @param nodeIds a list of node ids (e.g. vipr1)
     */
    public void poweroff(List<String> nodeIds) {
        for (String nodeId : nodeIds) {
            try {
                log.info("Trying to power off {}", nodeId);
                String svcId = nodeId.replace("vipr", "syssvc-");
                URI nodeEndpoint = coordinator.getNodeEndpointForSvcId(svcId);
                if (nodeEndpoint == null) {
                    continue;
                }
                SysClientFactory.getSysClient(nodeEndpoint)
                        .post(SysClientFactory.URI_POWEROFF_NODE, null, null);
                log.info("Powered off {} successfully", nodeId);
            } catch (SysClientException e) {
                log.error("Powering off node({}) failed: {}", nodeId, e.getMessage());
            }
        }
    }

    /**
     * Power off the local node
     */
    public void poweroff() {
        localRepository.poweroff();
    }

    /**
     * Register a recovery status listener to monitor status changes
     */
    private void addRecoveryStatusListener() {
        try {
            coordinator.getCoordinatorClient().addNodeListener(new RecoveryStatusListener());
        } catch (Exception e) {
            log.error("Failed to add recovery status listener", e);
            throw APIException.internalServerErrors.addListenerFailed();
        }
    }

    /**
     * Listener for changes of the recovery status ZNode.
     */
    private class RecoveryStatusListener implements NodeListener {
        @Override
        public String getPath() {
            String path = String.format("%1$s/%2$s/%3$s", ZkPath.CONFIG, Constants.NODE_RECOVERY_STATUS,
                    Constants.GLOBAL_ID);
            return path;
        }

        /**
         * Called when the recovery status has changed
         */
        @Override
        public void nodeChanged() {
            wakeupRecoveryThread();
        }

        /**
         * Called when the connection state has changed
         */
        @Override
        public void connectionStateChanged(State state) {
        }
    }

    private synchronized void waitOnRecoveryTriggering() throws InterruptedException {
        if (getWaitingRecoveryTriggeringFlag()) {
            this.wait();
        }
    }

    private synchronized void wakeupRecoveryThread() {
        if (getWaitingRecoveryTriggeringFlag()) {
            log.info("Waking up the waiting recovery thread");
            this.notifyAll();
            setWaitingRecoveryTriggeringFlag(false);
        }
    }

    /**
     * Use a leader selector to make sure only one node (the leader) starts the recovery manager
     */
    private void startRecoveryLeaderSelector() {
        while (!coordinator.getCoordinatorClient().isConnected()) {
            log.info("Waiting for connection to zookeeper");
            try {
                Thread.sleep(RecoveryConstants.RECOVERY_CONNECT_INTERVAL);
            } catch (InterruptedException e) {
                log.warn("Exception while sleeping, ignore", e);
            }
        }
        leaderSelector = coordinator.getCoordinatorClient().getLeaderSelector(
                RecoveryConstants.RECOVERY_LEADER_PATH,
                new RecoveryLeaderSelectorListener());
        leaderSelector.autoRequeue();
        leaderSelector.start();
    }

    /*
     * Close the leader selector on this node
     */
    private void closeRecoveryLeaderSelector() {
        log.info("Closing the leader selector on this node as its dbsvc needs to be recovered");
        leaderSelector.close();
        // Make sure we only continue after stopLeadership has executed
        try {
            Thread.sleep(RecoveryConstants.RECOVERY_CONNECT_INTERVAL);
        } catch (InterruptedException e) {
            log.warn("Exception while sleeping, ignore", e);
        }
    }
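    /*
     * autoRequeue() keeps this node in the election: whenever its leadership
     * session ends, the selector re-queues it as a candidate. A corrupted node
     * opts out permanently via close() in closeRecoveryLeaderSelector(), leaving
     * leadership to the healthy nodes.
     */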

    /**
     * Listener for leader node changes.
     */
    private class RecoveryLeaderSelectorListener extends LeaderSelectorListenerImpl {
        @Override
        protected void startLeadership() throws Exception {
            log.info("Selected as leader, starting recovery manager");
            isLeader.set(true);
            start();
        }

        @Override
        protected void stopLeadership() {
            log.info("Giving up leadership, stopping recovery manager");
            isLeader.set(false);
            stop();
        }
    }

    private void start() {
        recoveryExecutor = new NamedThreadPoolExecutor("Recovery manager", 1);
        recoveryExecutor.execute(this);
    }

    private void stop() {
        recoveryExecutor.shutdownNow();
        try {
            while (!recoveryExecutor.awaitTermination(RecoveryConstants.THREAD_CHECK_INTERVAL, TimeUnit.SECONDS)) {
                log.warn("Waiting for the recovery thread pool to shut down for another {} seconds",
                        RecoveryConstants.THREAD_CHECK_INTERVAL);
            }
        } catch (InterruptedException e) {
            log.error("Interrupted while waiting for the recovery thread pool to shut down", e);
        }
    }

    /**
     * Init the alive node list and corrupted node list by checking each node's db offline time (vApp only)
     */
    private void initNodeListByCheckOfflineTime() {
        aliveNodes.clear();
        corruptedNodes.clear();
        ArrayList<String> nodeList = coordinator.getAllNodeIds();
        for (String nodeId : nodeList) {
            try {
                DbOfflineStatus dbOfflineStatus = SysClientFactory.getSysClient(
                        coordinator.getNodeEndpoint(nodeId)).get(SysClientFactory.URI_GET_DB_OFFLINE_STATUS,
                        DbOfflineStatus.class, null);
                if (dbOfflineStatus.getOutageTimeExceeded()) {
                    corruptedNodes.add(nodeId);
                } else {
                    aliveNodes.add(nodeId);
                }
            } catch (SysClientException e) {
                log.warn("Internal error on querying db offline status: {}", e.getMessage());
                throw e;
            }
        }
        log.info("Alive nodes:{}, corrupted nodes: {}", aliveNodes, corruptedNodes);
    }

    /**
     * Purge stale db data on the corrupted nodes before rebuild (vApp only)
     */
    private void purgeDataForVappRecovery(List<String> nodeList) {
        for (String nodeId : nodeList) {
            try {
                SysClientFactory.getSysClient(coordinator.getNodeEndpoint(nodeId))
                        .post(URI.create(SysClientFactory.URI_NODE_DBRESET.getPath()), null, null);
            } catch (SysClientException e) {
                log.warn("Internal error on purging data: {}", e.getMessage());
                throw e;
            }
        }
    }

    /**
     * Restart dbsvc/geodbsvc/syssvc after repair (vApp only)
     */
    private void restartServices() {
        ArrayList<String> restartedServiceNames = new ArrayList<>(serviceNames);
        restartedServiceNames.add(Constants.SYSSVC_NAME);
        for (String nodeId : corruptedNodes) {
            for (String serviceName : restartedServiceNames) {
                SysClientFactory.getSysClient(coordinator.getNodeEndpoint(nodeId))
                        .post(URI.create(SysClientFactory.URI_RESTART_SERVICE.getPath() + "?name=" + serviceName), null, null);
            }
        }
    }
}