/*
 * Copyright (c) 2015 EMC Corporation
 * All Rights Reserved
 */
package com.emc.storageos.db.server.impl;

import com.emc.storageos.coordinator.client.model.Site;
import com.emc.storageos.coordinator.client.model.SiteMonitorResult;
import com.emc.storageos.coordinator.client.model.SiteState;
import com.emc.storageos.coordinator.client.service.CoordinatorClient;
import com.emc.storageos.coordinator.client.service.DrUtil;
import com.emc.storageos.coordinator.client.service.impl.DualInetAddress;
import com.emc.storageos.management.jmx.recovery.DbManagerMBean;
import com.emc.storageos.management.jmx.recovery.DbManagerOps;
import com.emc.storageos.services.util.JmxServerWrapper;
import com.emc.vipr.model.sys.recovery.DbRepairStatus;
import com.emc.storageos.services.util.NamedScheduledThreadPoolExecutor;

import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.HintedHandOffManager;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.gms.ApplicationState;
import org.apache.cassandra.gms.EndpointState;
import org.apache.cassandra.gms.Gossiper;
import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
import org.apache.cassandra.gms.VersionedValue;
import org.apache.cassandra.locator.IEndpointSnitch;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.UUIDGen;
import org.apache.curator.framework.recipes.locks.InterProcessLock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jmx.export.annotation.ManagedResource;

import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

/**
 * MBean implementation for all db management operations
 */
@ManagedResource(objectName = DbManagerOps.MBEAN_NAME, description = "DB Manager MBean")
public class DbManager implements DbManagerMBean {
    private static final Logger log = LoggerFactory.getLogger(DbManager.class);

    private static final int REPAIR_INITIAL_WAIT_FOR_DBSTART_MINUTES = 5;

    // repair every 24*5 hours by default, given we do a proactive repair on start
    // once per five days on demand should suffice
    private static final int DEFAULT_DB_REPAIR_FREQ_MIN = 60 * 24 * 5;

    // a normal node removal should succeed in 30s.
    private static final int REMOVE_NODE_TIMEOUT_MILLIS = 1 * 60 * 1000; // 1 min

    private int repairFreqMin = DEFAULT_DB_REPAIR_FREQ_MIN;

    private CoordinatorClient coordinator;
    private SchemaUtil schemaUtil;
    private DrUtil drUtil;
    private ScheduledExecutorService dbNodeStateCallbackExecutor;

    @Autowired
    private JmxServerWrapper jmxServer;

    ScheduledFuture<?> scheduledRepairTrigger;

    // Max retry times after a db repair failure
    private int repairRetryTimes = 5;

    private ScheduledExecutorService executor = new NamedScheduledThreadPoolExecutor("DbRepairPool", 2);

    public void setCoordinator(CoordinatorClient coordinator) {
        this.coordinator = coordinator;
    }

    public void setSchemaUtil(SchemaUtil schemaUtil) {
        this.schemaUtil = schemaUtil;
    }

    public void setDrUtil(DrUtil drUtil) {
        this.drUtil = drUtil;
    }

    /**
     * Regular repair frequency in minutes
     *
     * @param repairFreqMin
     */
    public void setRepairFreqMin(int repairFreqMin) {
        this.repairFreqMin = repairFreqMin;
    }

    /**
     * Start a node repair
     *
     * @param keySpaceName
     * @param maxRetryTimes
     * @param crossVdc
     * @param noNewRepair
     * @return
     * @throws Exception
     */
    private boolean startNodeRepair(String keySpaceName, int maxRetryTimes, boolean crossVdc, boolean noNewRepair) throws Exception {
        DbRepairRunnable runnable = new DbRepairRunnable(jmxServer, this.executor, this.coordinator, keySpaceName,
                this.schemaUtil.isGeoDbsvc(), maxRetryTimes, noNewRepair);
        // Call preConfig() here to set IN_PROGRESS for a schedule-triggered db repair, since getDbRepairStatus relies on it.
        runnable.preConfig();
        synchronized (runnable) {
            this.executor.submit(runnable);
            runnable.wait();
        }

        switch (runnable.getStatus()) {
            case ALREADY_RUNNING:
                return true;
            case NOT_THE_TIME:
                return false;
            case NOTHING_TO_RESUME:
                return false;
        }

        return true;
    }

    private static void addAll(Map<String, Boolean> stateMap, List<String> ips, boolean up) {
        for (String ip : ips) {
            stateMap.put(normalizeInetAddress(ip), up);
        }
    }

    private static String normalizeInetAddress(String ipString) {
        try {
            final DualInetAddress dualAddr = DualInetAddress.fromAddress(ipString);
            return dualAddr.hasInet4() ? dualAddr.getInet4() : dualAddr.getInet6();
        } catch (Exception e) {
            log.error("Failed to normalize ipaddr: {}", ipString, e);
            return null;
        }
    }

    @Override
    public Map<String, Boolean> getNodeStates() {
        Map<String, Boolean> ipStateMap = new TreeMap<>();
        // Read the live node list twice and only accept the snapshot once the two reads agree,
        // so a node flapping between reads cannot produce a contradictory up/down map.
        while (true) {
            List<String> upNodes = StorageService.instance.getLiveNodes();
            List<String> downNodes = StorageService.instance.getUnreachableNodes();
            List<String> upNodes2 = StorageService.instance.getLiveNodes();

            if (new HashSet<>(upNodes).equals(new HashSet<>(upNodes2))) {
                addAll(ipStateMap, upNodes, true);
                addAll(ipStateMap, downNodes, false);
                break;
            }
        }

        Map<String, DualInetAddress> idIpMap = this.coordinator.getInetAddessLookupMap().getControllerNodeIPLookupMap();

        Map<String, Boolean> idStateMap = new TreeMap<>();
        for (Map.Entry<String, DualInetAddress> entry : idIpMap.entrySet()) {
            DualInetAddress dualAddr = entry.getValue();
            Boolean state = dualAddr.hasInet4() ? ipStateMap.get(dualAddr.getInet4()) : null;
            if (state == null) {
                state = dualAddr.hasInet6() ? ipStateMap.get(dualAddr.getInet6()) : null;
            }
            if (state != null) {
                idStateMap.put(entry.getKey(), state);
            }
        }

        return idStateMap;
    }

    @Override
    public void removeNode(String nodeId) {
        Map<String, DualInetAddress> idMap = this.coordinator.getInetAddessLookupMap().getControllerNodeIPLookupMap();
        DualInetAddress dualAddr = idMap.get(nodeId);
        if (dualAddr == null) {
            String errMsg = String.format("Cannot find node with name %s", nodeId);
            log.error(errMsg);
            throw new IllegalArgumentException(errMsg);
        }

        Map<String, String> hostIdMap = StorageService.instance.getHostIdMap();
        Map<String, String> ipGuidMap = new HashMap<String, String>();
        for (Map.Entry<String, String> entry : hostIdMap.entrySet()) {
            ipGuidMap.put(normalizeInetAddress(entry.getKey()), entry.getValue());
        }

        String nodeGuid = dualAddr.hasInet4() ? ipGuidMap.get(dualAddr.getInet4()) : null;
        if (nodeGuid == null) {
            nodeGuid = dualAddr.hasInet6() ? ipGuidMap.get(dualAddr.getInet6()) : null;
        }

        if (nodeGuid == null) {
            String errMsg = String.format("Cannot find Cassandra node with IP address %s", dualAddr.toString());
            log.error(errMsg);
            throw new IllegalArgumentException(errMsg);
        }

        log.info("Removing Cassandra node {} on vipr node {}", nodeGuid, nodeId);
        ensureRemoveNode(nodeGuid);
    }

    @Override
    public void startNodeRepair(boolean canResume, boolean crossVdc) throws Exception {
        // The return value is ignored: since the interval passed in is 0 the result cannot be NOT_THE_TIME, and both
        // ALREADY_RUNNING and a newly started repair count as success. An already running repair may not match the
        // current cluster state, but that is no different from a repair that starts now and then sees the cluster
        // state change immediately afterwards.
        startNodeRepair(this.schemaUtil.getKeyspaceName(), canResume ? this.repairRetryTimes : 0, crossVdc, false);
    }

    private static DbRepairStatus getLastRepairStatus(DbRepairJobState state, String clusterDigest, int maxRetryTime) {
        if (state.getCurrentDigest() != null && (clusterDigest == null || clusterDigest.equals(state.getCurrentDigest()))) {
            if (state.getCurrentRetry() <= maxRetryTime) {
                return new DbRepairStatus(DbRepairStatus.Status.IN_PROGRESS,
                        new Date(state.getCurrentStartTime()), null, state.getCurrentProgress());
            } else {
                return new DbRepairStatus(DbRepairStatus.Status.FAILED,
                        new Date(state.getCurrentStartTime()), new Date(state.getCurrentUpdateTime()), state.getCurrentProgress());
            }
        }

        return getLastSucceededRepairStatus(state, clusterDigest);
    }

    private static DbRepairStatus getLastSucceededRepairStatus(DbRepairJobState state, String clusterDigest) {
        if (state.getLastSuccessDigest() != null && (clusterDigest == null || clusterDigest.equals(state.getLastSuccessDigest()))) {
            return new DbRepairStatus(DbRepairStatus.Status.SUCCESS,
                    new Date(state.getLastSuccessStartTime()), new Date(state.getLastSuccessEndTime()), 100);
        }

        return null;
    }

    @Override
    public DbRepairStatus getLastRepairStatus(boolean forCurrentNodesOnly) {
        try {
            DbRepairJobState state = DbRepairRunnable.queryRepairState(this.coordinator, this.schemaUtil.getKeyspaceName(),
                    this.schemaUtil.isGeoDbsvc());
            log.info("cluster state digest stored in ZK: {}", state.getCurrentDigest());

            DbRepairStatus retState = getLastRepairStatus(state,
                    forCurrentNodesOnly ? DbRepairRunnable.getClusterStateDigest() : null, this.repairRetryTimes);

            if (retState != null && retState.getStatus() == DbRepairStatus.Status.IN_PROGRESS) {
                // See if the current state holder is still active; if not, we need to resume the repair
                String lockName = DbRepairRunnable.getLockName();
                InterProcessLock lock = coordinator.getLock(lockName);

                String currentHolder = DbRepairRunnable.getSelfLockNodeId(lock);
                if (currentHolder == null) { // No thread is actually driving the repair, we need to resume it
                    if (startNodeRepair(this.schemaUtil.getKeyspaceName(), this.repairRetryTimes, false, true)) {
                        log.info("Successfully resumed a previously paused repair");
                    } else {
                        log.warn("Cannot resume a previously paused repair, it could be another thread resumed and finished it");
                    }
                }
            }

            return retState;
        } catch (Exception e) {
            log.error("Failed to get node repair state from ZK", e);
            return null;
        }
    }

    @Override
    public DbRepairStatus getLastSucceededRepairStatus(boolean forCurrentNodesOnly) {
        try {
            DbRepairJobState state = DbRepairRunnable.queryRepairState(this.coordinator, this.schemaUtil.getKeyspaceName(),
                    this.schemaUtil.isGeoDbsvc());

            return getLastSucceededRepairStatus(state, forCurrentNodesOnly ? DbRepairRunnable.getClusterStateDigest() : null);
        } catch (Exception e) {
            log.error("Failed to get node repair state from ZK", e);
            return null;
        }
    }

    @Override
    public void resetRepairState() {
        DbRepairRunnable.resetRepairState(this.coordinator, this.schemaUtil.getKeyspaceName(), this.schemaUtil.isGeoDbsvc());
    }

    public void start() {
        this.scheduledRepairTrigger = this.executor.scheduleWithFixedDelay(new Runnable() {
            @Override
            public void run() {
                try {
                    startNodeRepair(schemaUtil.getKeyspaceName(), repairRetryTimes, true, false);
                } catch (Exception e) {
                    log.error("Failed to trigger node repair", e);
                }
            }
        }, REPAIR_INITIAL_WAIT_FOR_DBSTART_MINUTES, repairFreqMin, TimeUnit.MINUTES);
    }

    @Override
    public void removeDataCenter(String dcName) {
        log.info("Remove Cassandra data center {}", dcName);
        List<InetAddress> allNodes = new ArrayList<>();
        Set<InetAddress> liveNodes = Gossiper.instance.getLiveMembers();
        allNodes.addAll(liveNodes);
        Set<InetAddress> unreachableNodes = Gossiper.instance.getUnreachableMembers();
        allNodes.addAll(unreachableNodes);

        for (InetAddress nodeIp : allNodes) {
            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
            String dc = snitch.getDatacenter(nodeIp);
            log.info("node {} belongs to data center {} ", nodeIp, dc);
            if (dc.equals(dcName)) {
                removeCassandraNode(nodeIp);
            }
        }
    }

    private void removeCassandraNode(InetAddress nodeIp) {
        Map<String, String> hostIdMap = StorageService.instance.getHostIdMap();
        String guid = hostIdMap.get(nodeIp.getHostAddress());
        // Skip the node removal if this node doesn't exist in the host id map
        if (guid != null) {
            log.info("Removing Cassandra node {} on vipr node {}", guid, nodeIp);
            Gossiper.instance.convict(nodeIp, 0);
            ensureRemoveNode(guid);
        } else {
            log.info("Skip removal of Cassandra node {} due to no host id found", nodeIp);
        }
    }

    /**
     * A safer method to remove a Cassandra node. Calls forceRemoveCompletion after REMOVE_NODE_TIMEOUT_MILLIS.
     * This helps prevent node removal from hanging due to CASSANDRA-6542.
     *
     * @param guid
     */
    public void ensureRemoveNode(final String guid) {
        ExecutorService exe = Executors.newSingleThreadExecutor();
        Future<?> future = exe.submit(new Runnable() {
            public void run() {
                StorageService.instance.removeNode(guid);
            }
        });
        try {
            future.get(REMOVE_NODE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
        } catch (TimeoutException e) {
            log.warn("removenode timeout, calling forceRemoveCompletion()");
            StorageService.instance.forceRemoveCompletion();
        } catch (InterruptedException | ExecutionException e) {
            log.warn("Exception calling removenode", e);
        } finally {
            exe.shutdownNow();
        }
    }

    /**
     * Check if data is synced with the remote data center specified by dcName. It checks hinted handoff logs.
     *
     * @return true - synced. Otherwise false
     */
    @Override
    public boolean isDataCenterSynced(String dcName) {
        log.info("Check if data synced with Cassandra data center {}", dcName);

        // Compact the HINTS column family and eliminate deleted hints before checking hinted handoff logs
        try {
            StorageService.instance.forceKeyspaceCompaction("system", SystemKeyspace.HINTS_CF);
        } catch (Exception ex) {
            log.warn("Fail to compact system HINTS_CF", ex);
        }

        List<InetAddress> allNodes = new ArrayList<>();
        Set<InetAddress> liveNodes = Gossiper.instance.getLiveMembers();
        allNodes.addAll(liveNodes);
        Set<InetAddress> unreachableNodes = Gossiper.instance.getUnreachableMembers();
        allNodes.addAll(unreachableNodes);

        for (InetAddress nodeIp : allNodes) {
            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
            String dc = snitch.getDatacenter(nodeIp);
            if (dc.equals(dcName)) {
                log.info("Checking hinted handoff logs for node {} in data center {} ", nodeIp, dc);
                if (hasPendingHintedHandoff(nodeIp)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Check if there are pending hinted handoff logs for the given node
     *
     * @param endpoint
     * @return true - pending hinted handoff logs exist. Otherwise, false
     */
    private boolean hasPendingHintedHandoff(InetAddress endpoint) {
        List<String> endpointsWithPendingHints = HintedHandOffManager.instance.listEndpointsPendingHints();
        if (endpointsWithPendingHints.isEmpty()) {
            log.info("Skip data sync status check. No pending hinted handoff logs");
            return false;
        }

        log.info("Pending hinted hand off logs found at {}", endpointsWithPendingHints);
        // The pending-hints list is keyed by token, so convert this endpoint's host id to its token string before comparing
        UUID hostId = Gossiper.instance.getHostId(endpoint);
        final ByteBuffer hostIdBytes = ByteBuffer.wrap(UUIDGen.decompose(hostId));
        DecoratedKey epkey = StorageService.getPartitioner().decorateKey(hostIdBytes);
        Token.TokenFactory tokenFactory = StorageService.getPartitioner().getTokenFactory();
        String token = tokenFactory.toString(epkey.getToken());
        for (String unsyncedEndpoint : endpointsWithPendingHints) {
            if (token.equals(unsyncedEndpoint)) {
                log.info("Unsynced data found for : {}", endpoint);
                return true;
            }
        }
        return false;
    }

    public void init() {
        if (drUtil.isActiveSite()) {
            log.info("Register Cassandra node state listener on DR active site");
            dbNodeStateCallbackExecutor = Executors.newScheduledThreadPool(1);
            Gossiper.instance.register(endpointStateChangeSubscripter);
        }
    }

    private Site getSite(InetAddress endpoint) {
        IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
        String dcName = snitch.getDatacenter(endpoint);
        for (Site site : drUtil.listSites()) {
            String cassandraDcId = drUtil.getCassandraDcId(site);
            if (cassandraDcId.equals(dcName)) {
                return site;
            }
        }
        return null;
    }

    private void checkAndSetIncrementalSyncing(InetAddress endpoint) {
        Site site = getSite(endpoint);
        if (site == null) {
            log.info("Unknown site for {}. Skip HandoffLogDetector", endpoint);
            return;
        }

        log.info("Node {} in site {} comes online", endpoint, site.getUuid());
        if (site.getState() == SiteState.STANDBY_SYNCED) {
            SiteMonitorResult monitorResult = drUtil.getCoordinator().getTargetInfo(site.getUuid(), SiteMonitorResult.class);
            if (monitorResult == null || monitorResult.getDbQuorumLostSince() == 0) {
                log.info("No db quorum lost on standby site. Skip this node up event on {} ", endpoint);
                return;
            }
            if (hasPendingHintedHandoff(endpoint)) {
                log.info("Hinted handoff logs detected. Change site {} state to STANDBY_INCR_SYNCING", site.getUuid());
                site.setState(SiteState.STANDBY_INCR_SYNCING);
                drUtil.getCoordinator().persistServiceConfiguration(site.toConfiguration());
            }
        } else {
            log.info("Skip hinted handoff logs detector for {} due to site state is {}. ", endpoint, site.getState());
        }
    }

    // Cassandra node state listener
    private IEndpointStateChangeSubscriber endpointStateChangeSubscripter = new IEndpointStateChangeSubscriber() {
        @Override
        public void onJoin(InetAddress endpoint, EndpointState epState) {
        }

        @Override
        public void beforeChange(InetAddress endpoint, EndpointState currentState, ApplicationState newStateKey, VersionedValue newValue) {
        }

        @Override
        public void onChange(InetAddress endpoint, ApplicationState state, VersionedValue value) {
        }

        @Override
        public void onAlive(InetAddress endpoint, EndpointState state) {
            if (drUtil.isStandby()) {
                log.info("Skip node state change of {} on standby site", endpoint);
                return;
            }
            HandoffLogDetector detector = new HandoffLogDetector(endpoint);
            dbNodeStateCallbackExecutor.schedule(detector, 0, TimeUnit.SECONDS);
        }

        /**
         * Detect pending handoff logs and set STANDBY_INCR_SYNCING state if necessary
         */
        class HandoffLogDetector implements Runnable {
            private InetAddress endpoint;

            private HandoffLogDetector(InetAddress endpoint) {
                this.endpoint = endpoint;
            }

            public void run() {
                checkAndSetIncrementalSyncing(endpoint);
            }
        };

        @Override
        public void onDead(InetAddress endpoint, EndpointState state) {
        }

        @Override
        public void onRemove(InetAddress endpoint) {
        }

        @Override
        public void onRestart(InetAddress endpoint, EndpointState state) {
        }
    };
}
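/*
 * Illustrative sketch only: one way a remote JMX client could reach this MBean once the
 * JmxServerWrapper has exported it under DbManagerOps.MBEAN_NAME. The service URL, host and
 * port are hypothetical placeholders; in practice the DbManagerOps helper wraps this access,
 * so a hand-rolled proxy like this is only for illustration.
 *
 *   JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:7199/jmxrmi"); // hypothetical endpoint
 *   JMXConnector connector = JMXConnectorFactory.connect(url);
 *   try {
 *       MBeanServerConnection conn = connector.getMBeanServerConnection();
 *       DbManagerMBean dbManager = JMX.newMBeanProxy(conn, new ObjectName(DbManagerOps.MBEAN_NAME), DbManagerMBean.class);
 *       Map<String, Boolean> nodeStates = dbManager.getNodeStates(); // node id -> up (true) / down (false)
 *       dbManager.startNodeRepair(true, false);                      // resumable repair, local VDC only
 *   } finally {
 *       connector.close();
 *   }
 */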