/*
 * Copyright (c) 2015 EMC Corporation
 * All Rights Reserved
 */
package com.emc.storageos.db.server.impl;

import com.emc.storageos.coordinator.client.model.Site;
import com.emc.storageos.coordinator.client.model.SiteMonitorResult;
import com.emc.storageos.coordinator.client.model.SiteState;
import com.emc.storageos.coordinator.client.service.CoordinatorClient;
import com.emc.storageos.coordinator.client.service.DrUtil;
import com.emc.storageos.coordinator.client.service.impl.DualInetAddress;
import com.emc.storageos.management.jmx.recovery.DbManagerMBean;
import com.emc.storageos.management.jmx.recovery.DbManagerOps;
import com.emc.storageos.services.util.JmxServerWrapper;
import com.emc.vipr.model.sys.recovery.DbRepairStatus;
import com.emc.storageos.services.util.NamedScheduledThreadPoolExecutor;

import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.HintedHandOffManager;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.gms.ApplicationState;
import org.apache.cassandra.gms.EndpointState;
import org.apache.cassandra.gms.Gossiper;
import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
import org.apache.cassandra.gms.VersionedValue;
import org.apache.cassandra.locator.IEndpointSnitch;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.UUIDGen;
import org.apache.curator.framework.recipes.locks.InterProcessLock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jmx.export.annotation.ManagedResource;

import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

/**
 * MBean implementation for all db management operations
 */
@ManagedResource(objectName = DbManagerOps.MBEAN_NAME, description = "DB Manager MBean")
public class DbManager implements DbManagerMBean {
    private static final Logger log = LoggerFactory.getLogger(DbManager.class);

    private static final int REPAIR_INITIAL_WAIT_FOR_DBSTART_MINUTES = 5;

    // repair every 24*5 hours by default, given we do a proactive repair on start
    // once per five days on demand should suffice
    private static final int DEFAULT_DB_REPAIR_FREQ_MIN = 60 * 24 * 5;

    // a normal node removal should succeed in 30s.
    private static final int REMOVE_NODE_TIMEOUT_MILLIS = 1 * 60 * 1000; // 1 min

    private int repairFreqMin = DEFAULT_DB_REPAIR_FREQ_MIN;

    private CoordinatorClient coordinator;
    private SchemaUtil schemaUtil;
    private DrUtil drUtil;
    private ScheduledExecutorService dbNodeStateCallbackExecutor;

    @Autowired
    private JmxServerWrapper jmxServer;

    ScheduledFuture<?> scheduledRepairTrigger;

    // Max retry times after a db repair failure
    private int repairRetryTimes = 5;

    private ScheduledExecutorService executor = new NamedScheduledThreadPoolExecutor("DbRepairPool", 2);

    public void setCoordinator(CoordinatorClient coordinator) {
        this.coordinator = coordinator;
    }

    public void setSchemaUtil(SchemaUtil schemaUtil) {
        this.schemaUtil = schemaUtil;
    }

    public void setDrUtil(DrUtil drUtil) {
        this.drUtil = drUtil;
    }

    /**
     * Regular repair frequency in minutes
     *
     * @param repairFreqMin
     */
    public void setRepairFreqMin(int repairFreqMin) {
        this.repairFreqMin = repairFreqMin;
    }

    /**
     * Start a node repair
     *
     * @param keySpaceName
     * @param maxRetryTimes
     * @param crossVdc
     * @param noNewRepair
     * @return
     * @throws Exception
     */
    private boolean startNodeRepair(String keySpaceName, int maxRetryTimes, boolean crossVdc, boolean noNewRepair) throws Exception {
        DbRepairRunnable runnable = new DbRepairRunnable(jmxServer, this.executor, this.coordinator, keySpaceName,
                this.schemaUtil.isGeoDbsvc(), maxRetryTimes, noNewRepair);
        // Call preConfig() here to set IN_PROGRESS for a schedule-triggered db repair, since getDbRepairStatus relies on it.
        runnable.preConfig();
        synchronized (runnable) {
            this.executor.submit(runnable);
            runnable.wait();
        }

        switch (runnable.getStatus()) {
            case ALREADY_RUNNING:
                return true;
            case NOT_THE_TIME:
                return false;
            case NOTHING_TO_RESUME:
                return false;
        }

        return true;
    }

    private static void addAll(Map<String, Boolean> stateMap, List<String> ips, boolean up) {
        for (String ip : ips) {
            stateMap.put(normalizeInetAddress(ip), up);
        }
    }

    private static String normalizeInetAddress(String ipString) {
        try {
            final DualInetAddress dualAddr = DualInetAddress.fromAddress(ipString);
            return dualAddr.hasInet4() ? dualAddr.getInet4() : dualAddr.getInet6();
        } catch (Exception e) {
            log.error("Failed to normalize ipaddr: {}", ipString, e);
            return null;
        }
    }

    @Override
    public Map<String, Boolean> getNodeStates() {
        Map<String, Boolean> ipStateMap = new TreeMap<>();
        // Read the live node list twice and only accept the snapshot once the two reads agree,
        // so a node flapping between reads cannot produce a contradictory up/down map.
        while (true) {
            List<String> upNodes = StorageService.instance.getLiveNodes();
            List<String> downNodes = StorageService.instance.getUnreachableNodes();
            List<String> upNodes2 = StorageService.instance.getLiveNodes();

            if (new HashSet<>(upNodes).equals(new HashSet<>(upNodes2))) {
                addAll(ipStateMap, upNodes, true);
                addAll(ipStateMap, downNodes, false);
                break;
            }
        }

        Map<String, DualInetAddress> idIpMap = this.coordinator.getInetAddessLookupMap().getControllerNodeIPLookupMap();

        Map<String, Boolean> idStateMap = new TreeMap<>();
        for (Map.Entry<String, DualInetAddress> entry : idIpMap.entrySet()) {
            DualInetAddress dualAddr = entry.getValue();
            Boolean state = dualAddr.hasInet4() ? ipStateMap.get(dualAddr.getInet4()) : null;
            if (state == null) {
                state = dualAddr.hasInet6() ? ipStateMap.get(dualAddr.getInet6()) : null;
            }
            if (state != null) {
                idStateMap.put(entry.getKey(), state);
            }
        }

        return idStateMap;
    }

    @Override
    public void removeNode(String nodeId) {
        Map<String, DualInetAddress> idMap = this.coordinator.getInetAddessLookupMap().getControllerNodeIPLookupMap();
        DualInetAddress dualAddr = idMap.get(nodeId);
        if (dualAddr == null) {
            String errMsg = String.format("Cannot find node with name %s", nodeId);
            log.error(errMsg);
            throw new IllegalArgumentException(errMsg);
        }

        Map<String, String> hostIdMap = StorageService.instance.getHostIdMap();
        Map<String, String> ipGuidMap = new HashMap<String, String>();
        for (Map.Entry<String, String> entry : hostIdMap.entrySet()) {
            ipGuidMap.put(normalizeInetAddress(entry.getKey()), entry.getValue());
        }

        String nodeGuid = dualAddr.hasInet4() ? ipGuidMap.get(dualAddr.getInet4()) : null;
        if (nodeGuid == null) {
            nodeGuid = dualAddr.hasInet6() ? ipGuidMap.get(dualAddr.getInet6()) : null;
        }

        if (nodeGuid == null) {
            String errMsg = String.format("Cannot find Cassandra node with IP address %s", dualAddr.toString());
            log.error(errMsg);
            throw new IllegalArgumentException(errMsg);
        }

        log.info("Removing Cassandra node {} on vipr node {}", nodeGuid, nodeId);
        ensureRemoveNode(nodeGuid);
    }

    @Override
    public void startNodeRepair(boolean canResume, boolean crossVdc) throws Exception {
        // The return value is ignored: since the interval passed in is 0 the result cannot be NOT_THE_TIME, and both
        // ALREADY_RUNNING and a newly started repair count as success. An already running repair may not match the
        // current cluster state, but that is no different from a repair that starts now and then sees the cluster
        // state change immediately afterwards.
        startNodeRepair(this.schemaUtil.getKeyspaceName(), canResume ? this.repairRetryTimes : 0, crossVdc, false);
    }

    private static DbRepairStatus getLastRepairStatus(DbRepairJobState state, String clusterDigest, int maxRetryTime) {
        if (state.getCurrentDigest() != null && (clusterDigest == null || clusterDigest.equals(state.getCurrentDigest()))) {
            if (state.getCurrentRetry() <= maxRetryTime) {
                return new DbRepairStatus(DbRepairStatus.Status.IN_PROGRESS,
                        new Date(state.getCurrentStartTime()), null, state.getCurrentProgress());
            } else {
                return new DbRepairStatus(DbRepairStatus.Status.FAILED,
                        new Date(state.getCurrentStartTime()), new Date(state.getCurrentUpdateTime()), state.getCurrentProgress());
            }
        }

        return getLastSucceededRepairStatus(state, clusterDigest);
    }

    private static DbRepairStatus getLastSucceededRepairStatus(DbRepairJobState state, String clusterDigest) {
        if (state.getLastSuccessDigest() != null && (clusterDigest == null || clusterDigest.equals(state.getLastSuccessDigest()))) {
            return new DbRepairStatus(DbRepairStatus.Status.SUCCESS,
                    new Date(state.getLastSuccessStartTime()), new Date(state.getLastSuccessEndTime()), 100);
        }

        return null;
    }

    @Override
    public DbRepairStatus getLastRepairStatus(boolean forCurrentNodesOnly) {
        try {
            DbRepairJobState state = DbRepairRunnable.queryRepairState(this.coordinator, this.schemaUtil.getKeyspaceName(),
                    this.schemaUtil.isGeoDbsvc());
            log.info("cluster state digest stored in ZK: {}", state.getCurrentDigest());

            DbRepairStatus retState = getLastRepairStatus(state,
                    forCurrentNodesOnly ? DbRepairRunnable.getClusterStateDigest() : null, this.repairRetryTimes);

            if (retState != null && retState.getStatus() == DbRepairStatus.Status.IN_PROGRESS) {
                // See if the current state holder is still active; if not, we need to resume the repair
                String lockName = DbRepairRunnable.getLockName();
                InterProcessLock lock = coordinator.getLock(lockName);

                String currentHolder = DbRepairRunnable.getSelfLockNodeId(lock);
                if (currentHolder == null) { // No thread is actually driving the repair, we need to resume it
                    if (startNodeRepair(this.schemaUtil.getKeyspaceName(), this.repairRetryTimes, false, true)) {
                        log.info("Successfully resumed a previously paused repair");
                    } else {
                        log.warn("Cannot resume a previously paused repair, it could be another thread resumed and finished it");
                    }
                }
            }

            return retState;
        } catch (Exception e) {
            log.error("Failed to get node repair state from ZK", e);
            return null;
        }
    }

    @Override
    public DbRepairStatus getLastSucceededRepairStatus(boolean forCurrentNodesOnly) {
        try {
            DbRepairJobState state = DbRepairRunnable.queryRepairState(this.coordinator, this.schemaUtil.getKeyspaceName(),
                    this.schemaUtil.isGeoDbsvc());

            return getLastSucceededRepairStatus(state, forCurrentNodesOnly ? DbRepairRunnable.getClusterStateDigest() : null);
        } catch (Exception e) {
            log.error("Failed to get node repair state from ZK", e);
            return null;
        }
    }

    @Override
    public void resetRepairState() {
        DbRepairRunnable.resetRepairState(this.coordinator, this.schemaUtil.getKeyspaceName(), this.schemaUtil.isGeoDbsvc());
    }

    public void start() {
        this.scheduledRepairTrigger = this.executor.scheduleWithFixedDelay(new Runnable() {
            @Override
            public void run() {
                try {
                    startNodeRepair(schemaUtil.getKeyspaceName(), repairRetryTimes, true, false);
                } catch (Exception e) {
                    log.error("Failed to trigger node repair", e);
                }
            }
        }, REPAIR_INITIAL_WAIT_FOR_DBSTART_MINUTES, repairFreqMin, TimeUnit.MINUTES);
    }

    @Override
    public void removeDataCenter(String dcName) {
        log.info("Remove Cassandra data center {}", dcName);
        List<InetAddress> allNodes = new ArrayList<>();
        Set<InetAddress> liveNodes = Gossiper.instance.getLiveMembers();
        allNodes.addAll(liveNodes);
        Set<InetAddress> unreachableNodes = Gossiper.instance.getUnreachableMembers();
        allNodes.addAll(unreachableNodes);

        for (InetAddress nodeIp : allNodes) {
            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
            String dc = snitch.getDatacenter(nodeIp);
            log.info("node {} belongs to data center {} ", nodeIp, dc);
            if (dc.equals(dcName)) {
                removeCassandraNode(nodeIp);
            }
        }
    }

    private void removeCassandraNode(InetAddress nodeIp) {
        Map<String, String> hostIdMap = StorageService.instance.getHostIdMap();
        String guid = hostIdMap.get(nodeIp.getHostAddress());
        // Skip the node removal if this node doesn't exist in the host id map
        if (guid != null) {
            log.info("Removing Cassandra node {} on vipr node {}", guid, nodeIp);
            Gossiper.instance.convict(nodeIp, 0);
            ensureRemoveNode(guid);
        } else {
            log.info("Skip removal of Cassandra node {} due to no host id found", nodeIp);
        }
    }

    /**
     * A safer method to remove a Cassandra node. Calls forceRemoveCompletion after REMOVE_NODE_TIMEOUT_MILLIS.
     * This helps prevent node removal from hanging due to CASSANDRA-6542.
     *
     * @param guid
     */
    public void ensureRemoveNode(final String guid) {
        ExecutorService exe = Executors.newSingleThreadExecutor();
        Future<?> future = exe.submit(new Runnable() {
            public void run() {
                StorageService.instance.removeNode(guid);
            }
        });
        try {
            future.get(REMOVE_NODE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
        } catch (TimeoutException e) {
            log.warn("removenode timeout, calling forceRemoveCompletion()");
            StorageService.instance.forceRemoveCompletion();
        } catch (InterruptedException | ExecutionException e) {
            log.warn("Exception calling removenode", e);
        } finally {
            exe.shutdownNow();
        }
    }

    /**
     * Check if data is synced with the remote data center specified by dcName. It checks hinted handoff logs.
     *
     * @return true - synced. Otherwise false
     */
    @Override
    public boolean isDataCenterSynced(String dcName) {
        log.info("Check if data synced with Cassandra data center {}", dcName);

        // Compact the HINTS column family and eliminate deleted hints before checking hinted handoff logs
        try {
            StorageService.instance.forceKeyspaceCompaction("system", SystemKeyspace.HINTS_CF);
        } catch (Exception ex) {
            log.warn("Fail to compact system HINTS_CF", ex);
        }

        List<InetAddress> allNodes = new ArrayList<>();
        Set<InetAddress> liveNodes = Gossiper.instance.getLiveMembers();
        allNodes.addAll(liveNodes);
        Set<InetAddress> unreachableNodes = Gossiper.instance.getUnreachableMembers();
        allNodes.addAll(unreachableNodes);

        for (InetAddress nodeIp : allNodes) {
            IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
            String dc = snitch.getDatacenter(nodeIp);
            if (dc.equals(dcName)) {
                log.info("Checking hinted handoff logs for node {} in data center {} ", nodeIp, dc);
                if (hasPendingHintedHandoff(nodeIp)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Check if there are pending hinted handoff logs for the given node
     *
     * @param endpoint
     * @return true - pending hinted handoff logs exist. Otherwise, false
     */
    private boolean hasPendingHintedHandoff(InetAddress endpoint) {
        List<String> endpointsWithPendingHints = HintedHandOffManager.instance.listEndpointsPendingHints();
        if (endpointsWithPendingHints.isEmpty()) {
            log.info("Skip data sync status check. No pending hinted handoff logs");
            return false;
        }

        log.info("Pending hinted hand off logs found at {}", endpointsWithPendingHints);
        // The pending-hints list is keyed by token, so convert this endpoint's host id to its token string before comparing
        UUID hostId = Gossiper.instance.getHostId(endpoint);
        final ByteBuffer hostIdBytes = ByteBuffer.wrap(UUIDGen.decompose(hostId));
        DecoratedKey epkey = StorageService.getPartitioner().decorateKey(hostIdBytes);
        Token.TokenFactory tokenFactory = StorageService.getPartitioner().getTokenFactory();
        String token = tokenFactory.toString(epkey.getToken());
        for (String unsyncedEndpoint : endpointsWithPendingHints) {
            if (token.equals(unsyncedEndpoint)) {
                log.info("Unsynced data found for : {}", endpoint);
                return true;
            }
        }
        return false;
    }

    public void init() {
        if (drUtil.isActiveSite()) {
            log.info("Register Cassandra node state listener on DR active site");
            dbNodeStateCallbackExecutor = Executors.newScheduledThreadPool(1);
            Gossiper.instance.register(endpointStateChangeSubscripter);
        }
    }

    private Site getSite(InetAddress endpoint) {
        IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
        String dcName = snitch.getDatacenter(endpoint);
        for (Site site : drUtil.listSites()) {
            String cassandraDcId = drUtil.getCassandraDcId(site);
            if (cassandraDcId.equals(dcName)) {
                return site;
            }
        }
        return null;
    }

    private void checkAndSetIncrementalSyncing(InetAddress endpoint) {
        Site site = getSite(endpoint);
        if (site == null) {
            log.info("Unknown site for {}. Skip HandoffLogDetector", endpoint);
            return;
        }

        log.info("Node {} in site {} comes online", endpoint, site.getUuid());
        if (site.getState() == SiteState.STANDBY_SYNCED) {
            SiteMonitorResult monitorResult = drUtil.getCoordinator().getTargetInfo(site.getUuid(), SiteMonitorResult.class);
            if (monitorResult == null || monitorResult.getDbQuorumLostSince() == 0) {
                log.info("No db quorum lost on standby site. Skip this node up event on {} ", endpoint);
                return;
            }
            if (hasPendingHintedHandoff(endpoint)) {
                log.info("Hinted handoff logs detected. Change site {} state to STANDBY_INCR_SYNCING", site.getUuid());
                site.setState(SiteState.STANDBY_INCR_SYNCING);
                drUtil.getCoordinator().persistServiceConfiguration(site.toConfiguration());
            }
        } else {
            log.info("Skip hinted handoff logs detector for {} due to site state is {}. ", endpoint, site.getState());
        }
    }

    // Cassandra node state listener
    private IEndpointStateChangeSubscriber endpointStateChangeSubscripter = new IEndpointStateChangeSubscriber() {
        @Override
        public void onJoin(InetAddress endpoint, EndpointState epState) {
        }

        @Override
        public void beforeChange(InetAddress endpoint, EndpointState currentState, ApplicationState newStateKey, VersionedValue newValue) {
        }

        @Override
        public void onChange(InetAddress endpoint, ApplicationState state, VersionedValue value) {
        }

        @Override
        public void onAlive(InetAddress endpoint, EndpointState state) {
            if (drUtil.isStandby()) {
                log.info("Skip node state change of {} on standby site", endpoint);
                return;
            }
            HandoffLogDetector detector = new HandoffLogDetector(endpoint);
            dbNodeStateCallbackExecutor.schedule(detector, 0, TimeUnit.SECONDS);
        }

        /**
         * Detect pending handoff logs and set STANDBY_INCR_SYNCING state if necessary
         */
        class HandoffLogDetector implements Runnable {
            private InetAddress endpoint;

            private HandoffLogDetector(InetAddress endpoint) {
                this.endpoint = endpoint;
            }

            public void run() {
                checkAndSetIncrementalSyncing(endpoint);
            }
        };

        @Override
        public void onDead(InetAddress endpoint, EndpointState state) {
        }

        @Override
        public void onRemove(InetAddress endpoint) {
        }

        @Override
        public void onRestart(InetAddress endpoint, EndpointState state) {
        }
    };
}
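/*
 * Illustrative sketch only: one way a remote JMX client could reach this MBean once the
 * JmxServerWrapper has exported it under DbManagerOps.MBEAN_NAME. The service URL, host and
 * port are hypothetical placeholders; in practice the DbManagerOps helper wraps this access,
 * so a hand-rolled proxy like this is only for illustration.
 *
 *   JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:7199/jmxrmi"); // hypothetical endpoint
 *   JMXConnector connector = JMXConnectorFactory.connect(url);
 *   try {
 *       MBeanServerConnection conn = connector.getMBeanServerConnection();
 *       DbManagerMBean dbManager = JMX.newMBeanProxy(conn, new ObjectName(DbManagerOps.MBEAN_NAME), DbManagerMBean.class);
 *       Map<String, Boolean> nodeStates = dbManager.getNodeStates(); // node id -> up (true) / down (false)
 *       dbManager.startNodeRepair(true, false);                      // resumable repair, local VDC only
 *   } finally {
 *       connector.close();
 *   }
 */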