/* * Copyright (c) 2015 EMC Corporation * All Rights Reserved */ package com.emc.storageos.systemservices.impl.vdc; import java.net.URI; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import com.emc.storageos.coordinator.client.model.*; import com.emc.storageos.coordinator.client.model.DrOperationStatus.InterState; import com.emc.storageos.db.client.DbClient; import com.emc.storageos.systemservices.impl.ipsec.IPsecManager; import org.apache.commons.lang3.StringUtils; import org.apache.curator.framework.recipes.locks.InterProcessLock; import org.jsoup.helper.StringUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.emc.storageos.coordinator.client.service.CoordinatorClient; import com.emc.storageos.coordinator.client.service.DrUtil; import com.emc.storageos.coordinator.client.service.NodeListener; import com.emc.storageos.coordinator.common.Configuration; import com.emc.storageos.coordinator.exceptions.RetryableCoordinatorException; import com.emc.storageos.db.client.util.VdcConfigUtil; import com.emc.storageos.security.audit.AuditLogManager; import com.emc.storageos.security.ipsec.IPsecConfig; import com.emc.storageos.services.OperationTypeEnum; import com.emc.storageos.services.util.Exec; import com.emc.storageos.svcs.errorhandling.resources.APIException; import com.emc.storageos.svcs.errorhandling.resources.ServiceCode; import com.emc.storageos.systemservices.exceptions.CoordinatorClientException; import com.emc.storageos.systemservices.exceptions.InvalidLockOwnerException; import com.emc.storageos.systemservices.impl.client.SysClientFactory; import com.emc.storageos.systemservices.impl.upgrade.UpgradeManager; import com.emc.storageos.systemservices.impl.util.AbstractManager; import org.springframework.beans.factory.annotation.Autowired; /** * Manage configuration properties for multivdc and disaster recovery. It listens on * SiteInfo znode changes. Once getting notified, it fetch vdc config from local db * or standby sites config from zk and update vdcconfig.properties on local disk. Genconfig * is supposed to be executed later to apply the new config changes. * * Data revision change and simulatenous cluster poweroff are also managed here */ public class VdcManager extends AbstractManager { private static final Logger log = LoggerFactory.getLogger(VdcManager.class); private IPsecConfig ipsecConfig; @Autowired private IPsecManager ipsecMgr; @Autowired private AuditLogManager auditMgr; @Autowired DbClient dbClient; // local and target info properties private PropertyInfoExt localVdcPropInfo; private PropertyInfoExt targetVdcPropInfo; private PowerOffState targetPowerOffState; private static final String POWEROFFTOOL_COMMAND = "/etc/powerofftool"; private static final String EVENT_SERVICE_TYPE = "DisasterRecovery"; private static final String AUDIT_DR_OPERATION_LOCK = "auditdroperation"; private static final int AUDIT_LOCK_WAIT_TIME_SEC = 5; // set to 2.5 minutes since it takes over 2m for ssh to timeout on non-reachable hosts private static final long SHUTDOWN_TIMEOUT_MILLIS = 150000; // Timeout in minutes for add/resume/data sync // If data synchronization takes long than this value, set site to error public static final int ADD_STANDBY_TIMEOUT_MILLIS = 20 * 60 * 1000; // 20 minutes public static final int RESUME_STANDBY_TIMEOUT_MILLIS = 20 * 60 * 1000; // 20 minutes public static final int REMOVE_STANDBY_TIMEOUT_MILLIS = 20 * 60 * 1000; // 20 minutes public static final int SWITCHOVER_TIMEOUT_MILLIS = 20 * 60 * 1000; // 20 minutes public static final int FAILOVER_STANDBY_SITE_TIMEOUT_MILLIS = 40 * 60 * 1000; // 40 minutes public static final int FAILOVER_ACTIVE_SITE_TIMEOUT_MILLIS = 40 * 60 * 1000; // 40 minutes private SiteInfo targetSiteInfo; private VdcConfigUtil vdcConfigUtil; private Map<String, VdcOpHandler> vdcOpHandlerMap; private Boolean backCompatPreYoda = false; public void setIpsecConfig(IPsecConfig ipsecConfig) { this.ipsecConfig = ipsecConfig; } public void setVdcOpHandlerMap(Map<String, VdcOpHandler> vdcOpHandlerMap) { this.vdcOpHandlerMap = vdcOpHandlerMap; } public void setBackCompatPreYoda(Boolean backCompat) { backCompatPreYoda = backCompat; } @Override protected URI getWakeUpUrl() { return SysClientFactory.URI_WAKEUP_VDC_MANAGER; } /** * Register site info listener to monitor site changes */ private void addSiteInfoListener() { try { coordinator.getCoordinatorClient().addNodeListener(new SiteInfoListener()); } catch (Exception e) { log.error("Fail to add node listener for site info target znode", e); throw APIException.internalServerErrors.addListenerFailed(); } log.info("Successfully added node listener for site info target znode"); } /** * the listener class to listen the site target node change. */ class SiteInfoListener implements NodeListener { public String getPath() { return String.format("/sites/%s/config/%s/%s", coordinator.getCoordinatorClient().getSiteId(), SiteInfo.CONFIG_KIND, SiteInfo.CONFIG_ID); } /** * called when user update the site */ @Override public void nodeChanged() { log.info("Site info changed. Waking up the vdc manager..."); wakeup(); } /** * called when connection state changed. */ @Override public void connectionStateChanged(State state) { log.info("Site info connection state changed to {}", state); if (state.equals(State.CONNECTED)) { log.info("Curator (re)connected. Waking up the vdc manager..."); wakeup(); } } } @Override protected void innerRun() { // need to distinguish persistent locks acquired from UpgradeManager/VdcManager/PropertyManager // otherwise they might release locks acquired by others when they start final String svcId = String.format("%s,vdc", coordinator.getMySvcId()); vdcConfigUtil = new VdcConfigUtil(coordinator.getCoordinatorClient()); vdcConfigUtil.setBackCompatPreYoda(backCompatPreYoda); addSiteInfoListener(); while (doRun) { log.debug("Main loop: Start"); // Step0: check if we have the reboot lock boolean hasLock; try { hasLock = hasRebootLock(svcId); } catch (Exception e) { log.info("Step0: Failed to verify if the current node has the reboot lock ", e); retrySleep(); continue; } if (hasLock) { try { releaseRebootLock(svcId); log.info("Step0: Released reboot lock for node: {}", svcId); wakeupOtherNodes(); } catch (InvalidLockOwnerException e) { log.error("Step0: Failed to release the reboot lock: Not owner."); } catch (Exception e) { log.info("Step0: Failed to release the reboot lock and will retry: {}", e.getMessage()); retrySleep(); continue; } } // Step1: publish current state, and set target if empty try { initializeLocalAndTargetInfo(); } catch (Exception e) { log.info("Step1 failed and will be retried:", e); retrySleep(); continue; } // Step2: power off if all nodes agree. log.info("Step2: Power off if poweroff state != NONE. {}", targetPowerOffState); try { gracefulPowerOffCluster(); } catch (Exception e) { log.error("Step2: Failed to poweroff. {}", e); } // Step3: set site error state if on active try { updateSiteErrors(); } catch (Exception e) { log.error("Step3: Failed to set site errors. {}", e); } // Step4: record DR operation audit log if on active try { auditCompletedDrOperation(); } catch (RuntimeException e) { log.error("Step4: Failed to record DR operation audit log. {}", e); } // Step5: update vdc configuration if changed log.info("Step5: If VDC configuration is changed update"); if (vdcPropertiesChanged()) { log.info("Step5: Current vdc properties are not same as target vdc properties. Updating."); log.debug("Current local vdc properties: " + localVdcPropInfo); log.debug("Target vdc properties: " + targetVdcPropInfo); try { updateVdcProperties(svcId); } catch (Exception e) { log.info("Step5: VDC properties update failed and will be retried:", e); // Restart the loop immediately so that we release the upgrade lock. continue; } continue; } // Step 6 : check backward compatible upgrade flag try { if (backCompatPreYoda) { if (isGeoConfig() && !isLeadVdcForGeoUpgrade()) { log.info("Skip pre-yoda upgrade handling for non lead vdc"); } else { log.info("Check if pre-yoda upgrade is done"); checkPreYodaUpgrade(); continue; } } } catch (Exception ex) { log.error("Step6: Failed to set back compat yoda upgrade. {}", ex); continue; } // Step7: sleep log.info("Step7: sleep"); longSleep(); } } /** * Initialize local and target info * * @throws Exception */ private void initializeLocalAndTargetInfo() throws Exception { // set target if empty targetSiteInfo = coordinator.getTargetInfo(SiteInfo.class); if (targetSiteInfo == null) { targetSiteInfo = new SiteInfo(); try { coordinator.setTargetInfo(targetSiteInfo, false); log.info("Step1b: Target site info set to: {}", targetSiteInfo); } catch (CoordinatorClientException e) { log.info("Step1b: Wait another control node to set target"); retrySleep(); throw e; } } // Initialize vdc prop info localVdcPropInfo = localRepository.getVdcPropertyInfo(); String localConfigVersion = localVdcPropInfo.getProperty(VdcConfigUtil.VDC_CONFIG_VERSION); coordinator.setNodeSessionScopeInfo(new VdcConfigVersion(localConfigVersion)); // ipsec key is a vdc property as well and saved in ZK. // targetVdcPropInfo = loadVdcConfigFromDatabase(); targetVdcPropInfo = loadVdcConfig(); if (isGeoUpgradeFromPreYoda()) { log.info("Detect vdc properties from preyoda. Keep local vdc config properties unchanged until all vdc configs are migrated to zk"); localVdcPropInfo.addProperty(VdcConfigUtil.VDC_CONFIG_VERSION, String.valueOf(targetSiteInfo.getVdcConfigVersion())); localRepository.setVdcPropertyInfo(localVdcPropInfo); } else { if (localVdcPropInfo.getProperty(VdcConfigUtil.VDC_CONFIG_VERSION) == null) { localVdcPropInfo = new PropertyInfoExt(targetVdcPropInfo.getAllProperties()); localVdcPropInfo.addProperty(VdcConfigUtil.VDC_CONFIG_VERSION, String.valueOf(targetSiteInfo.getVdcConfigVersion())); localRepository.setVdcPropertyInfo(localVdcPropInfo); String vdc_ids = targetVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS); String[] vdcIds = vdc_ids.split(","); if (vdcIds.length > 1) { log.info("More than one Vdc, rebooting"); reboot(); } } } targetPowerOffState = coordinator.getTargetInfo(PowerOffState.class); if (targetPowerOffState == null) { // only control node can set target try { // Set the updated property info in coordinator coordinator.setTargetInfo(new PowerOffState(PowerOffState.State.NONE)); targetPowerOffState = coordinator.getTargetInfo(PowerOffState.class); log.info("Step1b: Target poweroff state set to: {}", PowerOffState.State.NONE); } catch (CoordinatorClientException e) { log.info("Step1b: Wait another control node to set target"); retrySleep(); throw e; } } } /** * Load the vdc configurations * @return * @throws Exception */ private PropertyInfoExt loadVdcConfig() throws Exception { PropertyInfoExt targetVdcPropInfo = new PropertyInfoExt(vdcConfigUtil.genVdcProperties()); // This ipsec_status and ipsec_key properties are not normal system properties, // as they need be protected by double barrier to make sure they be changed and // synced to all nodes at the SAME time, or else the quorum of zk and db will be // broken. This is why we don't put them in system property. targetVdcPropInfo.addProperty(Constants.IPSEC_STATUS, ipsecConfig.getIpsecStatus()); targetVdcPropInfo.addProperty(Constants.IPSEC_KEY, ipsecConfig.getPreSharedKey()); return targetVdcPropInfo; } /** * Check if VDC configuration is different in the database vs. what is stored locally * * @return */ private boolean vdcPropertiesChanged() { long localVdcConfigVersion = localVdcPropInfo.getProperty(VdcConfigUtil.VDC_CONFIG_VERSION) == null ? 0 : Long.parseLong(localVdcPropInfo.getProperty(VdcConfigUtil.VDC_CONFIG_VERSION)); long targetVdcConfigVersion = targetSiteInfo.getVdcConfigVersion(); log.info("local vdc config version: {}, target vdc config version: {}", localVdcConfigVersion, targetVdcConfigVersion); return localVdcConfigVersion != targetVdcConfigVersion; } private boolean isGeoUpgradeFromPreYoda() { String vdcIds = localVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS); return !StringUtils.isEmpty(vdcIds) && vdcIds.contains(",") && StringUtils.isEmpty(localVdcPropInfo.getProperty(VdcConfigUtil.VDC_CONFIG_VERSION)); } private boolean isGeoConfig() { return targetVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS).contains(",") || localVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS).contains(","); } /** * Update vdc properties and reboot the node if * * @param svcId node service id * @throws Exception */ private void updateVdcProperties(String svcId) throws Exception { String action = targetSiteInfo.getActionRequired(); log.info("Step5: Process vdc op handlers, action = {}", action); VdcOpHandler opHandler = getOpHandler(action); opHandler.setTargetSiteInfo(targetSiteInfo); opHandler.setTargetVdcPropInfo(targetVdcPropInfo); opHandler.setLocalVdcPropInfo(localVdcPropInfo); opHandler.execute(); if (opHandler.isRollingRebootNeeded()) { log.info("Step5: Rolling reboot detected for vdc operation {}", action); rollingReboot(svcId); // keep same behaviour as previous releases. always do rolling reboot } else if (opHandler.isConcurrentRebootNeeded()) { log.info("Step5: Concurrent reboot for operation handler {}", action); commitVdcConfigVersionToLocal(); reboot(); } else if (isGeoConfigChange()) { log.info("Step5: Geo configuration changed, so concurrent reboot"); commitVdcConfigVersionToLocal(); reboot(); } else { commitVdcConfigVersionToLocal(); } } private boolean isGeoConfigChange() { boolean isGeo = targetVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS).contains(",") || localVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS).contains(","); return isGeo && !StringUtils.equals(targetVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS), localVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS)); } private void commitVdcConfigVersionToLocal() { // Flush vdc properties includes VDC_CONFIG_VERSION to disk PropertyInfoExt vdcProperty = new PropertyInfoExt(targetVdcPropInfo.getAllProperties()); vdcProperty.addProperty(VdcConfigUtil.VDC_CONFIG_VERSION, String.valueOf(targetSiteInfo.getVdcConfigVersion())); localRepository.setVdcPropertyInfo(vdcProperty); } /** * Create an operation handler for current vdc config change * * @param action * @return */ private VdcOpHandler getOpHandler(String action) { VdcOpHandler opHandler = vdcOpHandlerMap.get(action); if (opHandler == null) { throw new IllegalStateException(String.format("No VdcOpHandler defined for action %s" , action)); } return opHandler; } /** * If target poweroff state is not NONE, that means user has set it to STARTED. * in the checkAllNodesAgreeToPowerOff, all nodes, including control nodes and data nodes * will start to publish their poweroff state in the order of [NOTICED, ACKNOWLEDGED, POWEROFF]. * Every node can publish the next state only if it sees the previous state are found on every other nodes. * By doing this, we can gaurantee that all nodes receive the acknowledgement of powering among each other, * we can then safely poweroff. * No matter the poweroff failed or not, at the end, we reset the target poweroff state back to NONE. * CTRL-11690: the new behavior is if an agreement cannot be reached, a best-effort attempt to poweroff the * remaining nodes will be made, as if the force parameter is provided. */ private void gracefulPowerOffCluster() { if (targetPowerOffState != null && targetPowerOffState.getPowerOffState() != PowerOffState.State.NONE) { boolean forceSet = targetPowerOffState.getPowerOffState() == PowerOffState.State.FORCESTART; log.info("Step2: Trying to reach agreement with timeout on cluster poweroff"); if (checkAllNodesAgreeToPowerOff(forceSet) && initiatePoweroff(forceSet)) { resetTargetPowerOffState(); powerOffCluster(); } else { log.warn("Step2: Failed to reach agreement among all the nodes. Proceed with best-effort poweroff"); initiatePoweroff(true); resetTargetPowerOffState(); powerOffCluster(); } } } public void powerOffCluster() { log.info("powering off the cluster!"); final String[] cmd = { POWEROFFTOOL_COMMAND }; Exec.sudo(SHUTDOWN_TIMEOUT_MILLIS, cmd); } /** * Check if ongoing DR operation succeeded or failed, then record audit log accordingly and remove this operation record from ZK. */ private void auditCompletedDrOperation() { if (!drUtil.isActiveSite()) { return; } InterProcessLock lock = coordinator.getCoordinatorClient().getSiteLocalLock(AUDIT_DR_OPERATION_LOCK); boolean hasLock = false; try { hasLock = lock.acquire(AUDIT_LOCK_WAIT_TIME_SEC, TimeUnit.SECONDS); if (!hasLock) { return; } log.info("Local site is active, local node acquired lock, starting audit complete DR operations ..."); List<Configuration> configs = coordinator.getCoordinatorClient().queryAllConfiguration(DrOperationStatus.CONFIG_KIND); if (configs == null || configs.isEmpty()) { return; } for (Configuration config : configs) { DrOperationStatus operation = new DrOperationStatus(config); String siteId = operation.getSiteUuid(); InterState interState = operation.getInterState(); Site site = null; try { site = drUtil.getSiteFromLocalVdc(siteId); } catch (RetryableCoordinatorException e) { // It's expected that site id is not found if we're removing this site because it has been removed // Under this situation, just record audit log and clear DR operation status if (interState.equals(InterState.REMOVING_STANDBY) &&e.getServiceCode() == ServiceCode.COORDINATOR_SITE_NOT_FOUND) { this.auditMgr.recordAuditLog(null, null, EVENT_SERVICE_TYPE, getOperationType(interState), System.currentTimeMillis(), AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_END, siteId); coordinator.getCoordinatorClient().removeServiceConfiguration(config); log.info("DR operation status has been cleared: {}", operation); continue; } throw e; } SiteState currentState = site.getState(); if (currentState.equals(SiteState.STANDBY_ERROR)) { // Failed this.auditMgr.recordAuditLog(null, null, EVENT_SERVICE_TYPE, getOperationType(interState), System.currentTimeMillis(), AuditLogManager.AUDITLOG_FAILURE, AuditLogManager.AUDITOP_END, site.toBriefString()); } else if (!currentState.isDROperationOngoing()) { // Succeeded this.auditMgr.recordAuditLog(null, null, EVENT_SERVICE_TYPE, getOperationType(interState), System.currentTimeMillis(), AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_END, site.toBriefString()); } else { // Still ongoing, do nothing continue; } log.info(String.format("Site %s state has transformed from %s to %s", siteId, interState, currentState)); // clear this operation status coordinator.getCoordinatorClient().removeServiceConfiguration(config); log.info("DR operation status has been cleared: {}", operation); } } catch (Exception e) { log.error("Auditing DR operation failed with exception", e); } finally { try { if (hasLock) { lock.release(); } } catch (Exception e) { log.error("Failed to release DR operation audit lock", e); } } } private OperationTypeEnum getOperationType(InterState state) { OperationTypeEnum operationType = null; switch(state) { case ADDING_STANDBY: operationType = OperationTypeEnum.ADD_STANDBY; break; case REMOVING_STANDBY: operationType = OperationTypeEnum.REMOVE_STANDBY; break; case PAUSING_STANDBY: operationType = OperationTypeEnum.PAUSE_STANDBY; break; case RESUMING_STANDBY: operationType = OperationTypeEnum.RESUME_STANDBY; break; case SWITCHINGOVER_ACTIVE: operationType = OperationTypeEnum.ACTIVE_SWITCHOVER; break; case SWITCHINGOVER_STANDBY: operationType = OperationTypeEnum.STANDBY_SWITCHOVER; break; case FAILINGOVER_STANDBY: operationType = OperationTypeEnum.STANDBY_FAILOVER; break; case DEGRADING_STANDBY: operationType = OperationTypeEnum.STANDBY_DEGRADE; break; case REJOINING_STANDBY: operationType = OperationTypeEnum.STANDBY_REJOIN; break; } return operationType; } // TODO - let's see if we can move it to VdcOpHandler later private void updateSiteErrors() { CoordinatorClient coordinatorClient = coordinator.getCoordinatorClient(); if (!drUtil.isActiveSite()) { log.info("Step3: current site is a standby, nothing to do"); return; } for(Site site : drUtil.listSites()) { SiteError error = getSiteError(site); if (error != null) { log.info("set site {} state to STANDBY_ERROR, set lastState to {}",site.getName(),site.getState()); coordinatorClient.setTargetInfo(site.getUuid(), error); site.setLastState(site.getState()); site.setState(SiteState.STANDBY_ERROR); coordinatorClient.persistServiceConfiguration(site.toConfiguration()); } } } private SiteError getSiteError(Site site) { SiteError error = null; long lastSiteUpdateTime = site.getLastStateUpdateTime(); long currentTime = System.currentTimeMillis(); int drOpTimeoutMillis; switch(site.getState()) { case STANDBY_ADDING: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_ADD_STANDBY_TIMEOUT, ADD_STANDBY_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to add standby timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.addStandbyFailedTimeout( drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; case STANDBY_RESUMING: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_RESUME_STANDBY_TIMEOUT, RESUME_STANDBY_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to resume standby timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.resumeStandbyFailedTimeout( drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; case STANDBY_REMOVING: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_REMOVE_STANDBY_TIMEOUT, REMOVE_STANDBY_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to remove standby timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.removeStandbyFailedTimeout( drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; case ACTIVE_SWITCHING_OVER: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_SWITCHOVER_TIMEOUT, SWITCHOVER_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to switchover timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.switchoverActiveFailedTimeout( site.getName(), drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; case STANDBY_SWITCHING_OVER: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_SWITCHOVER_TIMEOUT, SWITCHOVER_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to switchover timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.switchoverStandbyFailedTimeout( site.getName(), drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; case STANDBY_FAILING_OVER: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_FAILOVER_STANDBY_SITE_TIMEOUT, FAILOVER_STANDBY_SITE_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to failover timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.failoverFailedTimeout( site.getName(), drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; case ACTIVE_FAILING_OVER: drOpTimeoutMillis = drUtil.getDrIntConfig(DrUtil.KEY_FAILOVER_ACTIVE_SITE_TIMEOUT, FAILOVER_ACTIVE_SITE_TIMEOUT_MILLIS); if (currentTime - lastSiteUpdateTime > drOpTimeoutMillis) { log.warn("Step3: Site {} set to error due to failover timeout", site.getName()); error = new SiteError(APIException.internalServerErrors.failoverFailedTimeout( site.getName(), drOpTimeoutMillis / 60 / 1000),site.getState().name()); } break; } return error; } private void checkPreYodaUpgrade() throws Exception { if (!dbMigrationDone()) { log.info("Migration to yoda is not completed. Sleep and retry later. isMigrationDone flag = {}", coordinator.isDBMigrationDone()); retrySleep(); return; } if (!drUtil.isAllSitesStable()) { log.info("Current cluster is not stable. Skip and retry later"); retrySleep(); return; } if (isGeoConfig() && !allVdcGetUpgradedToYoda()) { log.info("Sleep and wait for all vdc upgraded to yoda."); retrySleep(); return; } log.info("Db migration is done. Switch to IPSec mode"); enableIpsec(); } private void enableIpsec() throws Exception{ InterProcessLock lock = null; try { lock = coordinator.getCoordinatorClient().getSiteLocalLock("ipseclock"); lock.acquire(); log.info("Acquired the lock {}", "ipseclock"); String preSharedKey = ipsecConfig.getPreSharedKeyFromZK(); if (StringUtil.isBlank(preSharedKey)) { log.info("No pre shared key in zk, generate a new key"); ipsecMgr.rotateKey(true); } else { log.info("First ipsec key found in zk. No need to regenerate it"); } } finally { if (lock != null) { lock.release(); } } } /** * We pick only one vdc assumes the role to rotate ipsec key in post yoda. As default the vdc with * least vdc short id is the lead * * @return true if current vdc is the lead */ private boolean isLeadVdcForGeoUpgrade() { String localId = drUtil.getLocalVdcShortId(); String strVdcIds = targetVdcPropInfo.getProperty(VdcConfigUtil.VDC_IDS); String[] vdcIds = strVdcIds.split(","); for (String id : vdcIds) { if (localId.compareToIgnoreCase(id) > 0 ) { log.info("Current VDC {} is greater than {}.", localId, id); return false; } } log.info("Current VDC {} is the lead in current geo {}", localId, strVdcIds); return true; } private boolean dbMigrationDone() { String currentDbSchemaVersion = coordinator.getCurrentDbSchemaVersion(); String targetDbSchemaVersion = coordinator.getCoordinatorClient().getTargetDbSchemaVersion(); log.info("Current schema version {}", currentDbSchemaVersion); return targetDbSchemaVersion.equals(currentDbSchemaVersion) && coordinator.isDBMigrationDone(); } private boolean allVdcGetUpgradedToYoda() { boolean toYOda = dbClient.checkGeoCompatible("2.5"); log.info("If Geo DB is upgraded to Yoda: {}", toYOda); return toYOda; } private void rollingReboot(String svcId) { while (doRun) { log.info("Acquiring reboot lock for geo config change."); if (!getRebootLock(svcId)) { retrySleep(); } else if (!isQuorumMaintained()) { releaseRebootLock(svcId); retrySleep(); } else { commitVdcConfigVersionToLocal(); reboot(); } } } }