/* * Copyright (c) 2015 EMC Corporation * All Rights Reserved */ package com.emc.storageos.api.service.impl.resource; import com.emc.storageos.coordinator.client.model.SiteNetworkState; import com.emc.storageos.coordinator.client.model.SiteNetworkState.NetworkHealth; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.crypto.SecretKey; import javax.ws.rs.Consumes; import javax.ws.rs.DELETE; import javax.ws.rs.GET; import javax.ws.rs.HeaderParam; import javax.ws.rs.POST; import javax.ws.rs.PUT; import javax.ws.rs.Path; import javax.ws.rs.PathParam; import javax.ws.rs.Produces; import javax.ws.rs.QueryParam; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; import com.emc.storageos.coordinator.client.service.impl.DualInetAddress; import org.apache.commons.codec.binary.Base64; import org.apache.commons.lang.StringUtils; import org.apache.curator.framework.recipes.barriers.DistributedBarrier; import org.apache.curator.framework.recipes.leader.LeaderSelector; import org.apache.curator.framework.recipes.locks.InterProcessLock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import com.emc.storageos.api.mapper.SiteMapper; import com.emc.storageos.api.service.impl.resource.utils.InternalSiteServiceClient; import com.emc.storageos.coordinator.client.model.Constants; import com.emc.storageos.coordinator.client.model.PropertyInfoExt; import com.emc.storageos.coordinator.client.model.RepositoryInfo; import com.emc.storageos.coordinator.client.model.Site; import com.emc.storageos.coordinator.client.model.SiteError; import 
com.emc.storageos.coordinator.client.model.SiteInfo; import com.emc.storageos.coordinator.client.model.SiteMonitorResult; import com.emc.storageos.coordinator.client.model.SiteState; import com.emc.storageos.coordinator.client.model.SoftwareVersion; import com.emc.storageos.coordinator.client.model.DrOperationStatus.InterState; import com.emc.storageos.coordinator.client.service.CoordinatorClient; import com.emc.storageos.coordinator.client.service.DrUtil; import com.emc.storageos.coordinator.client.service.impl.LeaderSelectorListenerImpl; import com.emc.storageos.coordinator.common.Configuration; import com.emc.storageos.coordinator.common.impl.ZkPath; import com.emc.storageos.coordinator.exceptions.CoordinatorException; import com.emc.storageos.coordinator.exceptions.RetryableCoordinatorException; import com.emc.storageos.db.client.DbClient; import com.emc.storageos.db.client.model.StringMap; import com.emc.storageos.db.client.model.uimodels.InitialSetup; import com.emc.storageos.model.dr.DRNatCheckParam; import com.emc.storageos.model.dr.DRNatCheckResponse; import com.emc.storageos.model.dr.FailoverPrecheckResponse; import com.emc.storageos.model.dr.SiteActive; import com.emc.storageos.model.dr.SiteAddParam; import com.emc.storageos.model.dr.SiteConfigParam; import com.emc.storageos.model.dr.SiteConfigRestRep; import com.emc.storageos.model.dr.SiteDetailRestRep; import com.emc.storageos.model.dr.SiteErrorResponse; import com.emc.storageos.model.dr.SiteIdListParam; import com.emc.storageos.model.dr.SiteList; import com.emc.storageos.model.dr.SiteParam; import com.emc.storageos.model.dr.SiteRemoved; import com.emc.storageos.model.dr.SiteRestRep; import com.emc.storageos.model.dr.SiteUpdateParam; import com.emc.storageos.model.property.PropertyConstants; import com.emc.storageos.model.property.PropertyInfo; import com.emc.storageos.security.audit.AuditLogManager; import com.emc.storageos.security.authentication.InternalApiSignatureKeyGenerator; import 
com.emc.storageos.security.authentication.InternalApiSignatureKeyGenerator.SignatureKeyType; import com.emc.storageos.security.authorization.CheckPermission; import com.emc.storageos.security.authorization.DefaultPermissions; import com.emc.storageos.security.authorization.ExcludeLicenseCheck; import com.emc.storageos.security.authorization.Role; import com.emc.storageos.security.ipsec.IPsecConfig; import com.emc.storageos.services.OperationTypeEnum; import com.emc.storageos.services.util.SysUtils; import com.emc.storageos.svcs.errorhandling.resources.APIException; import com.emc.storageos.svcs.errorhandling.resources.InternalServerErrorException; import com.emc.vipr.client.ViPRCoreClient; import com.emc.vipr.client.ViPRSystemClient; import com.emc.vipr.model.sys.ClusterInfo; /** * APIs implementation to standby sites lifecycle management such as add-standby, remove-standby, failover, pause * resume replication etc. */ @Path("/site") @DefaultPermissions(readRoles = { Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_MONITOR }, writeRoles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }) public class DisasterRecoveryService { private static final Logger log = LoggerFactory.getLogger(DisasterRecoveryService.class); private static final String SHORTID_FMT = "site%d"; private static final int MAX_NUM_OF_STANDBY = 10; private static final String EVENT_SERVICE_TYPE = "DisasterRecovery"; private static final String NTPSERVERS = "network_ntpservers"; private static final int SITE_NAME_LENGTH_LIMIT = 64; private static final int SITE_NUMBER_UPPER_LIMIT = 3; private static final int SITE_CONNECT_TEST_TIMEOUT = 10 * 1000; private static final int SITE_CONNECTION_TEST_PORT = 443; private static final String LOCAL_HOST = "localhost"; private static final String SYSTEM_ENABLE_FIREWALL = "system_enable_firewall"; private InternalApiSignatureKeyGenerator apiSignatureGenerator; private SiteMapper siteMapper; 
private SysUtils sysUtils; private CoordinatorClient coordinator; private DbClient dbClient; private IPsecConfig ipsecConfig; private DrUtil drUtil; @Autowired private AuditLogManager auditMgr; /** * Record audit log for DisasterRecoveryService * * @param auditType * @param operationalStatus * @param operationStage * @param descparams */ protected void auditDisasterRecoveryOps(OperationTypeEnum auditType, String operationalStatus, String operationStage, Object... descparams) { auditMgr.recordAuditLog(null, null, EVENT_SERVICE_TYPE, auditType, System.currentTimeMillis(), operationalStatus, operationStage, descparams); } /** * init method, this will be called by Spring framework after create bean successfully */ public void init() { siteMapper = new SiteMapper(); startLeaderSelector(); } /** * Attach one fresh install site to this active site as standby * Or attach a active site for the local standby site when it's first being added. * * @param param site detail information * @return site response information */ @POST @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public SiteRestRep addStandby(SiteAddParam param) { log.info("Adding standby site: {}", param.getVip()); precheckForSiteNumber(); precheckForGeo(); List<Site> existingSites = drUtil.listStandbySites(); // parameter validation and precheck validateAddParam(param, existingSites); // check the version before using the ViPR client, otherwise there might be compatibility issues. 
precheckStandbyVersion(param); ViPRCoreClient viprCoreClient; SiteConfigRestRep standbyConfig; try { viprCoreClient = createViPRCoreClient(param.getVip(), param.getUsername(), param.getPassword()); standbyConfig = viprCoreClient.site().getStandbyConfig(); } catch (Exception e) { log.error("Unexpected error when retrieving standby config", e); throw APIException.internalServerErrors.addStandbyPrecheckFailed("Cannot retrieve config from standby site"); } String siteId = standbyConfig.getUuid(); precheckForStandbyAdd(standbyConfig, viprCoreClient); InterProcessLock lock = drUtil.getDROperationLock(); Site standbySite = null; try { standbySite = new Site(); standbySite.setCreationTime((new Date()).getTime()); standbySite.setName(param.getName()); standbySite.setVdcShortId(drUtil.getLocalVdcShortId()); standbySite.setVip(standbyConfig.getVip()); standbySite.setVip6(standbyConfig.getVip6()); standbySite.getHostIPv4AddressMap().putAll(new StringMap(standbyConfig.getHostIPv4AddressMap())); standbySite.getHostIPv6AddressMap().putAll(new StringMap(standbyConfig.getHostIPv6AddressMap())); standbySite.setNodeCount(standbyConfig.getNodeCount()); standbySite.setUuid(standbyConfig.getUuid()); String shortId = generateShortId(drUtil.listSites()); standbySite.setSiteShortId(shortId); standbySite.setDescription(param.getDescription()); standbySite.setState(SiteState.STANDBY_ADDING); if (log.isDebugEnabled()) { log.debug(standbySite.toString()); } // Do this before tx get started which might write key to zk. 
SecretKey secretKey = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API); coordinator.startTransaction(); coordinator.addSite(standbyConfig.getUuid()); log.info("Persist standby site to ZK {}", shortId); // coordinator.setTargetInfo(standbySite); coordinator.persistServiceConfiguration(standbySite.toConfiguration()); drUtil.recordDrOperationStatus(standbySite.getUuid(), InterState.ADDING_STANDBY); // wake up syssvc to regenerate configurations long vdcConfigVersion = DrUtil.newVdcConfigVersion(); drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_ADD_STANDBY, vdcConfigVersion); for (Site site : existingSites) { drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_ADD_STANDBY, vdcConfigVersion); } // sync site related info with to be added standby site long dataRevision = vdcConfigVersion; List<Site> allStandbySites = new ArrayList<>(); allStandbySites.add(standbySite); allStandbySites.addAll(existingSites); SiteConfigParam configParam = prepareSiteConfigParam(allStandbySites, ipsecConfig.getPreSharedKey(), standbyConfig.getUuid(), dataRevision, vdcConfigVersion, secretKey); viprCoreClient.site().syncSite(standbyConfig.getUuid(), configParam); drUtil.updateVdcTargetVersion(siteId, SiteInfo.DR_OP_CHANGE_DATA_REVISION, vdcConfigVersion, dataRevision); coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.ADD_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, standbySite.toBriefString()); return siteMapper.map(standbySite); } catch (Exception e) { log.error("Internal error for updating coordinator on standby", e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.ADD_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, standbySite.toBriefString()); InternalServerErrorException addStandbyFailedException = APIException.internalServerErrors.addStandbyFailed(e.getMessage()); throw addStandbyFailedException; } finally { try { lock.release(); } catch (Exception 
ignore) { log.error(String.format("Lock release failed when adding standby %s", siteId)); } } } /** * Prepare all sites related info for synchronizing them from master to be added or resumed standby site * * @param standbySites All standby sites * @param ipsecKey The cluster ipsec key * @param targetStandbyUUID The uuid of the target standby * @param targetStandbyDataRevision The data revision of the target standby * @return SiteConfigParam all the sites configuration */ private SiteConfigParam prepareSiteConfigParam(List<Site> standbySites, String ipsecKey, String targetStandbyUUID, long targetStandbyDataRevision, long vdcConfigVersion, SecretKey secretKey) { log.info("Preparing to sync sites info among to be added/resumed standby site..."); Site active = drUtil.getActiveSite(); SiteConfigParam configParam = new SiteConfigParam(); SiteParam activeSite = new SiteParam(); siteMapper.map(active, activeSite); activeSite.setIpsecKey(ipsecKey); log.info(" active site info:{}", activeSite.toString()); configParam.setActiveSite(activeSite); List<SiteParam> standbySitesParam = new ArrayList<>(); for (Site standby : standbySites) { SiteParam standbyParam = new SiteParam(); siteMapper.map(standby, standbyParam); standbyParam.setSecretKey(new String(Base64.encodeBase64(secretKey.getEncoded()), Charset.forName("UTF-8"))); if (standby.getUuid().equals(targetStandbyUUID)) { log.info("Set data revision for site {} to {}", standby.getUuid(), targetStandbyDataRevision); standbyParam.setDataRevision(targetStandbyDataRevision); } standbySitesParam.add(standbyParam); log.info(" standby site info:{}", standbyParam.toString()); } configParam.setStandbySites(standbySitesParam); configParam.setVdcConfigVersion(vdcConfigVersion); // Need set stanby's NTP same as primary, so standby time is consistent with primary after reboot // It's because time inconsistency between primary and standby will cause db rebuild issue: COP-17965 PropertyInfoExt targetPropInfo = 
coordinator.getTargetInfo(PropertyInfoExt.class); String ntpServers = targetPropInfo.getProperty(NTPSERVERS); log.info(" active site ntp servers: {}", ntpServers); configParam.setNtpServers(ntpServers); return configParam; } /** * Initialize a to be added target standby * The current site will be demoted from active to standby during the process * * @param configParam * @return */ @PUT @Path("/{uuid}/initstandby") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) @ExcludeLicenseCheck public Response syncSites(SiteConfigParam configParam) { log.info("sync sites from active site"); return initStandby(configParam); } /** * Initialize a to-be added/resumed target standby * a) re-set all the latest site related info (persisted in ZK) in the target standby * b) vdc properties would be changed accordingly * c) the target standby reboot * d) re-set zk/db data during the target standby reboot * e) the target standby would connect with active and sync all the latest ZK&DB data. * * Scenarios: * a) For adding standby site scenario (External API), the current site will be demoted from active to standby during the process * b) For resuming standby site scenario (Internal API), the current site's original data will be cleaned by setting new data revision. 
* It is now only used for resuming long paused (> 5 days) standby site * * @param configParam * @return */ @PUT @Path("/internal/initstandby") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public Response initStandby(SiteConfigParam configParam) { try { SiteParam activeSiteParam = configParam.getActiveSite(); ipsecConfig.setPreSharedKey(activeSiteParam.getIpsecKey()); log.info("Clean up all obsolete site configurations"); String activeSiteId = activeSiteParam.getUuid(); Set<String> standbySiteIds = new HashSet<>(); for (SiteParam standby : configParam.getStandbySites()) { standbySiteIds.add(standby.getUuid()); } for (Site siteToRemove : drUtil.listSites()) { String siteId = siteToRemove.getUuid(); if (activeSiteId.equals(siteId) || standbySiteIds.contains(siteId)) { continue; } drUtil.removeSite(siteToRemove); } coordinator.addSite(activeSiteParam.getUuid()); Site activeSite = new Site(); siteMapper.map(activeSiteParam, activeSite); activeSite.setVdcShortId(drUtil.getLocalVdcShortId()); coordinator.persistServiceConfiguration(activeSite.toConfiguration()); Long dataRevision = null; // Add other standby sites for (SiteParam standby : configParam.getStandbySites()) { Site site = new Site(); siteMapper.map(standby, site); site.setVdcShortId(drUtil.getLocalVdcShortId()); coordinator.persistServiceConfiguration(site.toConfiguration()); coordinator.addSite(standby.getUuid()); if (standby.getUuid().equals(coordinator.getSiteId())) { dataRevision = standby.getDataRevision(); log.info("Set data revision to {}", dataRevision); } log.info("Persist standby site {} to ZK", standby.getVip()); } if (dataRevision == null) { throw new IllegalStateException("Illegal request on standby site. 
No data revision in request"); } String ntpServers = configParam.getNtpServers(); PropertyInfoExt targetPropInfo = coordinator.getTargetInfo(PropertyInfoExt.class); if (ntpServers != null && !ntpServers.equals(targetPropInfo.getProperty(NTPSERVERS))) { targetPropInfo.addProperty(NTPSERVERS, ntpServers); coordinator.setTargetInfo(targetPropInfo); log.info("Set ntp servers to {}", ntpServers); } drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_CHANGE_DATA_REVISION, configParam.getVdcConfigVersion(), dataRevision); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Internal error for updating coordinator on standby", e); throw APIException.internalServerErrors.configStandbyFailed(e.getMessage()); } } /** * Get all sites including standby and active * * @return site list contains all sites with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) public SiteList getSites() { log.info("Begin to list all standby sites of local VDC"); SiteList standbyList = new SiteList(); for (Site site : drUtil.listSites()) { standbyList.getSites().add(siteMapper.mapWithNetwork(site, drUtil)); } SiteInfo siteInfo = coordinator.getTargetInfo(coordinator.getSiteId(), SiteInfo.class); standbyList.setConfigVersion(siteInfo.getVdcConfigVersion()); return standbyList; } /** * Check if current site is active site * * @return SiteActive true if current site is active else false */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/active") public SiteActive checkIsActive() { log.info("Begin to check if site Active or Standby"); SiteActive isActiveSite = new SiteActive(); try { Site localSite = drUtil.getLocalSite(); isActiveSite.setIsActive(localSite.getState() == SiteState.ACTIVE); 
isActiveSite.setLocalSiteName(localSite.getName()); isActiveSite.setLocalUuid(localSite.getUuid()); isActiveSite.setIsMultiSite(drUtil.isMultisite()); return isActiveSite; } catch (Exception e) { log.error("Can't get site is Active or Standby"); throw APIException.badRequests.siteIdNotFound(); } } /** * Get specified site according site UUID * * @param uuid site UUID * @return site response with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/{uuid}") public SiteRestRep getSite(@PathParam("uuid") String uuid) { log.info("Begin to get standby site by uuid {}", uuid); try { Site site = drUtil.getSiteFromLocalVdc(uuid); return siteMapper.mapWithNetwork(site, drUtil); } catch (Exception e) { log.error("Can't find site with specified site ID {}", uuid); throw APIException.badRequests.siteIdNotFound(); } } /** * Get local site * * @return site response with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/local") public SiteRestRep getSite() { log.info("Begin to get local site"); try { Site site = drUtil.getLocalSite(); return siteMapper.map(site); } catch (Exception e) { log.error("Can't find local site", e); throw APIException.badRequests.siteIdNotFound(); } } /** * @return result that indicates whether local site is removed */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/islocalsiteremoved") public SiteRemoved isLocalSiteRemoved() { SiteRemoved response = new SiteRemoved(); Site localSite = drUtil.getLocalSite(); if (SiteState.ACTIVE == localSite.getState()) { return response; } for (Site remoteSite : drUtil.listSites()) { 
if (remoteSite.getUuid().equals(localSite.getUuid())) { continue; } try (InternalSiteServiceClient client = new InternalSiteServiceClient(remoteSite, coordinator, apiSignatureGenerator)) { SiteList sites = client.getSiteList(); if (!isActiveSite(remoteSite.getUuid(), sites)) { continue; } if (isSiteContainedBy(localSite.getUuid(), sites)) { return response; } else { log.info("According returned result from current active site {}, local site {} has been removed", remoteSite.getUuid(), localSite.getUuid()); response.setIsRemoved(true); return response; } } catch (Exception e) { log.warn("Error happened when fetching site list from site {}", remoteSite.getUuid(), e); continue; } } return response; } private boolean isActiveSite(String siteId, SiteList sites) { for (SiteRestRep site : sites.getSites()) { if (siteId.equals(site.getUuid()) && SiteState.ACTIVE.toString().equals(site.getState())) { return true; } } return false; } /** * Remove a standby. After successfully done, it stops data replication to this site * * @param uuid standby site uuid * @return */ @DELETE @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) @Path("/{uuid}") public Response remove(@PathParam("uuid") String uuid) { SiteIdListParam param = new SiteIdListParam(); param.getIds().add(uuid); return remove(param); } /** * Remove multiple standby sites. 
After successfully done, it stops data replication to those sites * * @param idList site uuid list to be removed * @return */ @POST @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) @Path("/remove") public Response remove(SiteIdListParam idList) { List<String> siteIdList = idList.getIds(); String siteIdStr = StringUtils.join(siteIdList, ","); log.info("Begin to remove standby site from local vdc by uuid: {}", siteIdStr); List<Site> toBeRemovedSites = new ArrayList<>(); for (String siteId : siteIdList) { Site site; try { site = drUtil.getSiteFromLocalVdc(siteId); } catch (Exception ex) { log.error("Can't load site {} from ZK", siteId); throw APIException.badRequests.siteIdNotFound(); } if (site.getState().equals(SiteState.ACTIVE)) { log.error("Unable to remove this site {}. It is active", siteId); throw APIException.badRequests.operationNotAllowedOnActiveSite(); } if (site.getState().isDROperationOngoing() && !site.getState().equals(SiteState.STANDBY_SYNCING)) { log.error("Unable to remove this site {} in state {}. 
" + "DR operation other than STANDBY_SYNCING is ongoing", siteId, site.getState().name()); throw APIException.internalServerErrors.concurrentDROperationNotAllowed(site.getName(), site.getState().toString()); } toBeRemovedSites.add(site); } // Build a site names' string for more human-readable Exception error message StringBuilder siteNamesSb = new StringBuilder(); for (Site site : toBeRemovedSites) { if (siteNamesSb.length() != 0) { siteNamesSb.append(", "); } siteNamesSb.append(site.getName()); } String SiteNamesStr = siteNamesSb.toString(); try { commonPrecheck(siteIdList); } catch (APIException e) { throw e; } catch (Exception e) { throw APIException.internalServerErrors.removeStandbyPrecheckFailed(SiteNamesStr, e.getMessage()); } InterProcessLock lock = drUtil.getDROperationLock(false); List<String> sitesString = new ArrayList<>(); try { log.info("Removing sites"); coordinator.startTransaction(); for (Site site : toBeRemovedSites) { site.setState(SiteState.STANDBY_REMOVING); coordinator.persistServiceConfiguration(site.toConfiguration()); drUtil.recordDrOperationStatus(site.getUuid(), InterState.REMOVING_STANDBY); sitesString.add(site.toBriefString()); } log.info("Notify all sites for reconfig"); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); for (Site standbySite : drUtil.listSites()) { drUtil.updateVdcTargetVersion(standbySite.getUuid(), SiteInfo.DR_OP_REMOVE_STANDBY, vdcTargetVersion); } coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.REMOVE_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, StringUtils.join(sitesString, ',')); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Failed to remove site {}", siteIdStr, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.REMOVE_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, StringUtils.join(sitesString, ',')); throw APIException.internalServerErrors.removeStandbyFailed(SiteNamesStr, 
e.getMessage()); } finally { try { lock.release(); } catch (Exception ignore) { log.error(String.format("Lock release failed when removing standby sites: %s", siteIdStr)); } } } /** * Get standby site configuration * * @return SiteConfigRestRep standby site configuration. */ @GET @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/localconfig") public SiteConfigRestRep getStandbyConfig() { log.info("Begin to get standby config"); String siteId = coordinator.getSiteId(); SecretKey key = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API); Site site = drUtil.getSiteFromLocalVdc(siteId); SiteConfigRestRep siteConfigRestRep = new SiteConfigRestRep(); siteConfigRestRep.setUuid(siteId); siteConfigRestRep.setVip(site.getVip()); siteConfigRestRep.setVip6(site.getVip6()); siteConfigRestRep.setSecretKey(new String(Base64.encodeBase64(key.getEncoded()), Charset.forName("UTF-8"))); siteConfigRestRep.setHostIPv4AddressMap(site.getHostIPv4AddressMap()); siteConfigRestRep.setHostIPv6AddressMap(site.getHostIPv6AddressMap()); siteConfigRestRep.setDbSchemaVersion(coordinator.getCurrentDbSchemaVersion()); siteConfigRestRep.setFreshInstallation(isFreshInstallation()); siteConfigRestRep.setClusterStable(isClusterStable()); siteConfigRestRep.setNodeCount(site.getNodeCount()); siteConfigRestRep.setState(site.getState().toString()); try { siteConfigRestRep.setSoftwareVersion(coordinator.getTargetInfo(RepositoryInfo.class) .getCurrentVersion().toString()); } catch (Exception e) { log.error("Fail to get software version {}", e); } log.info("Return result: {}", siteConfigRestRep); return siteConfigRestRep; } @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, 
Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) @Path("/natcheck") @ExcludeLicenseCheck public DRNatCheckResponse checkIfBehindNat(DRNatCheckParam checkParam, @HeaderParam("X-Forwarded-For") String clientIp) { if (checkParam == null) { log.error("checkParam is null, X-Forwarded-For is {}", clientIp); throw APIException.internalServerErrors.invalidNatCheckCall("(null)", clientIp); } String ipv4Str = checkParam.getIPv4Address(); String ipv6Str = checkParam.getIPv6Address(); log.info(String.format("Performing NAT check, client address connecting to VIP: %s. Client reports its IPv4 = %s, IPv6 = %s", clientIp, ipv4Str, ipv6Str)); boolean isBehindNat = false; try { isBehindNat = sysUtils.checkIfBehindNat(ipv4Str, ipv6Str, clientIp); } catch (Exception e) { log.error("Fail to check NAT {}", e); throw APIException.internalServerErrors.invalidNatCheckCall(e.getMessage(), clientIp); } DRNatCheckResponse resp = new DRNatCheckResponse(); resp.setSeenIp(clientIp); resp.setBehindNAT(isBehindNat); return resp; } /** * Pause a standby site that is already sync'ed with the active * * @param uuid site UUID * @return updated standby site representation */ @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN}, blockProxies = true) @Path("/{uuid}/pause") public Response pauseStandby(@PathParam("uuid") String uuid) { SiteIdListParam param = new SiteIdListParam(); param.getIds().add(uuid); return pause(param); } /** * Pause data replication to multiple standby sites. 
* * @param idList site uuid list to be removed * @return */ @POST @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN }, blockProxies = true) @Path("/pause") public Response pause(SiteIdListParam idList) { List<String> siteIdList = idList.getIds(); String siteIdStr = StringUtils.join(siteIdList, ","); log.info("Begin to pause standby site from local vdc by uuid: {}", siteIdStr); List<Site> toBePausedSites = new ArrayList<>(); List<String> siteNameList = new ArrayList<>(); for (String siteId : siteIdList) { Site site; try { site = drUtil.getSiteFromLocalVdc(siteId); } catch (Exception ex) { log.error("Can't load site {} from ZK", siteId); throw APIException.badRequests.siteIdNotFound(); } SiteState state = site.getState(); if (state.equals(SiteState.ACTIVE)) { log.error("Unable to pause this site {}. It is active", siteId); throw APIException.badRequests.operationNotAllowedOnActiveSite(); } if (!state.equals(SiteState.STANDBY_SYNCED)) { log.error("Unable to pause this site {}. It is in state {}", siteId, state); throw APIException.badRequests.operationOnlyAllowedOnSyncedSite(site.getName(), state.toString()); } toBePausedSites.add(site); siteNameList.add(site.getName()); } // This String is only used to output human readable message to user when Exception is thrown String siteNameStr = StringUtils.join(siteNameList, ','); precheckForPause(siteNameStr); try { // the site(s) to be paused must be checked as well commonPrecheck(); } catch (APIException e) { throw e; } catch (Exception e) { throw APIException.internalServerErrors.pauseStandbyPrecheckFailed(siteNameStr, e.getMessage()); } InterProcessLock lock = drUtil.getDROperationLock(); // any error is not retry-able beyond this point. 
List<String> sitesString = new ArrayList<>();
        try {
            log.info("Pausing sites");
            long vdcTargetVersion = DrUtil.newVdcConfigVersion();
            coordinator.startTransaction();
            for (Site site : toBePausedSites) {
                site.setState(SiteState.STANDBY_PAUSING);
                site.setLastStateUpdateTime(System.currentTimeMillis());
                coordinator.persistServiceConfiguration(site.toConfiguration());
                drUtil.recordDrOperationStatus(site.getUuid(), InterState.PAUSING_STANDBY);
                sitesString.add(site.toBriefString());
                // notify the to-be-paused sites before others.
                drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_PAUSE_STANDBY, vdcTargetVersion);
            }
            log.info("Notify all sites for reconfig");
            for (Site site : drUtil.listSites()) {
                if (toBePausedSites.contains(site)) { // Site#equals only compares the site uuid
                    // already notified
                    continue;
                }
                drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_PAUSE_STANDBY, vdcTargetVersion);
            }
            coordinator.commitTransaction();
            auditDisasterRecoveryOps(OperationTypeEnum.PAUSE_STANDBY, AuditLogManager.AUDITLOG_SUCCESS,
                    AuditLogManager.AUDITOP_BEGIN, StringUtils.join(sitesString, ','));
            return Response.status(Response.Status.ACCEPTED).build();
        } catch (Exception e) {
            log.error("Failed to pause site {}", siteIdStr, e);
            coordinator.discardTransaction();
            auditDisasterRecoveryOps(OperationTypeEnum.PAUSE_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null,
                    StringUtils.join(sitesString, ','));
            throw APIException.internalServerErrors.pauseStandbyFailed(siteNameStr, e.getMessage());
        } finally {
            try {
                lock.release();
            } catch (Exception ignore) {
                log.error(String.format("Lock release failed when pausing standby site: %s", siteIdStr));
            }
        }
    }

    /**
     * Precheck for pausing standby sites: rejects the operation when the firewall property is set to
     * "no" or when ipsec reports a status other than "enabled".
     *
     * @param siteNames names of the sites to be paused, used only in the error message
     */
    private void precheckForPause(String siteNames) {
        PropertyInfo targetProperty = coordinator.getPropertyInfo();
        String firewallEnabled = targetProperty.getProperty(SYSTEM_ENABLE_FIREWALL);
        if ("no".equals(firewallEnabled)) {
            throw APIException.internalServerErrors.pauseStandbyPrecheckFailed(siteNames,
                    "firewall has been disabled. "
                            + "Please make sure to keep it enabled until every standby site has been resumed");
        }

        String ipsecEnabled = ipsecConfig.getIpsecStatus();
        // A null status is deliberately not treated as "disabled".
        if (ipsecEnabled != null && !ipsecEnabled.equals("enabled")) {
            throw APIException.internalServerErrors.pauseStandbyPrecheckFailed(siteNames,
                    "ipsec has been disabled. "
                            + "Please make sure to keep it enabled until every standby site has been resumed");
        }
    }

    /**
     * Resume data replication for a paused standby site
     *
     * @param uuid site UUID
     * @return updated standby site representation
     */
    @POST
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN,
            Role.RESTRICTED_SYSTEM_ADMIN }, blockProxies = true)
    @Path("/{uuid}/resume")
    public SiteRestRep resumeStandby(@PathParam("uuid") String uuid) {
        log.info("Begin to resume data sync to standby site identified by uuid: {}", uuid);
        Site standby = validateSiteConfig(uuid);
        SiteState state = standby.getState();
        if (!state.equals(SiteState.STANDBY_PAUSED) && !state.equals(SiteState.ACTIVE_DEGRADED)) {
            log.error("site {} is in state {}, should be STANDBY_PAUSED or ACTIVE_DEGRADED", uuid, standby.getState());
            throw APIException.badRequests.operationOnlyAllowedOnPausedSite(standby.getName(),
                    standby.getState().toString());
        }

        SiteNetworkState networkState = drUtil.getSiteNetworkState(uuid);
        if (networkState.getNetworkHealth() == NetworkHealth.BROKEN) {
            throw APIException.internalServerErrors.siteConnectionBroken(standby.getName(),
                    "Network health state is broken.");
        }

        // Local precheck plus the remote precheck on the standby being resumed.
        try (InternalSiteServiceClient client = createInternalSiteServiceClient(standby)) {
            commonPrecheck();
            client.setCoordinatorClient(coordinator);
            client.setKeyGenerator(apiSignatureGenerator);
            client.resumePrecheck();
        } catch (APIException e) {
            throw e;
        } catch (Exception e) {
            throw APIException.internalServerErrors.resumeStandbyPrecheckFailed(standby.getName(), e.getMessage());
        }

        // Do this before tx get started which might write key to zk.
        SecretKey secretKey = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API);
        long vdcTargetVersion = DrUtil.newVdcConfigVersion();

        // Nothing may run between obtaining the lock and entering the try whose finally releases it.
        // NOTE(review): presumably getDROperationLock() returns the lock already acquired - confirm.
        InterProcessLock lock = drUtil.getDROperationLock();
        try {
            coordinator.startTransaction();
            for (Site site : drUtil.listStandbySites()) {
                if (site.getUuid().equals(uuid)) {
                    log.info("Re-init the target standby {}", uuid);

                    // init the to-be resumed standby site
                    long dataRevision = vdcTargetVersion;
                    List<Site> standbySites = drUtil.listStandbySites();
                    SiteConfigParam configParam = prepareSiteConfigParam(standbySites, ipsecConfig.getPreSharedKey(),
                            uuid, dataRevision, vdcTargetVersion, secretKey);
                    try (InternalSiteServiceClient internalSiteServiceClient = new InternalSiteServiceClient()) {
                        internalSiteServiceClient.setCoordinatorClient(coordinator);
                        internalSiteServiceClient.setServer(site.getVipEndPoint());
                        internalSiteServiceClient.initStandby(configParam);
                    }

                    site.setState(SiteState.STANDBY_RESUMING);
                    coordinator.persistServiceConfiguration(site.toConfiguration());
                    drUtil.recordDrOperationStatus(site.getUuid(), InterState.RESUMING_STANDBY);
                    drUtil.updateVdcTargetVersion(uuid, SiteInfo.DR_OP_CHANGE_DATA_REVISION, vdcTargetVersion,
                            dataRevision);
                } else {
                    drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_RESUME_STANDBY, vdcTargetVersion);
                }
            }

            // update the local(active) site last
            drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_RESUME_STANDBY, vdcTargetVersion);
            coordinator.commitTransaction();

            auditDisasterRecoveryOps(OperationTypeEnum.RESUME_STANDBY, AuditLogManager.AUDITLOG_SUCCESS,
                    AuditLogManager.AUDITOP_BEGIN, standby.toBriefString());
            return siteMapper.map(standby);
        } catch (Exception e) {
            log.error("Error resuming site {}", uuid, e);
            coordinator.discardTransaction();
            auditDisasterRecoveryOps(OperationTypeEnum.RESUME_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null,
                    standby.toBriefString());
            throw APIException.internalServerErrors.resumeStandbyFailed(standby.getName(), e.getMessage());
        } finally {
            try {
                lock.release();
            } catch (Exception ignore) {
                log.error(String.format("Lock release failed when resuming standby site: %s", uuid));
            }
        }
    }

    /**
     * This is internal API to do precheck for resume
     *
     * @return an empty response when the precheck passes; otherwise the error message (and, for
     *         APIException, the service code) of the failure
     */
    @POST
    @Path("/internal/resumeprecheck")
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    public SiteErrorResponse resumePrecheck() {
        log.info("Precheck for resume internally");
        SiteErrorResponse response = new SiteErrorResponse();
        try {
            precheckForResumeLocalStandby();
        } catch (APIException e) {
            log.warn("Failed to precheck resume", e);
            response.setErrorMessage(e.getMessage());
            response.setServiceCode(e.getServiceCode().ordinal());
        } catch (Exception e) {
            log.error("Failed to precheck resume", e);
            response.setErrorMessage(e.getMessage());
        }
        return response;
    }

    /**
     * Fails when the number of configured sites has reached the configured maximum
     * (DrUtil.KEY_MAX_NUMBER_OF_DR_SITES, defaulting to SITE_NUMBER_UPPER_LIMIT).
     */
    public void precheckForSiteNumber() {
        int upperLimit = drUtil.getDrIntConfig(DrUtil.KEY_MAX_NUMBER_OF_DR_SITES, SITE_NUMBER_UPPER_LIMIT);
        int siteNum = drUtil.listSites().size();
        if (siteNum >= upperLimit) {
            throw APIException.internalServerErrors.addStandbyPrecheckFailed(String.format(
                    "The maximum number of DR sites(%d) has been reached. Currently %d sites are configured",
                    upperLimit, siteNum));
        }
    }

    /**
     * Precheck executed on the standby being resumed: the local cluster must be stable and the local
     * site must be in STANDBY_PAUSED or ACTIVE_DEGRADED state.
     */
    private void precheckForResumeLocalStandby() {
        Site localSite = drUtil.getLocalSite();
        if (!isClusterStable()) {
            throw APIException.serviceUnavailable.siteClusterStateNotStable(localSite.getName(),
                    Objects.toString(coordinator.getControlNodesState()));
        }

        SiteState localState = localSite.getState();
        if (SiteState.STANDBY_PAUSED != localState && SiteState.ACTIVE_DEGRADED != localState) {
            throw APIException.internalServerErrors.resumeStandbyPrecheckFailed(localSite.getName(),
                    "Standby site is not in paused state");
        }
    }

    /**
     * Retry last operation when in STANDBY_ERROR. Only a failed pause, resume or failover can be
     * retried, and only if it is still the latest DR operation recorded for the local site.
     *
     * @param uuid site UUID
     * @return updated standby site representation
     */
    @POST
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN,
            Role.RESTRICTED_SYSTEM_ADMIN })
    @Path("/{uuid}/retry")
    public SiteRestRep retryOperation(@PathParam("uuid") String uuid) {
        log.info("Begin to retry last operation for site uuid {}", uuid);
        Site standby;
        try {
            standby = drUtil.getSiteFromLocalVdc(uuid);
        } catch (CoordinatorException e) {
            log.error("Can't find site {} from ZK", uuid);
            throw APIException.badRequests.siteIdNotFound();
        }

        if (!standby.getState().equals(SiteState.STANDBY_ERROR)) {
            log.error("site {} is in state {}, should be STANDBY_ERROR", uuid, standby.getState());
            throw APIException.badRequests.operationOnlyAllowedOnErrorSite(standby.getName(),
                    standby.getState().toString());
        }
        if (!standby.getLastState().equals(SiteState.STANDBY_PAUSING)
                && !standby.getLastState().equals(SiteState.STANDBY_RESUMING)
                && !standby.getLastState().equals(SiteState.STANDBY_FAILING_OVER)) {
            log.error("site {} lastState was {}, retry is only supported for Pause, Resume and Failover", uuid,
                    standby.getLastState());
            throw APIException.badRequests.operationRetryOnlyAllowedOnLastState(standby.getName(),
                    standby.getLastState().toString());
        }

        // Reuse the current action required
        Site localSite = drUtil.getLocalSite();
        SiteInfo siteInfo = coordinator.getTargetInfo(localSite.getUuid(), SiteInfo.class);
        String drOperation = siteInfo.getActionRequired();

        // Check that last action matches retry action
        if (!drOperation.equals(standby.getLastState().getDRAction())) {
            log.error("Active site last operation was {}, retry is only supported if no other operations have been performed",
                    drOperation);
            throw APIException.internalServerErrors.retryStandbyPrecheckFailed(standby.getName(),
                    standby.getLastState().toString(),
                    String.format("Another DR operation %s has been run on Active site. Only the latest operation can be retried. "
                            + "This is an unrecoverable Error, please remove site and deploy a new one.", drOperation));
        }

        InterProcessLock lock = drUtil.getDROperationLock();
        try {
            coordinator.startTransaction();
            standby.setState(standby.getLastState());

            // Failover requires setting old active site to last state as well.
            if (standby.getState() == SiteState.STANDBY_FAILING_OVER) {
                for (Site site : drUtil.listSites()) {
                    if (site.getLastState() == SiteState.ACTIVE_FAILING_OVER) {
                        site.setState(SiteState.ACTIVE_FAILING_OVER);
                        coordinator.persistServiceConfiguration(site.toConfiguration());
                    }
                }
            }
            coordinator.persistServiceConfiguration(standby.toConfiguration());

            log.info("Notify all sites for reconfig");
            long vdcTargetVersion = DrUtil.newVdcConfigVersion();
            for (Site site : drUtil.listSites()) {
                String siteUuid = site.getUuid();
                if (site.getLastState() == SiteState.STANDBY_RESUMING) {
                    // A site that was being resumed re-runs its own recorded action when that action
                    // was a data revision change, instead of the active site's operation.
                    SiteInfo siteTargetInfo = coordinator.getTargetInfo(siteUuid, SiteInfo.class);
                    String resumeSiteOperation = siteTargetInfo.getActionRequired();
                    if (resumeSiteOperation.equals(SiteInfo.DR_OP_CHANGE_DATA_REVISION)) {
                        long dataRevision = vdcTargetVersion;
                        drUtil.updateVdcTargetVersion(siteUuid, resumeSiteOperation, vdcTargetVersion, dataRevision);
                        continue;
                    }
                }
                log.info("Set dr operation {} on site {}", drOperation, siteUuid);
                drUtil.updateVdcTargetVersion(siteUuid, drOperation, vdcTargetVersion);
            }
            coordinator.commitTransaction();
            return siteMapper.map(standby);
        } catch (Exception e) {
            log.error("Error retrying site operation for site {}", uuid, e);
            coordinator.discardTransaction();
            auditDisasterRecoveryOps(OperationTypeEnum.RETRY_STANDBY_OP, AuditLogManager.AUDITLOG_FAILURE, null,
                    standby.toBriefString());
            throw APIException.internalServerErrors.retryStandbyOpFailed(standby.getName(), e.getMessage());
        } finally {
            try {
                lock.release();
            } catch (Exception ignore) {
                log.error(String.format("Lock release failed when retrying standby site last op: %s", uuid));
            }
        }
    }

    /**
     * Query the latest error message for specific standby site
     *
     * @param uuid site UUID
     * @return the recorded site error when the site is in STANDBY_ERROR state, otherwise a
     *         "no error" response
     */
    @GET
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN,
            Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR })
    @Path("/{uuid}/error")
    public SiteErrorResponse getSiteError(@PathParam("uuid") String uuid) {
        log.info("Begin to get site error by uuid {}", uuid);
        try {
            Site standby = drUtil.getSiteFromLocalVdc(uuid);
            if (standby.getState().equals(SiteState.STANDBY_ERROR)) {
                return coordinator.getTargetInfo(uuid, SiteError.class).toResponse();
            }
        } catch (CoordinatorException e) {
            log.error("Can't find site {} from ZK", uuid);
            throw APIException.badRequests.siteIdNotFound();
        } catch (Exception e) {
            // Best effort: any other failure is logged and reported to the caller as "no error".
            log.error("Can't find site from ZK for UUID {}", uuid, e);
        }
        return SiteErrorResponse.noError();
    }

    /**
     * This API will do switchover to target new active site according passed in site UUID. After switchover, old active site will
     * work as normal standby site and target site will be promoted to active. All site will update properties to trigger reconfig.
     *
     * @param uuid target new active site UUID
     * @return return accepted response if operation is successful
     */
    @POST
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @Path("/{uuid}/switchover")
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true)
    public Response doSwitchover(@PathParam("uuid") String uuid) {
        log.info("Begin to switchover for standby UUID {}", uuid);
        precheckForSwitchoverForActiveSite(uuid);

        // Paused standbys (other than the target) run their own precheck through the internal API.
        List<Site> allStandbySites = drUtil.listStandbySites();
        for (Site site : allStandbySites) {
            if (!site.getUuid().equals(uuid) && site.getState() == SiteState.STANDBY_PAUSED) {
                try (InternalSiteServiceClient client = new InternalSiteServiceClient(site)) {
                    client.setCoordinatorClient(coordinator);
                    client.setKeyGenerator(apiSignatureGenerator);
                    client.switchoverPrecheck();
                }
            }
        }

        String oldActiveUUID = drUtil.getActiveSite().getUuid();
        InterProcessLock lock = drUtil.getDROperationLock();
        Site newActiveSite = null;
        Site oldActiveSite = null;
        try {
            newActiveSite = drUtil.getSiteFromLocalVdc(uuid);

            // Set old active site's state, short id and key
            oldActiveSite = drUtil.getSiteFromLocalVdc(oldActiveUUID);
            if (StringUtils.isEmpty(oldActiveSite.getSiteShortId())) {
                oldActiveSite.setSiteShortId(newActiveSite.getVdcShortId());
            }

            coordinator.startTransaction();
            oldActiveSite.setState(SiteState.ACTIVE_SWITCHING_OVER);
            coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration());

            // this barrier is set when begin switchover and will be removed by new active site.
            // Old active site will wait and reboot after barrier is removed
            DistributedBarrier restartBarrier = coordinator.getDistributedBarrier(String.format("%s/%s/%s",
                    ZkPath.SITES, oldActiveSite.getUuid(), Constants.SWITCHOVER_BARRIER_RESTART));
            restartBarrier.setBarrier();

            drUtil.recordDrOperationStatus(oldActiveSite.getUuid(), InterState.SWITCHINGOVER_ACTIVE);

            // trigger reconfig
            long vdcConfigVersion = DrUtil.newVdcConfigVersion(); // a version for all sites.
            for (Site eachSite : drUtil.listSites()) {
                if (!eachSite.getUuid().equals(uuid) && eachSite.getState() == SiteState.STANDBY_PAUSED) {
                    try (InternalSiteServiceClient client = new InternalSiteServiceClient(eachSite)) {
                        client.setCoordinatorClient(coordinator);
                        client.setKeyGenerator(apiSignatureGenerator);
                        client.switchover(newActiveSite.getUuid(), vdcConfigVersion);
                    }
                } else {
                    drUtil.updateVdcTargetVersion(eachSite.getUuid(), SiteInfo.DR_OP_SWITCHOVER, vdcConfigVersion,
                            oldActiveSite.getUuid(), newActiveSite.getUuid());
                }
            }
            coordinator.commitTransaction();

            auditDisasterRecoveryOps(OperationTypeEnum.SWITCHOVER, AuditLogManager.AUDITLOG_SUCCESS,
                    AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), newActiveSite.toBriefString());
            return Response.status(Response.Status.ACCEPTED).build();
        } catch (Exception e) {
            log.error(String.format("Error happened when switchover from site %s to site %s", oldActiveUUID, uuid), e);
            coordinator.discardTransaction();

            // Either site lookup may have thrown before its variable was assigned; fall back to the
            // uuids so the original failure is reported instead of a NullPointerException.
            String newActiveName = (newActiveSite != null) ? newActiveSite.getName() : uuid;
            String newActiveVip = (newActiveSite != null) ? newActiveSite.getVipEndPoint() : "";
            String oldActiveName = (oldActiveSite != null) ? oldActiveSite.getName() : oldActiveUUID;

            auditDisasterRecoveryOps(OperationTypeEnum.SWITCHOVER, AuditLogManager.AUDITLOG_FAILURE, null,
                    newActiveName, newActiveVip);
            throw APIException.internalServerErrors.switchoverFailed(oldActiveName, newActiveName, e.getMessage());
        } finally {
            try {
                lock.release();
            } catch (Exception ignore) {
                log.error(String.format("Lock release failed when switchover from %s to %s", oldActiveUUID, uuid));
            }
        }
    }

    /**
     * This is internal API to do precheck for switchover
     *
     * @return return response with error message and service code
     */
@POST @Path("/internal/switchoverprecheck") @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public SiteErrorResponse switchoverPrecheck() { log.info("Precheck for switchover internally"); SiteErrorResponse response = new SiteErrorResponse(); try { precheckForSwitchoverForLocalStandby(); } catch (InternalServerErrorException e) { log.warn("Failed to precheck switchover", e); response.setErrorMessage(e.getMessage()); response.setServiceCode(e.getServiceCode().ordinal()); return response; } catch (Exception e) { log.error("Failed to precheck switchover", e); response.setErrorMessage(e.getMessage()); return response; } return response; } /** * This is internal API to do switchover * * @return return response with error message and service code */ @POST @Path("/internal/switchover") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public Response switchover(@QueryParam("newActiveSiteUUid") String newActiveSiteUUID, @QueryParam("vdcVersion") String vdcTargetVersion) { log.info("Begin to switchover internally for standby UUID {}", newActiveSiteUUID); Site newActiveSite = null; Site oldActiveSite = null; try { newActiveSite = drUtil.getSiteFromLocalVdc(newActiveSiteUUID); oldActiveSite = drUtil.getSiteFromLocalVdc(drUtil.getActiveSite().getUuid()); if (StringUtils.isEmpty(oldActiveSite.getSiteShortId())) { oldActiveSite.setSiteShortId(newActiveSite.getVdcShortId()); } oldActiveSite.setState(SiteState.STANDBY_SYNCED); coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration()); newActiveSite.setState(SiteState.ACTIVE); coordinator.persistServiceConfiguration(newActiveSite.toConfiguration()); drUtil.updateVdcTargetVersion(drUtil.getLocalSite().getUuid(), SiteInfo.DR_OP_SWITCHOVER, Long.parseLong(vdcTargetVersion), oldActiveSite.getUuid(), newActiveSite.getUuid()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { 
log.error(String.format("Error happened when switchover to site %s", newActiveSiteUUID), e); throw APIException.internalServerErrors.switchoverFailed(oldActiveSite.getName(), newActiveSite.getName(), e.getMessage()); } } /** * This API will do failover from standby site. This operation is only allowed when active site is down. * After failover, this standby site will be promoted to active site. * * @param uuid target new active site UUID * @return return accepted response if operation is successful */ @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/{uuid}/failover") @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public Response doFailover(@PathParam("uuid") String uuid) { log.info("Begin to failover for standby UUID {}", uuid); Site currentSite = drUtil.getSiteFromLocalVdc(uuid); precheckForFailoverLocally(uuid); List<Site> allStandbySites = drUtil.listStandbySites(); try { coordinator.startTransaction(); // set state String activeSiteId = drUtil.getActiveSite().getUuid(); Site oldActiveSite = new Site(); if (StringUtils.isEmpty(activeSiteId)) { log.info("Cant't find active site id, go on to do failover"); } else { oldActiveSite = drUtil.getSiteFromLocalVdc(activeSiteId); oldActiveSite.setState(SiteState.ACTIVE_FAILING_OVER); coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration()); } currentSite.setState(SiteState.STANDBY_FAILING_OVER); coordinator.persistServiceConfiguration(currentSite.toConfiguration()); drUtil.recordDrOperationStatus(currentSite.getUuid(), InterState.FAILINGOVER_STANDBY); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); //reconfig other standby sites for (Site site : allStandbySites) { if (!site.getUuid().equals(uuid)) { if (site.getState() == SiteState.STANDBY_SYNCED) { site.setState(SiteState.STANDBY_PAUSED); coordinator.persistServiceConfiguration(site.toConfiguration()); } else if (site.getState() == SiteState.STANDBY_REMOVING) { 
site.setState(SiteState.STANDBY_ERROR); coordinator.persistServiceConfiguration(site.toConfiguration()); } // update the vdc config version on the new active site. drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_FAILOVER, vdcTargetVersion, oldActiveSite.getUuid(), currentSite.getUuid()); } } drUtil.updateVdcTargetVersion(uuid, SiteInfo.DR_OP_FAILOVER, vdcTargetVersion, oldActiveSite.getUuid(), currentSite.getUuid()); coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), currentSite.toBriefString()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Error happened when failover at site {}", uuid, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_FAILURE, null, currentSite.getName(), currentSite.getVipEndPoint()); throw APIException.internalServerErrors.failoverFailed(currentSite.getName(), e.getMessage()); } } /** * This is internal API to do precheck for failover * * @return return response with error message and service code */ @POST @Path("/internal/failoverprecheck") @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public FailoverPrecheckResponse failoverPrecheck() { log.info("Precheck for failover internally"); FailoverPrecheckResponse response = new FailoverPrecheckResponse(); response.setSite(this.siteMapper.map(drUtil.getLocalSite())); try { precheckForFailover(); } catch (InternalServerErrorException e) { log.warn("Failed to precheck failover", e); response.setErrorMessage(e.getMessage()); response.setServiceCode(e.getServiceCode().ordinal()); return response; } catch (Exception e) { log.error("Failed to precheck failover", e); response.setErrorMessage(e.getMessage()); return response; } return response; } /** * This is internal API to do failover * * @return return response with error 
message and service code */ @POST @Path("/internal/failover") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public Response failover(@QueryParam("newActiveSiteUUid") String newActiveSiteUUID, @QueryParam("oldActiveSiteUUid") String oldActiveSiteUUID, @QueryParam("vdcVersion") String vdcTargetVersion) { log.info("Begin to failover internally with newActiveSiteUUid {}, oldActiveSiteUUid {}", newActiveSiteUUID, oldActiveSiteUUID); Site currentSite = drUtil.getLocalSite(); String uuid = currentSite.getUuid(); try { // set state Site oldActiveSite = new Site(); if (StringUtils.isEmpty(oldActiveSiteUUID)) { log.info("Cant't find active site id, go on to do failover"); } else { oldActiveSite = drUtil.getSiteFromLocalVdc(oldActiveSiteUUID); drUtil.removeSite(oldActiveSite); } Site newActiveSite = drUtil.getSiteFromLocalVdc(newActiveSiteUUID); newActiveSite.setState(SiteState.STANDBY_FAILING_OVER); coordinator.persistServiceConfiguration(newActiveSite.toConfiguration()); drUtil.updateVdcTargetVersion(currentSite.getUuid(), SiteInfo.DR_OP_FAILOVER, Long.parseLong(vdcTargetVersion), oldActiveSite.getUuid(), currentSite.getUuid()); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), newActiveSite.toBriefString()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Error happened when failover at site %s", uuid, e); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_FAILURE, null, uuid, currentSite.getVipEndPoint(), currentSite.getName()); throw APIException.internalServerErrors.failoverFailed(currentSite.getName(), e.getMessage()); } } /** * Update site information. Only name and description can be updated. 
* * @param uuid target site uuid * @param siteParam site information * @return */ @PUT @Path("/{uuid}") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public Response updateSite(@PathParam("uuid") String uuid, SiteUpdateParam siteParam) { log.info("Begin to update site information for {}", uuid); Site site = null; try { site = drUtil.getSiteFromLocalVdc(uuid); } catch (RetryableCoordinatorException e) { log.error("Can't find site with specified site UUID {}", uuid); throw APIException.badRequests.siteIdNotFound(); } if (!validSiteName(siteParam.getName())) { throw APIException.internalServerErrors.updateSiteFailed(site.getName(), String.format("Site name should not be empty or longer than %d characters.", SITE_NAME_LENGTH_LIMIT)); } for (Site eachSite : drUtil.listSites()) { if (eachSite.getUuid().equals(uuid)) { continue; } if (eachSite.getName().equals(siteParam.getName())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Duplicate site name"); } } try { site.setName(siteParam.getName()); site.setDescription(siteParam.getDescription()); coordinator.persistServiceConfiguration(site.toConfiguration()); auditDisasterRecoveryOps(OperationTypeEnum.UPDATE_SITE, AuditLogManager.AUDITLOG_SUCCESS, null, site.getName(), site.getVipEndPoint(), site.getUuid()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Error happened when update site %s", uuid, e); auditDisasterRecoveryOps(OperationTypeEnum.UPDATE_SITE, AuditLogManager.AUDITLOG_FAILURE, null, site.getName(), site.getVipEndPoint(), site.getUuid()); throw APIException.internalServerErrors.updateSiteFailed(site.getName(), e.getMessage()); } } private boolean validSiteName(String siteName) { if (!StringUtils.isBlank(siteName) && siteName.length() <= 
SITE_NAME_LENGTH_LIMIT) { return true; } return false; } private boolean isDataSynced(Site site) { if (site.getState().equals(SiteState.ACTIVE)) { return true; } if (site.getState().equals(SiteState.STANDBY_SYNCED)) { SiteMonitorResult monitorResult = coordinator.getTargetInfo(site.getUuid(), SiteMonitorResult.class); if (monitorResult != null && monitorResult.getDbQuorumLostSince() > 0) { return false; } return true; } return false; } /** * Query the details, such as transition timings, for specific standby site * * @param uuid site UUID * @return SiteActionsTime with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/{uuid}/details") public SiteDetailRestRep getSiteDetails(@PathParam("uuid") String uuid) { log.info("Begin to get site paused time by uuid {}", uuid); SiteDetailRestRep standbyDetails = new SiteDetailRestRep(); try { Site standby = drUtil.getSiteFromLocalVdc(uuid); standbyDetails.setCreationTime(new Date(standby.getCreationTime())); Double latency = drUtil.getSiteNetworkState(uuid).getNetworkLatencyInMs(); standbyDetails.setNetworkLatencyInMs(latency); Date lastSyncTime = drUtil.getLastSyncTime(standby); if (lastSyncTime != null) { standbyDetails.setLastSyncTime(lastSyncTime); } standbyDetails.setDataSynced(isDataSynced(standby)); ClusterInfo.ClusterState clusterState = coordinator.getControlNodesState(standby.getUuid()); if(clusterState != null) { standbyDetails.setClusterState(clusterState.toString()); } else { standbyDetails.setClusterState(ClusterInfo.ClusterState.UNKNOWN.toString()); } standbyDetails.setSiteState(standby.getState().toString()); } catch (CoordinatorException e) { log.error("Can't find site {} from ZK", uuid); throw APIException.badRequests.siteIdNotFound(); } catch (Exception e) { log.error("Find find site from ZK for UUID {} : 
{}" + uuid, e); } return standbyDetails; } @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/internal/list") public SiteList getSitesInternally() { return this.getSites(); } /** * Common precheck logic for DR operations. * * @param excludedSiteIds, site ids to exclude from the cluster state precheck */ private void commonPrecheck(List<String> excludedSiteIds) { if (drUtil.isStandby()) { throw APIException.badRequests.operationOnlyAllowedOnActiveSite(); } if (!isClusterStable()) { throw APIException.serviceUnavailable.clusterStateNotStable(); } for (Site site : drUtil.listStandbySites()) { if (excludedSiteIds.contains(site.getUuid())) { continue; } // don't check node state for paused sites. if (site.getState().equals(SiteState.STANDBY_PAUSED) || site.getState().equals(SiteState.ACTIVE_DEGRADED)) { continue; } int nodeCount = site.getNodeCount(); ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid()); // state could be null if (!ClusterInfo.ClusterState.STABLE.equals(state)) { log.error("Site {} is not stable {}", site.getUuid(), Objects.toString(state)); throw APIException.serviceUnavailable.siteClusterStateNotStable(site.getName(), Objects.toString(state)); } } } /** * Wrapper for commonPrecheck that enforce precheck on all sites * */ private void commonPrecheck() { commonPrecheck(new ArrayList<String>()); } private Site validateSiteConfig(String uuid) { if (!isClusterStable()) { log.error("Cluster is unstable"); throw APIException.serviceUnavailable.clusterStateNotStable(); } try { return drUtil.getSiteFromLocalVdc(uuid); } catch (CoordinatorException e) { log.error("Can't find site {} from ZK", uuid); throw APIException.badRequests.siteIdNotFound(); } } private void precheckForGeo() { Map<String, List<Site>> vdcSiteMap = drUtil.getVdcSiteMap(); int numOfVdcs = vdcSiteMap.keySet().size(); if (numOfVdcs > 1) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Not allowed to add standby site 
in multivdc configuration"); } } /* * Internal method to check whether standby can be attached to current active site */ protected void precheckForStandbyAdd(SiteConfigRestRep standby, ViPRCoreClient viprCoreClient) { if (!isClusterStable()) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Current site is not stable"); } if (!standby.isClusterStable()) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Remote site is not stable"); } // standby should be refresh install if (!standby.isFreshInstallation() && !SiteState.ACTIVE_DEGRADED.toString().equalsIgnoreCase(standby.getState())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Standby is not a fresh installation"); } // DB schema version should be same String currentDbSchemaVersion = coordinator.getCurrentDbSchemaVersion(); String standbyDbSchemaVersion = standby.getDbSchemaVersion(); if (!currentDbSchemaVersion.equalsIgnoreCase(standbyDbSchemaVersion)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed(String.format( "Standby db schema version %s is not same as active site %s", standbyDbSchemaVersion, currentDbSchemaVersion)); } // this site should not be standby site String activeId = drUtil.getActiveSite().getUuid(); if (activeId != null && !activeId.equals(coordinator.getSiteId())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("This site is also a standby site"); } checkSupportedIPForAttachStandby(standby); checkNATForAttachStandby(viprCoreClient); } private void checkNATForAttachStandby(ViPRCoreClient viprCoreClient) { DualInetAddress inetAddress = coordinator.getInetAddessLookupMap().getDualInetAddress(); String ipv4 = inetAddress.getInet4(); String ipv6 = inetAddress.getInet6(); log.info("Got local node's IP addresses, IPv4 = {}, IPv6 = {}", ipv4, ipv6); DRNatCheckParam checkParam = new DRNatCheckParam(); checkParam.setIPv4Address(ipv4); checkParam.setIPv6Address(ipv6); DRNatCheckResponse resp = 
viprCoreClient.site().checkIfBehindNat(checkParam); if (resp.isBehindNAT()) { throw APIException.internalServerErrors.addStandbyPrecheckFailed(String .format("The remote site sees this node's IP as %s, which is different from the local addresses: %s or %s, it may be behind a NAT.", resp.getSeenIp(), ipv4, ipv6)); } } protected void checkSupportedIPForAttachStandby(SiteConfigRestRep standby) { Site site = drUtil.getLocalSite(); // active has IPv4 and standby has no IPv4 if (!isHostIPAddressMapEmpty(site.getHostIPv4AddressMap()) && isHostIPAddressMapEmpty(standby.getHostIPv4AddressMap())) { throw APIException.internalServerErrors .addStandbyPrecheckFailed("Unsupported network configuration. Active site has IPv4. Standby site should be IPv4 or dual stack "); } // active has only IPv6 and standby has IPv4 if (isHostIPAddressMapEmpty(site.getHostIPv4AddressMap()) && !isHostIPAddressMapEmpty(standby.getHostIPv4AddressMap())) { throw APIException.internalServerErrors .addStandbyPrecheckFailed("Unsupported network configuration. 
Active site only has IPv6, Standby site should not has IPv4 address"); } } private boolean isHostIPAddressMapEmpty(Map<String, String> map) { if (map == null) { return true; } for (String ip : map.values()) { if (!PropertyConstants.IPV4_ADDR_DEFAULT.equals(ip) && !PropertyConstants.IPV6_ADDR_DEFAULT.equals(ip)) { return false; } } return true; } protected void precheckStandbyVersion(SiteAddParam standby) { ViPRSystemClient viprSystemClient = createViPRSystemClient(standby.getVip(), standby.getUsername(), standby.getPassword()); // software version should be matched SoftwareVersion currentSoftwareVersion; SoftwareVersion standbySoftwareVersion; try { currentSoftwareVersion = coordinator.getTargetInfo(RepositoryInfo.class).getCurrentVersion(); standbySoftwareVersion = new SoftwareVersion(viprSystemClient.upgrade().getTargetVersion().getTargetVersion()); } catch (Exception e) { throw APIException.internalServerErrors.addStandbyPrecheckFailed(String.format("Fail to get software version %s", e.getMessage())); } // enforcing a strict match between active/standby software versions // otherwise the standby site will automatically upgrade/downgrade to the same version with the active site if (!currentSoftwareVersion.equals(standbySoftwareVersion)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed(String.format( "Standby site version %s does not equal to current version %s", standbySoftwareVersion, currentSoftwareVersion)); } } /* * Internal method to check whether failover from active to standby is allowed */ protected void precheckForSwitchover(String standbyUuid) { Site standby = null; if (drUtil.isStandby()) { throw APIException.badRequests.operationOnlyAllowedOnActiveSite(); } try { standby = drUtil.getSiteFromLocalVdc(standbyUuid); } catch (CoordinatorException e) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getUuid(), "Standby uuid is not valid, can't find it"); } if (standbyUuid.equals(drUtil.getActiveSite().getUuid())) 
{ throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Can't switchover to an active site"); } if (!drUtil.isSiteUp(standbyUuid)) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not up"); } if (standby.getState() != SiteState.STANDBY_SYNCED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not fully synced"); } List<Site> existingSites = drUtil.listSites(); for (Site site : existingSites) { ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid()); if (state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", site.getUuid(), state); throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), String.format("Site %s is not stable", site.getName())); } } } /* * Internal method to check whether failover to standby is allowed */ private void precheckForFailoverLocally(String standbyUuid) { Site standby = drUtil.getLocalSite(); // API should be only send to local site if (!standby.getUuid().equals(standbyUuid)) { throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), String.format("Failover can only be executed in local site. 
Local site uuid %s is not matched with uuid %s", standby.getUuid(), standbyUuid)); } String uuid = drUtil.getActiveSite().getUuid(); if (!StringUtils.isEmpty(uuid)) { SiteNetworkState networkState = drUtil.getSiteNetworkState(uuid); if (networkState.getNetworkHealth() != NetworkHealth.BROKEN) { throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), "Active site is still available"); } } // should be PAUSED, either marked by itself or user // Don't allow failover to site of ACTIVE_DEGRADED state in X-wing if (standby.getState() != SiteState.STANDBY_PAUSED) { throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), "Please wait for this site to recognize the Active site is down and automatically switch to a Paused state before failing over."); } precheckForFailover(); } void precheckForFailover() { Site standby = drUtil.getLocalSite(); String standbyUuid = standby.getUuid(); String standbyName = standby.getName(); // show be only standby if (drUtil.isActiveSite()) { throw APIException.badRequests.operationNotAllowedOnActiveSite(); } // all syssvc should be up if (!drUtil.isAllSyssvcUp(standbyUuid)) { log.info("Not all syssvc is running at site {}", standby.getName()); throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), String.format("Site %s is not stable, one or more syssvc is not running", standby.getName())); } // Make sure that the local ZK has been reconfigured to participant // This DOES NOT implies that the active site is unreachable, notably when the local site is manually paused String coordinatorMode = drUtil.getLocalCoordinatorMode(); log.info("Local coordinator mode is {}", coordinatorMode); if (coordinatorMode == null || !drUtil.isParticipantNode(coordinatorMode)) { log.info("Active site is available now, can't do failover"); throw APIException.internalServerErrors.failoverPrecheckFailed(standbyName, "Active site is available now, can't do failover"); } } protected 
SiteRestRep findRecommendFailoverSite(List<SiteRestRep> responseSiteFromRemote, Site currentSite) { if (currentSite.getState().equals(SiteState.STANDBY_SYNCED)) { return this.siteMapper.map(currentSite); } for (SiteRestRep site : responseSiteFromRemote) { if (site != null && SiteState.STANDBY_SYNCED.toString().equalsIgnoreCase(site.getState())) { return site; } } return this.siteMapper.map(currentSite); } protected void validateAddParam(SiteAddParam param, List<Site> existingSites) { String siteName = param.getName(); if (!validSiteName(siteName)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed(String.format( "Site name should not be empty or longer than %d characters.", SITE_NAME_LENGTH_LIMIT)); } String siteVip = param.getVip(); InetAddress address = null; try { address = InetAddress.getByName(siteVip); } catch (UnknownHostException e) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Could not resolve target standby site virtual IP. Please check name service."); } if (address.getHostAddress().contains(":")) { param.setVip(DualInetAddress.normalizeInet6Address(address.getHostAddress())); } else { param.setVip(address.getHostAddress()); } log.info("Target standby site ip is {}", param.getVip()); for (Site site : existingSites) { if (site.getName().equals(siteName)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Duplicate site name"); } // COP-18954 Skip stability check for paused sites if (site.getState().equals(SiteState.STANDBY_PAUSED)) { continue; } ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid()); if (state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", site.getUuid(), state); throw APIException.internalServerErrors.addStandbyPrecheckFailed(String.format("Currently site %s is not stable", site.getName())); } } } private String generateShortId(List<Site> existingSites) throws Exception { Set<String> existingShortIds = new HashSet<String>(); for 
(Site site : existingSites) { existingShortIds.add(site.getSiteShortId()); } for (int i = 1; i < MAX_NUM_OF_STANDBY; i++) { String id = String.format(SHORTID_FMT, i); if (!existingShortIds.contains(id)) { return id; } } throw new Exception("Failed to generate standby short id"); } protected boolean isClusterStable() { return coordinator.getControlNodesState() == ClusterInfo.ClusterState.STABLE; } protected boolean isFreshInstallation() { Configuration setupConfig = coordinator.queryConfiguration(InitialSetup.CONFIG_KIND, InitialSetup.CONFIG_ID); boolean freshInstall = (setupConfig == null) || !Boolean.parseBoolean(setupConfig.getConfig(InitialSetup.COMPLETE)); log.info("Fresh installation {}", freshInstall); boolean hasDataInDB = dbClient.hasUsefulData(); log.info("Has useful data in DB {}", hasDataInDB); return freshInstall && !hasDataInDB; } // encapsulate the create ViPRCoreClient operation for easy UT writing because need to mock ViPRCoreClient protected ViPRCoreClient createViPRCoreClient(String vip, String username, String password) { try { return new ViPRCoreClient(vip, true).withLogin(username, password); } catch (Exception e) { log.error(String.format("Fail to create vipr client, vip: %s, username: %s", vip, username), e); throw APIException.internalServerErrors.failToCreateViPRClient(); } } // encapsulate the create ViPRSystemClient operation for easy UT writing because need to mock ViPRSystemClient protected ViPRSystemClient createViPRSystemClient(String vip, String username, String password) { try { return new ViPRSystemClient(vip, true).withLogin(username, password); } catch (Exception e) { log.error(String.format("Fail to create vipr client, vip: %s, username: %s", vip, username), e); throw APIException.internalServerErrors.failToCreateViPRClient(); } } // encapsulate the create InternalSiteServiceClient operation for easy UT writing because need to mock InternalSiteServiceClient protected InternalSiteServiceClient createInternalSiteServiceClient(Site 
site) { return new InternalSiteServiceClient(site); } public void setApiSignatureGenerator(InternalApiSignatureKeyGenerator apiSignatureGenerator) { this.apiSignatureGenerator = apiSignatureGenerator; } public void setSiteMapper(SiteMapper siteMapper) { this.siteMapper = siteMapper; } public void setSysUtils(SysUtils sysUtils) { this.sysUtils = sysUtils; } public void setDbClient(DbClient dbClient) { this.dbClient = dbClient; } public void setCoordinator(CoordinatorClient coordinator) { this.coordinator = coordinator; } public void setDrUtil(DrUtil drUtil) { this.drUtil = drUtil; } public void setIpsecConfig(IPsecConfig ipsecConfig) { this.ipsecConfig = ipsecConfig; } private void startLeaderSelector() { LeaderSelector leaderSelector = coordinator.getLeaderSelector(coordinator.getSiteId(), Constants.FAILBACK_DETECT_LEADER, new FailbackLeaderSelectorListener()); leaderSelector.autoRequeue(); leaderSelector.start(); } protected void precheckForSwitchoverForActiveSite(String standbyUuid) { Site standby = null; if (drUtil.isStandby()) { throw APIException.badRequests.operationOnlyAllowedOnActiveSite(); } try { standby = drUtil.getSiteFromLocalVdc(standbyUuid); } catch (CoordinatorException e) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getUuid(), "Standby uuid is not valid, can't find it"); } if (standbyUuid.equals(drUtil.getActiveSite().getUuid())) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Can't switchover to an active site"); } if (standby.getState() != SiteState.STANDBY_SYNCED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not fully synced"); } if (!drUtil.isSiteUp(standbyUuid)) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not up"); } if (coordinator.getControlNodesState(standby.getUuid()) != ClusterInfo.ClusterState.STABLE) { throw 
APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not stable"); } if (!isClusterStable()) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Active site is not stable"); } checkSiteConnectivity(standby); List<Site> existingSites = drUtil.listStandbySites(); for (Site site : existingSites) { if (site.getState() != SiteState.STANDBY_SYNCED && site.getState() != SiteState.STANDBY_PAUSED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), String.format("Standby site %s is not synced or paused", site.getName())); } ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid()); if (site.getState() != SiteState.STANDBY_PAUSED && state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", site.getUuid(), state); throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), String.format("Site %s is not stable", site.getName())); } } } private void precheckForSwitchoverForLocalStandby() { if (!isClusterStable()) { throw APIException.serviceUnavailable.clusterStateNotStable(); } Site currentSite = drUtil.getLocalSite(); if (currentSite.getState() != SiteState.STANDBY_SYNCED && currentSite.getState() != SiteState.STANDBY_PAUSED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(currentSite.getName(), String.format("Standby site %s is not synced or paused", currentSite.getName())); } } private void checkSiteConnectivity(Site site) { SiteNetworkState networkState = drUtil.getSiteNetworkState(site.getUuid()); if (networkState.getNetworkHealth() == NetworkHealth.BROKEN) { throw APIException.internalServerErrors.siteConnectionBroken(site.getName(), "Network health state is broken."); } if (drUtil.testPing(site.getVip(), SITE_CONNECTION_TEST_PORT, SITE_CONNECT_TEST_TIMEOUT) == -1) { throw APIException.internalServerErrors.siteConnectionBroken(site.getName(), String.format("Can't connect to 
site by virtual IP: %s", site.getVip())); } } private class FailbackLeaderSelectorListener extends LeaderSelectorListenerImpl { private static final int FAILBACK_DETECT_INTERNVAL_SECONDS = 60; private ScheduledExecutorService service; @Override protected void startLeadership() throws Exception { log.info("This node is selected as failback detector"); service = Executors.newScheduledThreadPool(1); service.scheduleAtFixedRate(failbackDetectMonitor, 0, FAILBACK_DETECT_INTERNVAL_SECONDS, TimeUnit.SECONDS); } @Override protected void stopLeadership() { service.shutdown(); try { while (!service.awaitTermination(30, TimeUnit.SECONDS)) { log.info("Waiting scheduler thread pool to shutdown for another 30s"); } } catch (InterruptedException e) { log.error("Interrupted while waiting to shutdown scheduler thread pool.", e); Thread.currentThread().interrupt(); return; } } } private boolean isSiteContainedBy(String siteId, SiteList sites) { for (SiteRestRep site : sites.getSites()) { if (siteId.equals(site.getUuid())) { return true; } } log.info("Site {} is removed", siteId); return false; } private Runnable failbackDetectMonitor = new Runnable() { @Override public void run() { try { if (!needCheckFailback() || !isLocalSiteDiscarded()) { log.info("No need to check failback locally or there's no remote active site, return"); return; } if(!resetActiveSite()) { log.error("Failed to reset active site status info"); return; } degradeActiveSite(); } catch (Exception e) { log.error("Error occured during failback detect monitor", e); } } private void degradeActiveSite() throws Exception { try { log.info("Current active site {}", drUtil.getActiveSite().getUuid()); coordinator.startTransaction(); List<Site> standbySites = drUtil.listStandbySites(); for (Site standbySite : standbySites) { if (!drUtil.isLocalSite(standbySite)) { log.info("Set standby site {} from state {} to STANDBY_PAUSED", standbySite.getUuid(), standbySite.getState()); standbySite.setState(SiteState.STANDBY_PAUSED); 
coordinator.persistServiceConfiguration(standbySite.toConfiguration()); } } // At this moment this site is disconnected with others, so ok to have own vdc version. drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_FAILBACK_DEGRADE, DrUtil.newVdcConfigVersion()); coordinator.commitTransaction(); } catch (Exception e) { coordinator.discardTransaction(); throw e; } } private boolean needCheckFailback() { Site localSite = drUtil.getLocalSite(); if (localSite.getState().equals(SiteState.ACTIVE)) { log.info("Current site is active site, need to check failback"); return true; } if (localSite.getState().equals(SiteState.ACTIVE_DEGRADED)) { log.info("Site is already ACTIVE_FAILBACK_DEGRADED"); if (!coordinator.locateAllServices(localSite.getUuid(), "controllersvc", "1", null, null).isEmpty()) { log.info("there are some controller service alive, process to degrade"); return true; } if (!coordinator.locateAllServices(localSite.getUuid(), "sasvc", "1", null, null).isEmpty()) { log.info("there are some sa service alive, process to degrade"); return true; } if (!coordinator.locateAllServices(localSite.getUuid(), "vasasvc", "1", null, null).isEmpty()) { log.info("there are some vasa service alive, process to degrade"); return true; } } log.info("Current site is not active, and there is no alive controllersvc/sasvc/vasasvc, so no need to check failback"); return false; } /** * @return true when Local site is in ACTIVE_DEGRADED state or can't be found according returned result from other site */ private boolean isLocalSiteDiscarded() { String localSiteId = drUtil.getLocalSite().getUuid(); for (Site remoteSite : drUtil.listStandbySites()) { if (drUtil.isSiteUp(remoteSite.getUuid()) || remoteSite.getState() == SiteState.ACTIVE_DEGRADED) { log.info("Site {} is up or in ACTIVE_DEGRADED state, skip checking it", remoteSite.getUuid()); continue; } try (InternalSiteServiceClient client = new InternalSiteServiceClient(remoteSite, coordinator, apiSignatureGenerator)) { 
SiteList sites = client.getSiteList(); if (!isSiteContainedBy(localSiteId, sites) || isSiteDegraded(localSiteId, sites)) { log.info("Local site {} is in ACTIVE_DEGRADED state or removed according data returned from site {}", localSiteId, remoteSite.getUuid()); return true; } } catch (Exception e) { log.warn("Failed to check remote site information during failback detect", e); continue; } } return false; } /* * reset the new active site's status info in the local active_degraded site (old active site) */ private boolean resetActiveSite() { String localSiteId = drUtil.getLocalSite().getUuid(); for (Site remoteSite : drUtil.listStandbySites()) { if (drUtil.isSiteUp(remoteSite.getUuid()) || remoteSite.getState() == SiteState.ACTIVE_DEGRADED) { log.info("Site {} is up or in ACTIVE_DEGRADED state, skip checking it", remoteSite.getUuid()); continue; } try (InternalSiteServiceClient client = new InternalSiteServiceClient(remoteSite, coordinator, apiSignatureGenerator)) { SiteList sites = client.getSiteList(); String remoteSiteStatus =""; String localSiteStatus = SiteState.ACTIVE_DEGRADED.toString(); for (SiteRestRep site : sites.getSites()) { if (remoteSite.getUuid().equals(site.getUuid())) { remoteSiteStatus = site.getState(); } if (localSiteId.equals(site.getUuid())) { localSiteStatus = site.getState(); } } if (SiteState.ACTIVE_DEGRADED.toString().equals(localSiteStatus) && SiteState.ACTIVE.toString().equals(remoteSiteStatus)) { log.info("Local site {} is in ACTIVE_DEGRADED state according data returned from site {}", localSiteId, remoteSite.getUuid()); log.info("Remote site {} is in ACTIVE state according data returned from site {}", remoteSite.getUuid(), remoteSite.getUuid()); log.info("Setting active site status information in the local active degraded site"); Site newActiveSite = drUtil.getSiteFromLocalVdc(remoteSite.getUuid()); newActiveSite.setState(SiteState.ACTIVE); coordinator.persistServiceConfiguration(newActiveSite.toConfiguration()); // update local site to 
degraded to avoid 2 actives in the DR config Site localSite = drUtil.getLocalSite(); SiteState lastState = localSite.getState(); localSite.setState(SiteState.ACTIVE_DEGRADED); localSite.setLastState(lastState); coordinator.persistServiceConfiguration(localSite.toConfiguration()); return true; } } catch (Exception e) { log.warn("Failed to set active site information in the local active degraded site", e); continue; } } return false; } private boolean isSiteContainedBy(String siteId, SiteList sites) { for (SiteRestRep site : sites.getSites()) { if (siteId.equals(site.getUuid())) { return true; } } log.info("Site {} is removed", siteId); return false; } private boolean isSiteDegraded(String siteId, SiteList sites) { for (SiteRestRep site : sites.getSites()) { if (siteId.equals(site.getUuid()) && SiteState.ACTIVE_DEGRADED.toString().equals(site.getState())) { log.info("Site {} is ACTIVE_DEGRADED", siteId); return true; } } return false; } }; }