/* * Copyright (c) 2012-2014 EMC Corporation * All Rights Reserved */ package com.emc.storageos.systemservices.impl.upgrade; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.text.MessageFormat; import java.util.*; import javax.ws.rs.core.MediaType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.emc.storageos.coordinator.client.model.Site; import com.emc.storageos.coordinator.client.model.SiteState; import com.emc.storageos.coordinator.client.service.DistributedPersistentLock; import com.emc.storageos.management.jmx.recovery.DbManagerOps; import com.emc.storageos.coordinator.client.model.Constants; import com.emc.storageos.coordinator.client.model.RepositoryInfo; import com.emc.storageos.coordinator.client.model.SoftwareVersion; import com.emc.storageos.coordinator.client.model.DownloadingInfo; import static com.emc.storageos.coordinator.client.model.Constants.*; import com.emc.storageos.coordinator.common.Service; import com.emc.storageos.coordinator.client.service.NodeListener; import com.emc.storageos.svcs.errorhandling.resources.APIException; import com.emc.storageos.svcs.errorhandling.resources.ServiceCode; import com.emc.storageos.systemservices.exceptions.*; import com.emc.storageos.systemservices.impl.client.SysClientFactory; import com.emc.storageos.systemservices.impl.util.AbstractManager; import com.emc.vipr.model.sys.NodeProgress.DownloadStatus; public class UpgradeManager extends AbstractManager { private static final Logger log = LoggerFactory.getLogger(UpgradeManager.class); private static final String dbNoEncryptFlagFile = "/data/db/no_db_encryption"; // max number of tries of connecting remote repository private final static int MAX_REPO_RETRIES = 3; // time out interval (in millisecond) private final static int TIMEOUT_INTERVAL = 5 * 60 * 1000; // standby site upgrade retry interval if the active site is not STABLE or the current site is not SYNCED // we don't want to sleep for too long (default 10m) or too short (retry 3s) here private final static int STANDBY_UPGRADE_RETRY_INTERVAL = 60 * 1000; // 1m private RemoteRepository remoteRepository; // local and target info properties private RepositoryInfo localInfo; private RepositoryInfo targetInfo; private static boolean isValidRepo; private Service service; // current number of tries of connecting remote repository private int tryRepoCnt = 0; // timer expire time private long expireTime = 0; private volatile boolean backCompatPreYoda; //default to false public void setBackCompatPreYoda(boolean backCompatPreYoda) { this.backCompatPreYoda = backCompatPreYoda; } public LocalRepository getLocalRepository() { return localRepository; } public RemoteRepository getRemoteRepository() { return remoteRepository; } public void setService(Service service) { this.service = service; } @Override protected URI getWakeUpUrl() { return SysClientFactory.URI_WAKEUP_UPGRADE_MANAGER; } /** * Register repository info listener to monitor repository version changes */ private void addRepositoryInfoListener() { try { coordinator.getCoordinatorClient().addNodeListener(new RepositoryInfoListener()); } catch (Exception e) { log.error("Fail to add node listener for repository info target znode", e); throw APIException.internalServerErrors.addListenerFailed(); } log.info("Successfully added node listener for repository info target znode"); } /** * the listener class to listen the repository target node change. */ class RepositoryInfoListener implements NodeListener { public String getPath() { return String.format("/config/%s/%s", RepositoryInfo.CONFIG_KIND, RepositoryInfo.CONFIG_ID); } /** * called when user update the target version */ @Override public void nodeChanged() { log.info("Repository info changed. Waking up the upgrade manager..."); wakeup(); } /** * called when connection state changed. */ @Override public void connectionStateChanged(State state) { log.info("Repository info connection state changed to {}", state); if (state.equals(State.CONNECTED)) { log.info("Curator (re)connected. Waking up the upgrade manager..."); wakeup(); } } } @Override protected void innerRun() { // need to distinguish persistent locks acquired from UpgradeManager/VdcManager/PropertyManager // otherwise they might release locks acquired by others when they start final String svcId = String.format("%s,upgrade", coordinator.getMySvcId()); isValidRepo = localRepository.isValidRepository(); addRepositoryInfoListener(); while (doRun) { log.debug("Main loop: Start"); shortSleep = false; // Step1: check if we have the reboot lock boolean hasLock; try { hasLock = hasUpgradeLock(svcId); } catch (Exception e) { log.info("Step1: Failed to verify if the current node has the reboot lock ", e); retrySleep(); continue; } if (hasLock) { try { releaseUpgradeLock(svcId); log.info("Step1: Released reboot lock for node: {}", svcId); wakeupOtherNodes(); } catch (Exception e) { log.info("Step1: Failed to release the reboot lock and will retry: {}", e.getMessage()); retrySleep(); continue; } } // Step2: publish current state, and set target if empty try { initializeLocalAndTargetInfo(svcId); } catch (Exception e) { log.info("Step2b failed and will be retried: {}", e.getMessage()); retrySleep(); continue; } // Step3: syncing repository final SyncInfo syncinfo = getSyncInfoCommon(localInfo, targetInfo); if (!syncinfo.isEmpty()) { // Step3: nodeInSync discovery String controlNodeInSync = null; try { controlNodeInSync = getAControlNodeInSync(targetInfo); log.info("Step3: Control node in syc: {}", controlNodeInSync); } catch (Exception e) { log.info("Step3 failed and will be retried: {}", e.getMessage()); retrySleep(); continue; } // check and update images boolean waitSyncingFinish = syncNodes(syncinfo, controlNodeInSync, svcId); if (waitSyncingFinish) { retrySleep(); continue; } else { // For restored cluster or redeployed node, the image files don't exist.it will need to download // the upgrade image from the remote repository. If the node can't connenct with the repository, // or the image doesn't exist in it, syssvc would keep throwing exceptions and restart. // So here break the syncing and it will retry in next check loop(loopInterval=10mins). log.info("Step3: Give up syncing upgrade image, and will retry in next check loop"); } } // Step4: if target version is changed, update log.info("Step4: If target version is changed, update"); final SoftwareVersion currentVersion = localInfo.getCurrentVersion(); final SoftwareVersion targetVersion = targetInfo.getCurrentVersion(); if (currentVersion != null && targetVersion != null && !currentVersion.equals(targetVersion)) { log.info("Step4: Current version: {} != target version: {}. Switch version.", currentVersion, targetVersion); // for standby site, check if the active site is stable and the local site is STANDBY_SYNCED if (drUtil.isStandby()) { if (!coordinator.isActiveSiteHealthy()) { log.info("current site is standby and active site is not stable, sleep 1m and try again"); sleep(STANDBY_UPGRADE_RETRY_INTERVAL); continue; } SiteState localSiteState = drUtil.getLocalSite().getState(); if (!localSiteState.equals(SiteState.STANDBY_SYNCED) && !localSiteState.equals(SiteState.STANDBY_INCR_SYNCING)) { log.info("current site is standby and is in state {}, sleep 1m and try again", localSiteState); sleep(STANDBY_UPGRADE_RETRY_INTERVAL); continue; } } try { if (!getUpgradeLock(svcId)) { retrySleep(); continue; } if (!isQuorumMaintained()) { releaseUpgradeLock(svcId); retrySleep(); continue; } updateCurrentVersion(targetVersion); } catch (Exception e) { log.info("Step4: Upgrade failed and will be retried: {}", e.getMessage()); // Restart the loop immediately so that we release the reboot lock. continue; } } // Step6: sleep log.info("Step6: sleep"); longSleep(); } } private void updateCurrentVersion(SoftwareVersion targetVersion) throws Exception { log.info("Step4: Got reboot lock. Update target version one more time"); // retrieve the target version once again, since it might have been changed (reverted to be specific) // by the first upgraded node holding the lock during upgrade from 2.0/2.1 to 2.2. targetInfo = coordinator.getTargetInfo(RepositoryInfo.class); SoftwareVersion newTargetVersion = targetInfo.getCurrentVersion(); if (!targetVersion.equals(newTargetVersion)) { log.warn("Step4: target version has changed (was: {}, now is: {}). Aborting version change.", targetVersion, newTargetVersion); } else { log.info("Step4: Switching to version: {}", newTargetVersion); localRepository.setCurrentVersion(targetVersion); reboot(); } } private void reconfigAndStartDBSerivces() { localRepository.reconfig(); localRepository.restart(Constants.DBSVC_NAME); localRepository.restart(Constants.GEODBSVC_NAME); } private boolean isDbCurrentVersionEncrypted() { String currentDbVersion = coordinator.getCurrentDbSchemaVersion(); if (currentDbVersion.startsWith("1.") || // Vipr 1.x currentDbVersion.startsWith("2.0") || // Vipr 2.0.x currentDbVersion.startsWith("2.1")) { return false; } return true; } /** * Initialize local and target info * * @throws Exception */ private void initializeLocalAndTargetInfo(String svcId) throws Exception { // Step1: publish current state, and set target if empty // publish node state localInfo = localRepository.getRepositoryInfo(); log.info("Step2a: Local repository information: {}", localInfo); coordinator.setNodeSessionScopeInfo(localInfo); // set target if empty targetInfo = coordinator.getTargetInfo(RepositoryInfo.class); if (targetInfo == null || !isValidRepo) { try { // Set the updated propperty info in coordinator // on devkits, don't check the stability of the "cluster" coordinator.setTargetInfo(localInfo, isValidRepo); targetInfo = coordinator.getTargetInfo(RepositoryInfo.class); log.info("Step2b: Target repository set to local state: {}", targetInfo); } catch (CoordinatorClientException e) { log.info("Step2b: Wait another control node to set target"); retrySleep(); throw e; } } // initialize remoteRepository remoteRepository = RemoteRepository.getInstance(); } /** * Syncing nodes * * @param syncinfo syncing info * @param controlNodeInSync control node which is in sync with target * @param svcId node service id */ private boolean syncNodes(SyncInfo syncinfo, String controlNodeInSync, String svcId) { boolean needToWaitSyncFinish = true; if (controlNodeInSync == null) { // if no control node is synced, compete for leader to download if (!isRemoteDownloadAllowed()) { if (coordinator.hasRemoteDownloadLock(svcId)) { log.info("Step3a: Leader gives up lock"); coordinator.releaseRemoteDownloadLock(svcId); wakeupOtherNodes(); } } else if (coordinator.hasRemoteDownloadLock(svcId) || coordinator.getRemoteDownloadLock(svcId)) { try { if (drUtil.isStandby()) { log.info("Step3a: sync'ing with active site as leader of standby site"); Site activeSite = drUtil.getActiveSite(); URI activeVipEndpoint = URI.create(String.format(SysClientFactory.BASE_URL_FORMAT, activeSite.getVipEndPoint(), service.getEndpoint().getPort())); if (!coordinator.isActiveSiteStable(activeSite)) { log.info("Step3a: software image {} not sync'ed on active site yet. Retry later", syncinfo); } else if (syncToNodeInSync(activeVipEndpoint, syncinfo)) { coordinator.setNodeSessionScopeInfo(localRepository.getRepositoryInfo()); coordinator.releaseRemoteDownloadLock(svcId); wakeupOtherNodes(); } } else { log.info("Step3a: sync'ing with remote repo as leader"); if (syncWithRemote(localInfo, targetInfo, syncinfo)) { coordinator.setNodeSessionScopeInfo(localRepository.getRepositoryInfo()); coordinator.releaseRemoteDownloadLock(svcId); wakeupOtherNodes(); } } } catch (Exception e) { log.error("Step3a: ", e); if ((e instanceof APIException) && (((APIException) e).getServiceCode() == ServiceCode.SYS_DOWNLOAD_IMAGE_ERROR)) { needToWaitSyncFinish = false; log.info("Step3a: Leader gives up lock"); coordinator.releaseRemoteDownloadLock(svcId); wakeupOtherNodes(); } } } else { // Non-nodeInSync block // do nothing, wait nodeInSync to complete download log.info("Step3a: Wait nodeInSync to finish download"); } } else if (controlNodeInSync != null) { try { if (syncToNodeInSync(coordinator.getNodeEndpointForSvcId(controlNodeInSync), syncinfo)) { coordinator.setNodeSessionScopeInfo(localRepository.getRepositoryInfo()); wakeupOtherNodes(); } } catch (Exception e) { log.error("Step3b: {}", e); } } return needToWaitSyncFinish; } private SyncInfo getSyncInfoCommon(final RepositoryInfo localInfo, final RepositoryInfo targetInfo) { log.info("Step3: Synchronizing with target repository. Local repository information: {}", localInfo); log.info("Target repository information: {}", targetInfo); SyncInfo syncinfo = SyncInfoBuilder.getTargetSyncInfo(localInfo, targetInfo); log.info("Sync information: {}", syncinfo); return syncinfo; } private boolean syncWithRemote(final RepositoryInfo localInfo, final RepositoryInfo targetInfo, final SyncInfo syncinfo) throws RemoteRepositoryException, LocalRepositoryException { // Step1 - if something to install, install if (syncinfo.getToInstall() != null && !syncinfo.getToInstall().isEmpty()) { final SoftwareVersion toInstall = syncinfo.getToInstall().get(0); File image = null; if (toInstall != null && (image = getRemoteImage(toInstall)) == null) { return false; } if (image != null) { try { localRepository.installImage(image); } finally { image.delete(); } } } // Step2 - if something to remove, remove if (syncinfo.getToRemove() != null && !syncinfo.getToRemove().isEmpty()) { for (SoftwareVersion v : syncinfo.getToRemove()) { localRepository.removeVersion(v); } } return true; } private boolean syncToNodeInSync(final URI leaderEndpoint, final SyncInfo syncinfo) throws SysClientException, LocalRepositoryException { // Step1 - if something to install, install if (syncinfo.getToInstall() != null && !syncinfo.getToInstall().isEmpty()) { final SoftwareVersion toInstall = syncinfo.getToInstall().get(0); File image = null; if (toInstall != null) { image = getLeaderImage(toInstall, leaderEndpoint); if (image == null) { return false; } } if (image != null) { try { localRepository.installImage(image); } finally { image.delete(); } } } // Step2 - if something to remove, remove if (syncinfo.getToRemove() != null && !syncinfo.getToRemove().isEmpty()) { for (SoftwareVersion v : syncinfo.getToRemove()) { localRepository.removeVersion(v); } } return true; } /** * Get a control node which repository info is synced with target * * @param targetRepository target repository * @return node id * @throws Exception */ private String getAControlNodeInSync(RepositoryInfo targetRepository) throws Exception { final Map<Service, RepositoryInfo> localRepo = coordinator.getAllNodeInfos(RepositoryInfo.class, CONTROL_NODE_SYSSVC_ID_PATTERN); final List<SoftwareVersion> targetVersions = targetRepository.getVersions(); List<String> candidates = new ArrayList<>(); for (Map.Entry<Service, RepositoryInfo> entry : localRepo.entrySet()) { if (targetVersions.equals(entry.getValue().getVersions())) { candidates.add(entry.getKey().getId()); } } // return nodeId which is synced if (!candidates.isEmpty()) { return candidates.get(new Random().nextInt(candidates.size())); } return null; } private File getRemoteImage(final SoftwareVersion version) throws RemoteRepositoryException { final File file = new File(DOWNLOAD_DIR + '/' + version + SOFTWARE_IMAGE_SUFFIX); String prefix = MessageFormat.format("Step3a: version={0} local path=\"{1}\": ", version, file); log.info(prefix); if (isDownloadInProgress()) { return null; } if (file.exists()) { DownloadingInfo downloadingInfo; try { downloadingInfo = coordinator.getTargetInfo(DownloadingInfo.class); } catch (Exception e) { throw APIException.internalServerErrors.getObjectFromError("Node downloading info", "coordinator", e); } coordinator.setNodeGlobalScopeInfo(new DownloadingInfo(downloadingInfo._version, downloadingInfo._size, downloadingInfo._size, DownloadStatus.COMPLETED, new ArrayList<>(Arrays.asList(0, 0))), DOWNLOADINFO_KIND, coordinator.getMySvcId()); // Because the file exists, we set the downloadinfo directly to COMPLETED status log.info(prefix + "Success!"); return file; } if (!tryRemoteDownload()) { return null; } final URL url = getRemoteImageURL(version); prefix = MessageFormat.format("Step3a: version={0} local path=\"{1}\" URL=\"{2}\": ", version, file, url.toString()); log.info(prefix + "Opening remote image stream"); final InputStream in = remoteRepository.getImageInputStream(url); log.info(prefix + "Starting background download."); UpgradeImageDownloader.getInstance(this).startBackgroundDownload(prefix, file, in, url.toString(), version.toString()); return null; } private URL getRemoteImageURL(final SoftwareVersion version) { try { URL url = remoteRepository.getImageURL(version); if (url == null) { throw new IllegalStateException("Image URL is null"); } return url; } catch (Exception e) { log.error("Get remote image URL for version({}) failed", version.toString(), e); throw APIException.internalServerErrors.downloadUpgradeImageError(e); } } private File getLeaderImage(final SoftwareVersion version, final URI leaderEndpoint) throws SysClientException { final File file = new File(DOWNLOAD_DIR + '/' + version + SOFTWARE_IMAGE_SUFFIX); final String prefix = MessageFormat.format("Step3b(): path=\"{0}\" leaderEndpoint=\"{1}\": ", file, leaderEndpoint); log.info(prefix); if (isDownloadInProgress()) { return null; } if (file.exists()) { DownloadingInfo downloadingInfo; try { downloadingInfo = coordinator.getNodeGlobalScopeInfo(DownloadingInfo.class, DOWNLOADINFO_KIND, coordinator.getMySvcId()); // if the downloading info is present and the version is the same then update the progress if (downloadingInfo != null && version.toString().equals(downloadingInfo._version)) { coordinator.setNodeGlobalScopeInfo(new DownloadingInfo(downloadingInfo._version, downloadingInfo._size, downloadingInfo._size, DownloadStatus.COMPLETED, new ArrayList<Integer>(Arrays.asList(0, 0))), DOWNLOADINFO_KIND, coordinator.getMySvcId()); } } catch (Exception e) { throw APIException.internalServerErrors.getObjectFromError("Node downloading info", "coordinator", e); } // Because the file exists, we set the downloadinfo directly to COMPLETED status log.info(prefix + "Success!"); return file; } log.info(prefix + "Opening remote image stream"); try { String uri = SysClientFactory.URI_GET_IMAGE + "?version=" + version; final InputStream in = SysClientFactory.getSysClient(leaderEndpoint) .get(new URI(uri), InputStream.class, MediaType.APPLICATION_OCTET_STREAM); log.info(prefix + "Starting background download."); UpgradeImageDownloader.getInstance(this).startBackgroundDownload(prefix, file, in, uri, version.toString()); } catch (URISyntaxException e) { log.error("Internal error occurred while prepareing get image URI: {}", e); } return null; } /** * Check if remote download is progressing * * @return true if remote download is progressing; false otherwise */ private boolean isDownloadInProgress() { return UpgradeImageDownloader.getInstance(this).isDownloading(); } /** * Method used to try remote download * * if the method is called the first time, since expireTime is initialized as 0, * set expireTime to Now() + 5 mins and set counter to 1. * if current time is less than expireTime, increment counter and return true if counter not greater than maximal try count * if current time is not less than expireTime, reset timer and set counter to 1 * * @return true if succeed; false otherwise */ private boolean tryRemoteDownload() { if (System.currentTimeMillis() < expireTime) { tryRepoCnt++; return tryRepoCnt <= MAX_REPO_RETRIES; } else { expireTime = System.currentTimeMillis() + TIMEOUT_INTERVAL; tryRepoCnt = 1; return true; } } /** * Method used to decide if remote download is allowed * similar to tryRemoteDownload except tryRemoteDownload will increment the try count * * @return true if allowed; false otherwise */ private boolean isRemoteDownloadAllowed() { return tryRepoCnt <= MAX_REPO_RETRIES || System.currentTimeMillis() >= expireTime; } /** * Helper method to provide backward compatibility for upgrade from pre-Yoda releases * This should be replaced with hasRebootLock() when pre-Yoda releases are no longer in the direct upgrade path * * @param svcId * @throws Exception needs to be caught by the caller * @return */ private boolean hasUpgradeLock(String svcId) throws Exception { if (backCompatPreYoda) { log.info("Pre-yoda back compatible flag detected. Check upgrade lock from the global area"); // The lock content has changed in Yoda, previously there's only svcId in the lock node String oldSvcId = coordinator.getMySvcId(); DistributedPersistentLock lock = coordinator.getCoordinatorClient() .getPersistentLock(DISTRIBUTED_UPGRADE_LOCK); log.info("Acquiring the upgrade lock for {}...", oldSvcId); if (lock != null) { String lockOwner = lock.getLockOwner(); if (lockOwner != null && lockOwner.equals(oldSvcId)) { log.info("Current owner of the upgrade lock: {} ", lockOwner); return true; } } return false; } else { return hasRebootLock(svcId); } } /** * Helper method to provide backward compatibility for upgrade from pre-Yoda releases * This should be replaced with getRebootLock() when pre-Yoda releases are no longer in the direct upgrade path * * @param svcId * @throws Exception needs to be caught by the caller * @return */ private boolean getUpgradeLock(String svcId) throws Exception { if (backCompatPreYoda) { log.info("Pre-yoda back compatible flag detected. Check upgrade lock from the global area"); // The lock content has changed in Yoda, previously there's only svcId in the lock node String oldSvcId = coordinator.getMySvcId(); DistributedPersistentLock lock = coordinator.getCoordinatorClient() .getPersistentLock(DISTRIBUTED_UPGRADE_LOCK); log.info("Acquiring the upgrade lock for {}...", oldSvcId); boolean result = lock.acquireLock(oldSvcId); if (!result) { log.info("Acquiring reboot lock failed. Retrying..."); return false; } log.info("Successfully acquired the reboot lock."); return true; } else { return getRebootLock(svcId); } } /** * Helper method to provide backward compatibility for upgrade from pre-Yoda releases * This should be replaced with releaseRebootLock() when pre-Yoda releases are no longer in the direct upgrade path * * @param svcId * @return */ private void releaseUpgradeLock(String svcId) { if (backCompatPreYoda) { log.info("Pre-yoda back compatible flag detected. Check upgrade lock from the global area"); // The lock content has changed in Yoda, previously there's only svcId in the lock node String oldSvcId = coordinator.getMySvcId(); try { DistributedPersistentLock lock = coordinator.getCoordinatorClient() .getPersistentLock(DISTRIBUTED_UPGRADE_LOCK); if (lock != null) { String lockOwner = lock.getLockOwner(); if (lockOwner == null) { log.info("Upgrade lock is not held by any node"); return; } if (!lockOwner.equals(oldSvcId)) { log.error("Lock owner is {}", lockOwner); } else { boolean result = lock.releaseLock(lockOwner); if (result) { log.info("Upgrade lock released by owner {} successfully", lockOwner); } else { log.info("Upgrade lock release failed for owner {}", lockOwner); } } } } catch (Exception e) { log.error("Failed to release the upgrade lock:", e); } } else { releaseRebootLock(svcId); } } @Override public void stop() { super.stop(); UpgradeImageDownloader.getInstance(this).shutdownNow(); } }