/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.alibaba.jstorm.task.master;

import backtype.storm.task.TopologyContext;
import com.alibaba.jstorm.cluster.StormClusterState;
import com.alibaba.jstorm.cluster.StormStatus;
import com.alibaba.jstorm.daemon.nimbus.StatusType;
import com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot;
import com.alibaba.jstorm.task.upgrade.GrayUpgradeConfig;
import com.google.common.collect.Sets;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Topology-master handler that drives a gray upgrade of a topology's workers.
 *
 * <p>On every {@link #run()} invocation it reads the {@link GrayUpgradeConfig} stored in the
 * cluster state, decides whether the upgrade is finished, expired, rolled back or still in
 * progress, and, if requested by the config, registers the next batch of workers as
 * "upgrading" in the cluster state.
 *
 * @author wange
 * @since 2.3.1
 */
public class GrayUpgradeHandler implements TMHandler, Runnable {
    private static final Logger LOG = LoggerFactory.getLogger(GrayUpgradeHandler.class);

    private StormClusterState stormClusterState;
    private String topologyId;
    private TopologyMasterContext tmContext;

    /** host:port of a worker -> ids of the tasks it hosted when {@link #init} ran. */
    private Map<String, Set<Integer>> hostPortToTasks;
    /** task id -> host:port of the worker that hosted it when {@link #init} ran. */
    private Map<Integer, String> taskToHostPort;
    /** Upgrade candidates, i.e. all workers except the one hosting the topology master task. */
    private Set<String> totalWorkers;

    /**
     * Captures the cluster state handle and topology id from the context and snapshots the
     * current worker/task layout into {@link #hostPortToTasks} and {@link #taskToHostPort}.
     *
     * @param tmContext topology master context supplying cluster state, topology id and workers
     */
    @Override
    public void init(TopologyMasterContext tmContext) {
        this.tmContext = tmContext;
        this.stormClusterState = tmContext.getZkCluster();
        this.topologyId = tmContext.getTopologyId();

        this.hostPortToTasks = new HashMap<>();
        this.taskToHostPort = new HashMap<>();
        for (ResourceWorkerSlot workerSlot : tmContext.getWorkerSet().get()) {
            Set<Integer> tasks = workerSlot.getTasks();
            String hostPort = workerSlot.getHostPort();
            // copy the task set so later changes to the slot do not leak into our snapshot
            hostPortToTasks.put(hostPort, Sets.newHashSet(tasks));
            for (Integer task : tasks) {
                this.taskToHostPort.put(task, hostPort);
            }
        }

        // populated lazily on the first run() that actually sees an upgrade request
        this.totalWorkers = new HashSet<>();
    }

    /**
     * No-op: this handler does not react to events, all work happens in {@link #run()}.
     */
    @Override
    public void process(Object event) throws Exception {
    }

    /**
     * No-op: nothing to release.
     */
    @Override
    public void cleanup() {
    }

    /**
     * scheduled runnable callback, called periodically
     *
     * <p>Steps, in order:
     * <ol>
     * <li>skip when there is no upgrade config, or it is completed and not a rollback;</li>
     * <li>complete an expired (non-rollback) upgrade and set the topology active;</li>
     * <li>if workers are still registered as upgrading, re-notify them and return;</li>
     * <li>if this is a rollback, remove all upgrade info and set the topology active;</li>
     * <li>if all workers are upgraded, mark the config completed and set the topology active;</li>
     * <li>otherwise pick the next batch (when the config asks to continue) and pause again.</li>
     * </ol>
     * Any exception thrown along the way triggers {@link #recover()}.
     */
    @Override
    public void run() {
        try {
            GrayUpgradeConfig grayUpgradeConf =
                    (GrayUpgradeConfig) stormClusterState.get_gray_upgrade_conf(topologyId);

            // no upgrade request
            if (grayUpgradeConf == null) {
                LOG.debug("gray upgrade conf is null, skip...");
                return;
            }

            if (grayUpgradeConf.isCompleted() && !grayUpgradeConf.isRollback()) {
                LOG.debug("detected a complete upgrade, skip...");
                return;
            }

            if (grayUpgradeConf.isExpired() && !grayUpgradeConf.isRollback()) {
                LOG.info("detected an expired upgrade, completing...");
                // todo: should we check all task status?
                GrayUpgradeConfig.completeUpgrade(grayUpgradeConf);
                stormClusterState.set_gray_upgrade_conf(topologyId, grayUpgradeConf);
                stormClusterState.update_storm(topologyId, new StormStatus(StatusType.active));
                return;
            }

            // first time, set workers
            if (this.totalWorkers.isEmpty()) {
                setTotalWorkers(tmContext);
            }

            // notify current upgrading workers to upgrade (again)
            Set<String> upgradingWorkers =
                    Sets.newHashSet(stormClusterState.get_upgrading_workers(topologyId));
            if (!upgradingWorkers.isEmpty()) {
                LOG.info("Following workers are under upgrade:{}", upgradingWorkers);
                for (String worker : upgradingWorkers) {
                    notifyToUpgrade(worker);
                }
                return;
            }

            Set<String> upgradedWorkers =
                    Sets.newHashSet(stormClusterState.get_upgraded_workers(topologyId));
            if (grayUpgradeConf.isRollback()) {
                LOG.info("Rollback has completed, removing upgrade info in zk and updating storm status...");
                // there's no way back after a rollback
                stormClusterState.remove_gray_upgrade_info(topologyId);
                stormClusterState.update_storm(topologyId, new StormStatus(StatusType.active));
                return;
            }

            if (isUpgradeCompleted(upgradedWorkers, totalWorkers)) {
                LOG.info("This upgraded has finished! Marking upgrade config as completed...");
                GrayUpgradeConfig.completeUpgrade(grayUpgradeConf);
                stormClusterState.set_gray_upgrade_conf(topologyId, grayUpgradeConf);
                stormClusterState.update_storm(topologyId, new StormStatus(StatusType.active));
                return;
            }

            // assign next batch of workers
            if (grayUpgradeConf.continueUpgrading()) {
                pickWorkersToUpgrade(grayUpgradeConf, upgradedWorkers);
            }

            // pause upgrading
            grayUpgradeConf.setContinueUpgrade(false);
            stormClusterState.set_gray_upgrade_conf(topologyId, grayUpgradeConf);
        } catch (Exception ex) {
            // NOTE: every failure in the steps above ends up here, not only the initial zk read
            LOG.error("Failed to get upgrade config from zk, will abort this upgrade...", ex);
            recover();
        }
    }

    /**
     * Selects the next batch of workers and registers them as upgrading.
     *
     * <p>Selection priority: explicitly listed workers, then workers hosting tasks of the
     * configured component, then up to {@code workerNum} arbitrary not-yet-upgraded workers.
     * Only workers in {@link #totalWorkers} that are not yet upgraded are eligible.
     *
     * @param grayUpgradeConf upgrade config; its worker list / component may be reset here
     * @param upgradedWorkers workers that have already been upgraded
     * @throws Exception if registering a worker in the cluster state fails
     */
    private void pickWorkersToUpgrade(GrayUpgradeConfig grayUpgradeConf,
                                      Set<String> upgradedWorkers) throws Exception {
        Set<String> remainingSlots = new HashSet<>(this.totalWorkers);
        remainingSlots.removeAll(upgradedWorkers);

        int workerNum = grayUpgradeConf.getWorkerNum();
        String component = grayUpgradeConf.getComponent();
        Set<String> workers = grayUpgradeConf.getWorkers();
        TopologyContext topologyContext = tmContext.getContext();

        if (!workers.isEmpty()) {
            LOG.info("Upgrading specified workers:{}", workers);
            for (String worker : workers) {
                if (remainingSlots.contains(worker)) {
                    addUpgradingSlot(worker);
                } else {
                    LOG.warn("Worker {} is not in topology worker list or has been upgraded already, skip.", worker);
                }
            }
            // reset workers
            workers.clear();
        } else if (!StringUtils.isBlank(component)) {
            LOG.info("Upgrading workers of component:{}", component);
            List<Integer> tasks = topologyContext.getComponentTasks(component);
            if (tasks == null) {
                LOG.error("Failed to get tasks for component {}, maybe it's a wrong component name.", component);
                return;
            }

            Set<String> slots = new HashSet<>();
            for (Integer task : tasks) {
                String worker = this.taskToHostPort.get(task);
                if (worker != null && remainingSlots.contains(worker)) {
                    slots.add(worker);
                }
            }
            LOG.info("Available workers of component {}: {}", component, slots);

            // a non-positive workerNum means "all available workers of the component"
            boolean takesAllSlots = workerNum <= 0 || workerNum >= slots.size();
            pickUpgradingSlots(slots, workerNum > 0 ? workerNum : slots.size());

            // reset component once every available worker of it has been picked
            if (takesAllSlots) {
                grayUpgradeConf.setComponent(null);
            }
        } else if (workerNum > 0) {
            LOG.info("Upgrading workers at random");
            pickUpgradingSlots(remainingSlots, workerNum);
        }
    }

    /**
     * Registers workers from {@code remainingSlots} as upgrading, in the set's iteration order,
     * stopping once {@code n} of them have been registered.
     *
     * @param remainingSlots candidate workers
     * @param n              number of workers to pick; callers pass a positive value whenever
     *                       {@code remainingSlots} is non-empty
     * @throws Exception if registering a worker in the cluster state fails
     */
    private void pickUpgradingSlots(Set<String> remainingSlots, int n) throws Exception {
        int picked = 0;
        for (String remainingSlot : remainingSlots) {
            addUpgradingSlot(remainingSlot);
            picked++;
            if (picked == n) {
                break;
            }
        }
    }

    /**
     * Records the worker as upgrading in the cluster state and then notifies it.
     *
     * @param worker host:port of the worker
     * @throws Exception if the cluster state update fails
     */
    private void addUpgradingSlot(String worker) throws Exception {
        stormClusterState.add_upgrading_worker(topologyId, worker);
        notifyToUpgrade(worker);
    }

    /**
     * @return {@code true} when at least one worker is upgraded and the number of upgraded
     *         workers is not smaller than the number of all candidate workers
     */
    private boolean isUpgradeCompleted(Collection<String> upgradedWorkers, Collection<String> allWorkers) {
        return !upgradedWorkers.isEmpty() && upgradedWorkers.size() >= allWorkers.size();
    }

    /**
     * Notifies a worker that it should upgrade. Currently this only logs; the direct emit to the
     * worker's head task is disabled.
     *
     * <p>Workers that are missing from the layout snapshot taken in {@link #init} (or that have
     * no tasks) are skipped with a warning. Without this guard the lookup would throw, and the
     * catch-all in {@link #run()} would abort the whole upgrade via {@link #recover()}.
     *
     * @param workerSlot host:port of the worker to notify
     */
    private void notifyToUpgrade(String workerSlot) {
        Set<Integer> tasks = hostPortToTasks.get(workerSlot);
        if (tasks == null || tasks.isEmpty()) {
            LOG.warn("No tasks found for worker {}, skip notifying.", workerSlot);
            return;
        }
        int headTask = tasks.iterator().next();
        LOG.info("notifying worker {} to upgrade(task {})...", workerSlot, headTask);
        //collector.emitDirect(headTask, Common.TOPOLOGY_MASTER_GRAY_UPGRADE_STREAM_ID, new Values("upgrade"));
    }

    /**
     * Rebuilds {@link #totalWorkers} from the context's current worker set, leaving out the
     * worker that hosts the topology master task itself.
     *
     * @param tmContext topology master context supplying the worker set and the TM task id
     */
    private void setTotalWorkers(TopologyMasterContext tmContext) {
        Set<ResourceWorkerSlot> workerSlots = tmContext.getWorkerSet().get();
        int tmTaskId = tmContext.getTaskId();

        this.totalWorkers.clear();
        for (ResourceWorkerSlot workerSlot : workerSlots) {
            if (!workerSlot.getTasks().contains(tmTaskId)) {
                this.totalWorkers.add(workerSlot.getHostPort());
            }
        }
    }

    /**
     * Best-effort abort: removes all gray upgrade info from the cluster state and sets the
     * topology status back to active. Failures are logged and not rethrown.
     */
    private void recover() {
        try {
            LOG.info("Removing upgrading info...");
            stormClusterState.remove_gray_upgrade_info(topologyId);

            LOG.info("Reset topology state to ACTIVE...");
            stormClusterState.update_storm(topologyId, new StormStatus(StatusType.active));
        } catch (Exception ex) {
            LOG.error("Failed to recover from upgrade", ex);
        }
    }
}