/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.jstorm.daemon.supervisor;
import backtype.storm.utils.LocalState;
import com.alibaba.jstorm.blobstore.BlobStore;
import com.alibaba.jstorm.blobstore.BlobStoreUtils;
import com.alibaba.jstorm.callback.RunnableCallback;
import com.alibaba.jstorm.client.ConfigExtension;
import com.alibaba.jstorm.cluster.Common;
import com.alibaba.jstorm.cluster.StormClusterState;
import com.alibaba.jstorm.cluster.StormConfig;
import com.alibaba.jstorm.daemon.worker.LocalAssignment;
import com.alibaba.jstorm.event.EventManager;
import com.alibaba.jstorm.event.EventManagerZkPusher;
import com.alibaba.jstorm.schedule.Assignment;
import com.alibaba.jstorm.schedule.default_assign.ResourceWorkerSlot;
import com.alibaba.jstorm.utils.JStormServerUtils;
import com.alibaba.jstorm.utils.JStormUtils;
import com.alibaba.jstorm.utils.PathUtils;
import com.alibaba.jstorm.utils.TimeUtils;
import org.apache.commons.io.FileExistsException;
import org.apache.commons.io.FileUtils;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.util.Map.Entry;
/**
* Supervisor SynchronizeSupervisor workflow: (1) write the local assignment to LocalState, (2) download the topology code of new
* assignments, (3) remove useless topologies, (4) run SyncProcessEvent with the resulting local assignment.
*
* @author Johnfang (xiaojian.fxj@alibaba-inc.com)
*/
class SyncSupervisorEvent extends RunnableCallback {
private static final Logger LOG = LoggerFactory.getLogger(SyncSupervisorEvent.class);
// private Supervisor supervisor;
private String supervisorId;
private EventManager syncSupEventManager;
private StormClusterState stormClusterState;
private LocalState localState;
private Map<Object, Object> conf;
private SyncProcessEvent syncProcesses;
private int lastTime;
private Heartbeat heartbeat;
/**
 * Creates the synchronization event and stores the collaborators it works with.
 *
 * @param supervisorId        id of this supervisor; compared against the node id of assigned workers
 * @param conf                supervisor configuration
 * @param syncSupEventManager event manager handed to the ZK callback created in {@link #run()}
 * @param stormClusterState   accessor used to read assignments from ZK
 * @param localState          local store for assignments, assignment versions and cleanup timeouts
 * @param syncProcesses       process synchronizer that is run at the end of every pass
 * @param heartbeat           supervisor heartbeat; source of the machine check status
 */
public SyncSupervisorEvent(String supervisorId, Map conf, EventManager syncSupEventManager,
        StormClusterState stormClusterState, LocalState localState, SyncProcessEvent syncProcesses, Heartbeat heartbeat) {
    this.supervisorId = supervisorId;
    this.conf = conf;
    this.syncSupEventManager = syncSupEventManager;
    this.stormClusterState = stormClusterState;
    this.localState = localState;
    this.syncProcesses = syncProcesses;
    this.heartbeat = heartbeat;
}
/**
* Performs one supervisor synchronization pass.
* <p>
* The pass loads the cached ZK assignments and their versions from LocalState, refreshes them from ZK (or clears them when the
* machine check status is panic or error), stores the resulting per-port assignment in LocalState, downloads topology code that
* is missing or has changed, removes downloaded topology code that no assignment on this supervisor refers to any more, and
* finally runs SyncProcessEvent with the new assignment.
* <p>
* Any exception raised during the pass is logged and not rethrown. When the machine check status is panic,
* {@code JStormUtils.halt_process} is invoked at the end of the pass, whether or not the pass succeeded.
*/
@Override
public void run() {
LOG.debug("Synchronizing supervisor, interval seconds:" + TimeUtils.time_delta(lastTime));
lastTime = TimeUtils.current_time_secs();
// Copy the machine check status once so that every step of this pass sees the same value,
// even if the heartbeat's status changes while the pass is running
MachineCheckStatus checkStatus = new MachineCheckStatus();
checkStatus.SetType(heartbeat.getCheckStatus().getType());
try {
// Callback handed to every ZK read below
// NOTE(review): presumably re-queues this event on syncSupEventManager when a watched node changes - confirm in EventManagerZkPusher
RunnableCallback syncCallback = new EventManagerZkPusher(this, syncSupEventManager);
// Load the assignment versions and assignments cached by the previous pass; start empty when nothing is cached
Map<String, Integer> assignmentVersion = (Map<String, Integer>) localState.get(Common.LS_LOCAL_ZK_ASSIGNMENT_VERSION);
if (assignmentVersion == null) {
assignmentVersion = new HashMap<String, Integer>();
}
Map<String, Assignment> assignments = (Map<String, Assignment>) localState.get(Common.LS_LOCAl_ZK_ASSIGNMENTS);
if (assignments == null) {
assignments = new HashMap<String, Assignment>();
}
LOG.debug("get local assignments " + assignments);
LOG.debug("get local assignments version " + assignmentVersion);
/**
* Step 1: get all assignments from ZK, passing the callback to every read
*/
if (checkStatus.getType().equals(MachineCheckStatus.StatusType.panic) || checkStatus.getType().equals(MachineCheckStatus.StatusType.error)){
// If the status is panic or error, clear all assignments; the empty assignment passed on below makes all workers get killed
assignmentVersion.clear();
assignments.clear();
LOG.warn("Supervisor Machine Check Status :" + checkStatus.getType() +", so kill all workers.");
} else {
getAllAssignments(assignmentVersion, assignments, syncCallback);
}
LOG.debug("Get all assignments " + assignments);
/**
* Step 2: get the list of already downloaded topology ids (STORM-LOCAL-DIR/supervisor/stormdist/)
*/
List<String> downloadedTopologyIds = StormConfig.get_supervisor_toplogy_list(conf);
LOG.debug("Downloaded storm ids: " + downloadedTopologyIds);
/**
* Step 3: build <port, LocalAssignment> for this supervisor from the assignments
*/
Map<Integer, LocalAssignment> zkAssignment;
zkAssignment = getLocalAssign(stormClusterState, supervisorId, assignments);
Map<Integer, LocalAssignment> localAssignment;
/**
* Step 4: write the new local assignment to LocalState, keeping the previous one for the comparison in step 5
*/
try {
LOG.debug("Writing local assignment " + zkAssignment);
// Read the previously stored assignment BEFORE overwriting it
localAssignment = (Map<Integer, LocalAssignment>) localState.get(Common.LS_LOCAL_ASSIGNMENTS);
if (localAssignment == null) {
localAssignment = new HashMap<Integer, LocalAssignment>();
}
localState.put(Common.LS_LOCAL_ASSIGNMENTS, zkAssignment);
} catch (IOException e) {
LOG.error("put LS_LOCAL_ASSIGNMENTS " + zkAssignment + " of localState failed");
throw e;
}
/**
* Step 5: collect the topologies whose code has to be downloaded again: topologies that changed since the
* previous assignment plus topologies for which SyncProcessEvent requested a re-download
*/
Set<String> updateTopologys;
updateTopologys = getUpdateTopologys(localAssignment, zkAssignment, assignments);
Set<String> reDownloadTopologys = getNeedReDownloadTopologys(localAssignment);
if (reDownloadTopologys != null) {
updateTopologys.addAll(reDownloadTopologys);
}
/**
* Step 6: download the code of every topology that has a worker on this supervisor and is missing or updated
*/
Map<String, String> topologyCodes = getTopologyCodeLocations(assignments, supervisorId);
// downloadFailedTopologyIds collects the topologies whose code could not be downloaded
Set<String> downloadFailedTopologyIds = new HashSet<String>();
downloadTopology(topologyCodes, downloadedTopologyIds, updateTopologys, assignments, downloadFailedTopologyIds);
/**
* Step 7: remove any downloaded topology that is no longer needed
*/
removeUselessTopology(topologyCodes, downloadedTopologyIds);
/**
* Step 8: run SyncProcessEvent directly with the new assignment and the failed downloads
*/
// processEventManager.add(syncProcesses);
syncProcesses.run(zkAssignment, downloadFailedTopologyIds);
// If everything is OK, set the trigger to update the heartbeat of
// the supervisor
heartbeat.updateHbTrigger(true);
try {
// Store the refreshed ZK assignments and versions only after the whole pass succeeded
localState.put(Common.LS_LOCAL_ZK_ASSIGNMENT_VERSION, assignmentVersion);
localState.put(Common.LS_LOCAl_ZK_ASSIGNMENTS, assignments);
} catch (IOException e) {
LOG.error("put LS_LOCAL_ZK_ASSIGNMENT_VERSION&&LS_LOCAl_ZK_ASSIGNMENTS failed");
throw e;
}
} catch (Exception e) {
// Logged only; the exception is intentionally not rethrown
LOG.error("Failed to Sync Supervisor", e);
// throw new RuntimeException(e);
}
if (checkStatus.getType().equals(MachineCheckStatus.StatusType.panic)){
// If the status is panic, halt the supervisor itself after the pass above
JStormUtils.halt_process(0, "Supervisor Machine Check Status : Panic , !!!!shutdown!!!!");
}
}
/**
 * Downloads the code of a topology, dispatching on the configured cluster mode
 * (a mode ending in "distributed" or in "local").
 * <p>
 * An unrecognised or missing cluster mode is reported as an {@link IOException} instead of silently
 * doing nothing, so that the caller does not record a download that never happened as successful.
 *
 * @param conf          supervisor configuration
 * @param topologyId    id of the topology whose code is downloaded
 * @param masterCodeDir code location of the topology as recorded in its assignment
 * @throws IOException if the download fails or the cluster mode is not supported
 * @throws TException  propagated from the mode-specific download
 */
private void downloadStormCode(Map conf, String topologyId, String masterCodeDir) throws IOException, TException {
    String clusterMode = StormConfig.cluster_mode(conf);
    if (clusterMode != null && clusterMode.endsWith("distributed")) {
        downloadDistributeStormCode(conf, topologyId, masterCodeDir);
    } else if (clusterMode != null && clusterMode.endsWith("local")) {
        downloadLocalStormCode(conf, topologyId, masterCodeDir);
    } else {
        throw new IOException("Unsupported cluster mode '" + clusterMode + "', can not download code of " + topologyId);
    }
}
/**
 * Downloads topology code in local cluster mode.
 * <p>
 * The serialized topology code and conf are read from the nimbus blob store into a temporary directory,
 * which is then moved (or, when the target already exists, copied) to the supervisor's stormdist
 * directory of the topology. Afterwards the resources directory is extracted from the first classpath
 * jar containing it or, failing that, copied from the context class loader's resource location.
 * <p>
 * Every output stream opened for a blob is closed, and the temporary directory is always removed,
 * including when the download fails half-way.
 *
 * @param conf          supervisor configuration
 * @param topologyId    id of the topology whose code is downloaded
 * @param masterCodeDir code location passed to the blob store factory
 * @throws IOException if reading a blob or a file system operation fails
 * @throws TException  propagated from the blob store
 */
private void downloadLocalStormCode(Map conf, String topologyId, String masterCodeDir) throws IOException, TException {
    // STORM_LOCAL_DIR/supervisor/tmp/(UUID)
    String tmproot = StormConfig.supervisorTmpDir(conf) + File.separator + UUID.randomUUID().toString();
    // STORM-LOCAL-DIR/supervisor/stormdist/storm-id
    String stormroot = StormConfig.supervisor_stormdist_root(conf, topologyId);
    File srcDir = new File(tmproot);
    try {
        BlobStore blobStore = null;
        try {
            blobStore = BlobStoreUtils.getNimbusBlobStore(conf, masterCodeDir, null);
            FileUtils.forceMkdir(srcDir);
            readBlobAndClose(blobStore, StormConfig.master_stormcode_key(topologyId), new FileOutputStream(StormConfig.stormcode_path(tmproot)));
            readBlobAndClose(blobStore, StormConfig.master_stormconf_key(topologyId), new FileOutputStream(StormConfig.stormconf_path(tmproot)));
        } finally {
            if (blobStore != null) {
                blobStore.shutdown();
            }
        }
        File destDir = new File(stormroot);
        try {
            FileUtils.moveDirectory(srcDir, destDir);
        } catch (FileExistsException e) {
            // Target directory already exists: merge the fresh files into it instead
            FileUtils.copyDirectory(srcDir, destDir);
        }
    } finally {
        // No-op after a successful move; removes the leftovers after a copy or a failure
        FileUtils.deleteQuietly(srcDir);
    }
    ClassLoader classloader = Thread.currentThread().getContextClassLoader();
    String resourcesJar = resourcesJar();
    URL url = classloader.getResource(StormConfig.RESOURCES_SUBDIR);
    String targetDir = stormroot + '/' + StormConfig.RESOURCES_SUBDIR;
    if (resourcesJar != null) {
        LOG.info("Extracting resources from jar at " + resourcesJar + " to " + targetDir);
        JStormUtils.extractDirFromJar(resourcesJar, StormConfig.RESOURCES_SUBDIR, stormroot);
    } else if (url != null) {
        LOG.info("Copying resources at " + url.toString() + " to " + targetDir);
        FileUtils.copyDirectory(new File(url.getFile()), (new File(targetDir)));
    }
}

/**
 * Reads one blob into the given stream and closes the stream afterwards, whether or not the read succeeded.
 */
private void readBlobAndClose(BlobStore blobStore, String key, FileOutputStream out) throws IOException, TException {
    try {
        blobStore.readBlobTo(key, out);
    } finally {
        out.close();
    }
}
/**
 * Downloads topology code in distributed cluster mode.
 * <p>
 * The code is fetched from the blob store into a fresh temporary directory, the resources directory is
 * extracted from the downloaded jar into that same directory, and the directory is then moved (or copied,
 * when the target already exists) to the supervisor's stormdist directory of the topology. The temporary
 * directory is removed in every case.
 * <p>
 * No synchronization is needed because the EventManager executes events serially.
 *
 * @param conf          supervisor configuration
 * @param topologyId    id of the topology whose code is downloaded
 * @param masterCodeDir not used by this method
 * @throws IOException if the download or a file system operation fails
 * @throws TException  propagated from the blob store download
 */
private void downloadDistributeStormCode(Map conf, String topologyId, String masterCodeDir) throws IOException, TException {
    // STORM_LOCAL_DIR/supervisor/tmp/(UUID)
    String tmpPath = StormConfig.supervisorTmpDir(conf) + File.separator + UUID.randomUUID().toString();
    File tmpDir = new File(tmpPath);
    try {
        // STORM_LOCAL_DIR/supervisor/stormdist/topologyId
        String distPath = StormConfig.supervisor_stormdist_root(conf, topologyId);
        JStormServerUtils.downloadCodeFromBlobStore(conf, tmpPath, topologyId);
        // tmpPath/stormjar.jar holds the resources directory that has to be unpacked next to it
        String downloadedJar = StormConfig.stormjar_path(tmpPath);
        JStormUtils.extractDirFromJar(downloadedJar, StormConfig.RESOURCES_SUBDIR, tmpPath);
        File distDir = new File(distPath);
        try {
            FileUtils.moveDirectory(tmpDir, distDir);
        } catch (FileExistsException alreadyThere) {
            FileUtils.copyDirectory(tmpDir, distDir);
            FileUtils.deleteQuietly(tmpDir);
        }
    } finally {
        FileUtils.deleteQuietly(tmpDir);
    }
}
/**
 * Finds the first jar on {@code java.class.path}, in classpath order, that contains the resources directory.
 * <p>
 * The search stops at the first match, so jars after it are not opened.
 *
 * @return path of the first matching jar, or {@code null} when the class path is unset or no jar matches
 */
private String resourcesJar() {
    String classPath = System.getProperty("java.class.path");
    if (classPath == null) {
        return null;
    }
    for (String entry : classPath.split(File.pathSeparator)) {
        if (entry.endsWith(".jar") && JStormUtils.zipContainsDir(entry, StormConfig.RESOURCES_SUBDIR)) {
            return entry;
        }
    }
    return null;
}
/**
 * Builds the per-port assignment of this supervisor from all topology assignments.
 * A port may be assigned to one topology only.
 *
 * @param stormClusterState passed through to {@code readMyTasks}
 * @param supervisorId      id of this supervisor
 * @param assignments       topology id to assignment
 * @return map {port, LocalAssignment}
 * @throws RuntimeException if two assignments claim the same port
 * @throws Exception        propagated from {@code readMyTasks}
 */
private Map<Integer, LocalAssignment> getLocalAssign(StormClusterState stormClusterState, String supervisorId, Map<String, Assignment> assignments)
        throws Exception {
    Map<Integer, LocalAssignment> byPort = new HashMap<Integer, LocalAssignment>();
    for (Entry<String, Assignment> topologyEntry : assignments.entrySet()) {
        Map<Integer, LocalAssignment> mine =
                readMyTasks(stormClusterState, topologyEntry.getKey(), supervisorId, topologyEntry.getValue());
        if (mine != null) {
            for (Entry<Integer, LocalAssignment> slot : mine.entrySet()) {
                // a port must be assigned to one topology only
                if (byPort.containsKey(slot.getKey())) {
                    throw new RuntimeException("Should not have multiple topologys assigned to one port");
                }
                byPort.put(slot.getKey(), slot.getValue());
            }
        }
    }
    return byPort;
}
/**
 * Extracts the workers of one topology assignment that are placed on this supervisor.
 *
 * @param stormClusterState not used by this method
 * @param topologyId        id of the topology the assignment belongs to
 * @param supervisorId      id of this supervisor; workers whose node id differs are skipped
 * @param assignmentInfo    assignment of the topology
 * @return map {port, LocalAssignment}; empty when the assignment has no workers or none on this supervisor
 * @throws Exception propagated from building a LocalAssignment
 */
private Map<Integer, LocalAssignment> readMyTasks(StormClusterState stormClusterState, String topologyId, String supervisorId, Assignment assignmentInfo)
        throws Exception {
    Map<Integer, LocalAssignment> byPort = new HashMap<Integer, LocalAssignment>();
    Set<ResourceWorkerSlot> slots = assignmentInfo.getWorkers();
    if (slots == null) {
        LOG.error("No worker of assignment's " + assignmentInfo);
    } else {
        for (ResourceWorkerSlot slot : slots) {
            if (supervisorId.equals(slot.getNodeId())) {
                byPort.put(slot.getPort(),
                        new LocalAssignment(topologyId, slot.getTasks(), Common.topologyIdToName(topologyId), slot.getMemSize(),
                                slot.getCpu(), slot.getJvm(), assignmentInfo.getTimeStamp()));
            }
        }
    }
    return byPort;
}
/**
 * Collects the master code dir of every topology that has at least one worker on the given supervisor.
 * <p>
 * Assignments without a worker set are skipped instead of failing with a NullPointerException; the same
 * condition is tolerated by {@code readMyTasks}.
 *
 * @param assignments  topology id to assignment
 * @param supervisorId id of the supervisor whose topologies are collected
 * @return map {topologyId, master-code-dir}
 * @throws Exception declared for callers; not thrown by the visible code paths
 */
public static Map<String, String> getTopologyCodeLocations(Map<String, Assignment> assignments, String supervisorId) throws Exception {
    Map<String, String> rtn = new HashMap<String, String>();
    for (Entry<String, Assignment> entry : assignments.entrySet()) {
        String topologyId = entry.getKey();
        Assignment assignmentInfo = entry.getValue();
        Set<ResourceWorkerSlot> workers = assignmentInfo.getWorkers();
        if (workers == null) {
            LOG.warn("No worker in assignment of " + topologyId + ", skip its code location");
            continue;
        }
        for (ResourceWorkerSlot worker : workers) {
            if (supervisorId.equals(worker.getNodeId())) {
                rtn.put(topologyId, assignmentInfo.getMasterCodeDir());
                // one worker on this supervisor is enough to need the code
                break;
            }
        }
    }
    return rtn;
}
/**
 * Downloads the code of every topology that is either not downloaded yet or marked as updated.
 * <p>
 * Each download is attempted up to three times. After a successful download the assignment time stamp is
 * written next to the code. Topologies whose download failed on every attempt are added to
 * {@code downloadFailedTopologyIds}; their stormdist directory is removed only when the topology was not
 * already downloaded before this pass. The task cleanup timeouts of the successfully downloaded topologies
 * are refreshed at the end.
 *
 * @param topologyCodes             topology id to master code dir, for the topologies needed on this supervisor
 * @param downloadedTopologyIds     ids of the topologies already present locally
 * @param updateTopologys           ids of the topologies that must be downloaded again
 * @param assignments               topology id to assignment; source of the time stamp
 * @param downloadFailedTopologyIds output set receiving the ids whose download failed
 * @throws Exception on any unexpected failure other than the IOException/TException of a download attempt
 */
public void downloadTopology(Map<String, String> topologyCodes, List<String> downloadedTopologyIds, Set<String> updateTopologys,
        Map<String, Assignment> assignments, Set<String> downloadFailedTopologyIds) throws Exception {
    final int maxAttempts = 3;
    Set<String> downloadTopologys = new HashSet<String>();
    for (Entry<String, String> entry : topologyCodes.entrySet()) {
        String topologyId = entry.getKey();
        String masterCodeDir = entry.getValue();
        if (downloadedTopologyIds.contains(topologyId) && !updateTopologys.contains(topologyId)) {
            // already present and unchanged
            continue;
        }
        LOG.info("Downloading code for storm id " + topologyId + " from " + masterCodeDir);
        boolean downloaded = false;
        for (int attempt = 0; attempt < maxAttempts && !downloaded; attempt++) {
            try {
                downloadStormCode(conf, topologyId, masterCodeDir);
                // Update assignment timeStamp
                StormConfig.write_supervisor_topology_timestamp(conf, topologyId, assignments.get(topologyId).getTimeStamp());
                downloaded = true;
            } catch (IOException e) {
                // pass the exception as throwable so the stack trace is kept
                LOG.error("downloadStormCode failed topologyId:" + topologyId + " masterCodeDir:" + masterCodeDir, e);
            } catch (TException e) {
                LOG.error("downloadStormCode failed topologyId:" + topologyId + " masterCodeDir:" + masterCodeDir, e);
            }
        }
        if (downloaded) {
            LOG.info("Finished downloading code for storm id " + topologyId + " from " + masterCodeDir);
            downloadTopologys.add(topologyId);
        } else {
            LOG.error("Cann't download code for storm id " + topologyId + " from " + masterCodeDir);
            downloadFailedTopologyIds.add(topologyId);
        }
    }
    // Clearing the directory of a topology is dangerous, so only topologies that were not
    // downloaded before this pass are cleared
    for (String topologyId : downloadFailedTopologyIds) {
        if (!downloadedTopologyIds.contains(topologyId)) {
            try {
                String stormroot = StormConfig.supervisor_stormdist_root(conf, topologyId);
                File destDir = new File(stormroot);
                FileUtils.deleteQuietly(destDir);
            } catch (Exception e) {
                LOG.error("Cann't clear directory about storm id " + topologyId + " on supervisor ", e);
            }
        }
    }
    updateTaskCleanupTimeout(downloadTopologys);
}
/**
 * Removes the local code of every downloaded topology that is not listed in {@code topologyCodes}.
 * A failed removal is logged and does not stop the remaining removals.
 *
 * @param topologyCodes         topology id to master code dir, for the topologies still needed
 * @param downloadedTopologyIds ids of the topologies present locally
 */
public void removeUselessTopology(Map<String, String> topologyCodes, List<String> downloadedTopologyIds) {
    for (String topologyId : downloadedTopologyIds) {
        if (topologyCodes.containsKey(topologyId)) {
            continue;
        }
        LOG.info("Removing code for storm id " + topologyId);
        String stormdistPath = null;
        try {
            stormdistPath = StormConfig.supervisor_stormdist_root(conf, topologyId);
            PathUtils.rmr(stormdistPath);
        } catch (IOException e) {
            LOG.error("rmr the path:" + stormdistPath + "failed\n", e);
        }
    }
}
/**
 * Determines which topologies changed between the previously stored local assignment and the new one.
 * <p>
 * A topology is reported when a port carries the same topology id in both maps and the topology's
 * assignment reports a change relative to the time stamp of the stored local assignment.
 *
 * @param localAssignments previously stored {port, LocalAssignment}, may be null
 * @param zkAssignments    new {port, LocalAssignment}, may be null
 * @param assignments      topology id to assignment
 * @return ids of the changed topologies; empty when either map is null
 */
private Set<String> getUpdateTopologys(Map<Integer, LocalAssignment> localAssignments, Map<Integer, LocalAssignment> zkAssignments,
        Map<String, Assignment> assignments) {
    Set<String> changed = new HashSet<String>();
    if (localAssignments == null || zkAssignments == null) {
        return changed;
    }
    for (Entry<Integer, LocalAssignment> slot : localAssignments.entrySet()) {
        LocalAssignment stored = slot.getValue();
        LocalAssignment fresh = zkAssignments.get(slot.getKey());
        if (stored == null || fresh == null) {
            continue;
        }
        String topologyId = stored.getTopologyId();
        Assignment assignment = assignments.get(topologyId);
        boolean updated = topologyId.equals(fresh.getTopologyId()) && assignment != null
                && assignment.isTopologyChange(stored.getTimeStamp());
        // log only the first time a topology is recorded
        if (updated && changed.add(topologyId)) {
            LOG.info("Topology " + topologyId + " has been updated. LocalTs=" + stored.getTimeStamp() + ", ZkTs="
                    + fresh.getTimeStamp());
        }
    }
    return changed;
}
/**
 * Takes over the set of topologies for which SyncProcessEvent requested a re-download and drops
 * those that currently have a worker starting on one of the locally assigned ports.
 * <p>
 * The pending set held by SyncProcessEvent is reset to null by this call.
 *
 * @param localAssignment previously stored {port, LocalAssignment}
 * @return the remaining topology ids (possibly empty), or {@code null} when nothing was pending
 */
private Set<String> getNeedReDownloadTopologys(Map<Integer, LocalAssignment> localAssignment) {
    Set<String> pending = syncProcesses.getTopologyIdNeedDownload().getAndSet(null);
    if (pending == null || pending.size() == 0) {
        return null;
    }
    Map<Integer, String> startingWorkers = syncProcesses.getPortToWorkerId();
    Set<String> delayed = new HashSet<String>();
    for (Entry<Integer, LocalAssignment> slot : localAssignment.entrySet()) {
        if (startingWorkers.containsKey(slot.getKey())) {
            delayed.add(slot.getValue().getTopologyId());
        }
    }
    LOG.debug("worker is starting on these topology, so delay download topology binary: " + delayed);
    pending.removeAll(delayed);
    if (pending.size() > 0) {
        LOG.info("Following topologys is going to re-download the jars, " + pending);
    }
    return pending;
}
/**
 * Refreshes the task cleanup timeout stored in LocalState for the given topologies.
 * <p>
 * For every topology the timeout is taken from the topology's own conf; when that conf cannot be read or
 * does not define the timeout, the supervisor-level value is used. The topology conf is looked up afresh
 * for each topology, so a failed read never reuses the conf of a previously processed topology.
 * The new values are merged into the map already stored in LocalState (or replace it when none could be
 * read) and written back. Read and write failures of LocalState are logged and not rethrown.
 *
 * @param topologys ids of the topologies whose timeout is refreshed
 */
private void updateTaskCleanupTimeout(Set<String> topologys) {
    Map<String, Integer> taskCleanupTimeouts = new HashMap<String, Integer>();
    for (String topologyId : topologys) {
        // scoped to one iteration on purpose: a failed read must leave it null
        Map topologyConf = null;
        try {
            topologyConf = StormConfig.read_supervisor_topology_conf(conf, topologyId);
        } catch (IOException e) {
            LOG.warn("Failed to read conf for " + topologyId, e);
        }
        Integer cleanupTimeout = null;
        if (topologyConf != null) {
            cleanupTimeout = JStormUtils.parseInt(topologyConf.get(ConfigExtension.TASK_CLEANUP_TIMEOUT_SEC));
        }
        if (cleanupTimeout == null) {
            // fall back to the supervisor-level setting
            cleanupTimeout = ConfigExtension.getTaskCleanupTimeoutSec(conf);
        }
        taskCleanupTimeouts.put(topologyId, cleanupTimeout);
    }
    Map<String, Integer> localTaskCleanupTimeouts = null;
    try {
        localTaskCleanupTimeouts = (Map<String, Integer>) localState.get(Common.LS_TASK_CLEANUP_TIMEOUT);
    } catch (IOException e) {
        LOG.error("Failed to read local task cleanup timeout map", e);
    }
    if (localTaskCleanupTimeouts == null) {
        localTaskCleanupTimeouts = taskCleanupTimeouts;
    } else {
        localTaskCleanupTimeouts.putAll(taskCleanupTimeouts);
    }
    try {
        localState.put(Common.LS_TASK_CLEANUP_TIMEOUT, localTaskCleanupTimeouts);
    } catch (IOException e) {
        LOG.error("Failed to write local task cleanup timeout map", e);
    }
}
/**
 * Refreshes the cached assignments and assignment versions from ZK, in place.
 * <p>
 * For each topology listed in ZK the cached assignment is reused when the recorded version equals the ZK
 * version; otherwise, or when nothing is cached, the assignment is read from ZK. Topologies whose
 * assignment cannot be obtained are logged and left out. Both maps are replaced with the refreshed content;
 * when ZK returns no assignment list at all, both maps are simply cleared.
 *
 * @param assignmentVersion  in/out: topology id to recorded assignment version
 * @param localZkAssignments in/out: topology id to cached assignment
 * @param callback           callback passed to every ZK read
 * @throws Exception propagated from the ZK reads
 */
private void getAllAssignments(Map<String, Integer> assignmentVersion, Map<String, Assignment> localZkAssignments,
        RunnableCallback callback) throws Exception {
    // get /assignments {topology_id}
    List<String> topologyIds = stormClusterState.assignments(callback);
    if (topologyIds == null) {
        assignmentVersion.clear();
        localZkAssignments.clear();
        LOG.debug("No assignment of ZK");
        return;
    }
    Map<String, Assignment> freshAssignments = new HashMap<String, Assignment>();
    Map<String, Integer> freshVersions = new HashMap<String, Integer>();
    for (String topologyId : topologyIds) {
        Integer zkVersion = stormClusterState.assignment_version(topologyId, callback);
        LOG.debug(topologyId + "'s assigment version of zk is :" + zkVersion);
        Integer recordedVersion = assignmentVersion.get(topologyId);
        LOG.debug(topologyId + "'s assigment version of local is :" + recordedVersion);
        boolean versionUnchanged = recordedVersion != null && zkVersion != null && recordedVersion.equals(zkVersion);
        Assignment assignment = versionUnchanged ? localZkAssignments.get(topologyId) : null;
        if (assignment == null) {
            // version differs, or nothing usable is cached: read the assignment from ZK
            assignment = stormClusterState.assignment_info(topologyId, callback);
        }
        if (assignment == null) {
            LOG.error("Failed to get Assignment of " + topologyId + " from ZK");
            continue;
        }
        freshVersions.put(topologyId, zkVersion);
        freshAssignments.put(topologyId, assignment);
    }
    assignmentVersion.clear();
    assignmentVersion.putAll(freshVersions);
    localZkAssignments.clear();
    localZkAssignments.putAll(freshAssignments);
}
}