package com.alipay.bluewhale.core.daemon.supervisor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.log4j.Logger;
import backtype.storm.Config;
import backtype.storm.utils.LocalState;
import backtype.storm.utils.Time;
import com.alipay.bluewhale.core.cluster.Common;
import com.alipay.bluewhale.core.cluster.StormConfig;
import com.alipay.bluewhale.core.daemon.State;
import com.alipay.bluewhale.core.messaging.ZMQContext;
import com.alipay.bluewhale.core.task.LocalAssignment;
import com.alipay.bluewhale.core.utils.PathUtils;
import com.alipay.bluewhale.core.utils.ProcessSimulator;
import com.alipay.bluewhale.core.utils.StormUtils;
import com.alipay.bluewhale.core.utils.TimeUtils;
import com.alipay.bluewhale.core.work.Worker;
import com.alipay.bluewhale.core.work.WorkerShutdown;
import com.alipay.bluewhale.core.work.refresh.WorkerHeartbeat;
/**
 * SyncProcesses
 * (1) shuts down workers whose state is abnormal (not started, timed out, or disallowed);
 * (2) finds out which assigned ports have no valid worker and launches new workers for them.
 */
class SyncProcesses extends ShutdownWork {

    private static Logger LOG = Logger.getLogger(SyncProcesses.class);

    // supervisor-local state store (holds LS_LOCAL_ASSIGNMENTS / LS_APPROVED_WORKERS)
    private LocalState localState;

    // supervisor configuration map
    private Map conf;

    // workerId -> simulated process id; only used by local-mode workers
    private ConcurrentHashMap<String, String> workerThreadPids;

    private String supervisorId;

    // shared ZMQ context passed down to workers launched in local mode
    private ZMQContext sharedContext;

    // private Supervisor supervisor;

    /**
     * @param supervisorId     id of the owning supervisor
     * @param conf             supervisor configuration
     * @param localState       local state store of the supervisor
     * @param workerThreadPids workerId -> pid map for local-mode workers
     * @param sharedContext    shared messaging context
     */
    public SyncProcesses(String supervisorId, Map conf, LocalState localState,
            ConcurrentHashMap<String, String> workerThreadPids,
            ZMQContext sharedContext) {
        this.supervisorId = supervisorId;
        this.conf = conf;
        this.localState = localState;
        this.workerThreadPids = workerThreadPids;
        this.sharedContext = sharedContext;
    }

    /**
     * One synchronization pass: read assignments and worker heartbeats, kill
     * abnormal workers, then launch workers for assigned ports that have no
     * valid worker, and wait for them to start.
     */
    @SuppressWarnings("unchecked")
    @Override
    public void run() {
        /**
         * Step 1: get assigned tasks from localstate
         * Map<port(type Integer), LocalAssignment>
         */
        Map<Integer, LocalAssignment> assignedTasks = null;
        try {
            assignedTasks = (Map<Integer, LocalAssignment>) localState
                    .get(Common.LS_LOCAL_ASSIGNMENTS);
        } catch (IOException e) {
            LOG.error(
                    "Failed to get Common.LS_LOCAL_ASSIGNMENTS from localState\n",
                    e);
        }
        if (assignedTasks == null) {
            assignedTasks = new HashMap<Integer, LocalAssignment>();
        }

        /**
         * Step 2: get allocated workers from local_dir/worker/ids/heartbeat
         * Map<workerid, StateHeartbeat>
         */
        Map<String, StateHeartbeat> allocated = null;
        try {
            allocated = readAllocatedworkers(conf, localState, assignedTasks);
        } catch (IOException e2) {
            // FIX: pass the exception so the stack trace is not lost
            LOG.error("readAllocatedworkers failed", e2);
        }

        /**
         * Step 3: decide which workers to keep; collect the ports they occupy
         */
        Map<String, StateHeartbeat> keepers = null;
        Set<Integer> keepPorts = null;
        if (allocated != null) {
            keepers = new HashMap<String, StateHeartbeat>();
            keepPorts = new HashSet<Integer>();
            for (Entry<String, StateHeartbeat> entry : allocated.entrySet()) {
                String workerid = entry.getKey();
                StateHeartbeat hbstate = entry.getValue();
                if (hbstate.getState().equals(State.valid)) {
                    keepers.put(workerid, hbstate);
                }
                if (hbstate.getHeartbeat() != null) {
                    keepPorts.add(hbstate.getHeartbeat().getPort());
                }
                // kill those in allocated that are dead or disallowed
                if (hbstate.getState() != State.valid) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Shutting down and clearing state for id ");
                    sb.append(workerid);
                    sb.append(";State:");
                    sb.append(hbstate.getState());
                    sb.append(";Heartbeat");
                    sb.append(hbstate.getHeartbeat());
                    LOG.info(sb);
                    try {
                        shutWorker(conf, supervisorId, workerid,
                                workerThreadPids);
                    } catch (IOException e) {
                        String errMsg = "Failed to shutdown worker workId:"
                                + workerid + ",supervisorId: " + supervisorId
                                + ",workerThreadPids:" + workerThreadPids;
                        LOG.error(errMsg, e);
                    }
                }
            }
        }

        /**
         * Step 4: get reassigned tasks: assigned ports that are NOT covered by
         * a kept worker. Map<port(type Integer), LocalAssignment>
         */
        Map<Integer, LocalAssignment> reassignTasks = StormUtils
                .select_keys_pred(keepPorts, assignedTasks);

        /**
         * Step 5: generate new worker ids for the ports to be (re)started.
         * FIX: initialize to an empty map instead of null; previously, when
         * reassignTasks was null, newWorkerIds stayed null and
         * newWorkerIds.values() further below threw a NullPointerException.
         */
        Map<Integer, String> newWorkerIds = new HashMap<Integer, String>();
        if (reassignTasks != null) {
            for (Integer port : reassignTasks.keySet()) {
                String newWorkerId = UUID.randomUUID().toString();
                newWorkerIds.put(port, newWorkerId);
                // create new worker Id directory:
                // LOCALDIR/workers/newworkid/pids
                String path = StormConfig.worker_pids_root(conf, newWorkerId);
                try {
                    PathUtils.local_mkdirs(path);
                } catch (IOException e) {
                    LOG.error("Making dirs at " + path + " failed", e);
                }
            }
        }

        LOG.debug("Syncing processes");
        LOG.debug("Assigned tasks: " + assignedTasks);
        LOG.debug("Allocated: " + allocated);

        /**
         * Step 6: update localstate's LS_APPROVED_WORKERS:
         * kept workers plus the freshly generated ones. Map<WorkerId, port>
         */
        Map<String, Integer> lsApprovedWorkers = null;
        try {
            lsApprovedWorkers = (Map<String, Integer>) localState
                    .get(Common.LS_APPROVED_WORKERS);
        } catch (IOException e) {
            LOG.error("get Common.LS_APPROVED_WORKERS of localState failed", e);
        }
        if (lsApprovedWorkers == null) {
            lsApprovedWorkers = new HashMap<String, Integer>();
        }
        Map<String, Integer> approvedWorkers = new HashMap<String, Integer>();
        if (keepers != null) {
            // retain only the previously approved workers we decided to keep
            for (Entry<String, Integer> entry : lsApprovedWorkers.entrySet()) {
                String keepWorkerId = entry.getKey();
                if (keepers.containsKey(keepWorkerId)) {
                    approvedWorkers.put(keepWorkerId, entry.getValue());
                }
            }
        }
        // newly generated workers are approved as well
        for (Entry<Integer, String> entry : newWorkerIds.entrySet()) {
            approvedWorkers.put(entry.getValue(), entry.getKey());
        }
        try {
            localState.put(Common.LS_APPROVED_WORKERS, approvedWorkers);
        } catch (IOException e1) {
            LOG.error("put Common.LS_APPROVED_WORKERS " + approvedWorkers
                    + " of localState failed", e1);
        }

        /**
         * Step 7: launch the new workers and wait for them to start
         */
        if (reassignTasks != null) {
            for (Entry<Integer, LocalAssignment> entry : reassignTasks
                    .entrySet()) {
                Integer port = entry.getKey();
                LocalAssignment assignment = entry.getValue();
                String workerId = newWorkerIds.get(port);
                StringBuilder sb = new StringBuilder();
                sb.append("Launching worker with assignment ");
                sb.append(assignment.toString());
                sb.append(" for the supervisor ");
                sb.append(supervisorId);
                sb.append(" on port ");
                sb.append(port);
                sb.append(" with id ");
                sb.append(workerId);
                LOG.info(sb);
                try {
                    String clusterMode = StormConfig.cluster_mode(conf);
                    if (clusterMode.equals("distributed")) {
                        launchWorker(conf, sharedContext,
                                assignment.getTopologyId(), supervisorId, port,
                                workerId);
                    } else if (clusterMode.equals("local")) {
                        // local mode runs the worker inside this JVM;
                        // in fact, this is no use
                        launchWorker(conf, sharedContext,
                                assignment.getTopologyId(), supervisorId, port,
                                workerId, workerThreadPids);
                    }
                } catch (Exception e) {
                    String errorMsg = "Failed to launchWorker workerId:"
                            + workerId + ":" + port;
                    LOG.error(errorMsg, e);
                }
            }
        }

        /**
         * FIXME, workerIds should be Set, not Collection, but here simplify
         * the logic
         */
        Collection<String> workerIds = newWorkerIds.values();
        try {
            waitForWorkersLaunch(conf, workerIds);
        } catch (IOException e) {
            LOG.error("waitForWorkersLaunch failed", e);
        } catch (InterruptedException e) {
            // FIX: restore the interrupt flag so callers can observe it
            Thread.currentThread().interrupt();
            LOG.error("waitForWorkersLaunch interrupted", e);
        }
    }

    /**
     * wait for all workers of the supervisor to launch
     *
     * @param conf
     * @param workerIds
     * @throws InterruptedException
     * @throws IOException
     * @pdOid 52b11418-7474-446d-bff5-0ecd68f4954f
     */
    public void waitForWorkersLaunch(Map conf, Collection<String> workerIds)
            throws IOException, InterruptedException {
        // one shared start time: the timeout applies to the whole batch
        int startTime = TimeUtils.current_time_secs();
        for (String workerId : workerIds) {
            waitForWorkerLaunch(conf, workerId, startTime);
        }
    }

    /**
     * wait for a single worker to write its first heartbeat; gives up after
     * SUPERVISOR_WORKER_START_TIMEOUT_SECS and logs the failure
     *
     * @param conf
     * @param workerId
     * @param startTime
     * @throws IOException
     * @throws InterruptedException
     * @pdOid f0a6ab43-8cd3-44e1-8fd3-015a2ec51c6a
     */
    public void waitForWorkerLaunch(Map conf, String workerId, int startTime)
            throws IOException, InterruptedException {
        LocalState ls = StormConfig.worker_state(conf, workerId);
        while (true) {
            WorkerHeartbeat whb = (WorkerHeartbeat) ls
                    .get(Common.LS_WORKER_HEARTBEAT);
            if (whb == null
                    && ((TimeUtils.current_time_secs() - startTime) < (Integer) conf
                            .get(Config.SUPERVISOR_WORKER_START_TIMEOUT_SECS))) {
                // FIX: added missing space in the log message
                LOG.info(workerId + " still hasn't started");
                Time.sleep(500);
            } else {
                // whb is valid or timeout
                break;
            }
        }
        WorkerHeartbeat whb = (WorkerHeartbeat) ls
                .get(Common.LS_WORKER_HEARTBEAT);
        if (whb == null) {
            // FIX: added missing space in the log message
            LOG.info("Worker " + workerId + " failed to start");
        }
    }

    /**
     * classify every locally known worker by comparing its heartbeat against
     * the approved-worker list and the current assignments
     *
     * @return Map<workerid, StateHeartbeat> where the state is one of
     *         notStarted / disallowed / timedOut / valid
     * @param conf
     * @param localState
     * @param assignedTasks
     * @throws IOException
     * @pdOid 11c9bebb-d082-4c51-b323-dd3d5522a649
     */
    @SuppressWarnings("unchecked")
    public Map<String, StateHeartbeat> readAllocatedworkers(Map conf,
            LocalState localState, Map<Integer, LocalAssignment> assignedTasks)
            throws IOException {
        Map<String, StateHeartbeat> workeridHbstate = null;
        int now = TimeUtils.current_time_secs();
        /**
         * Get approved workerIds from local_dir/supervisor/localstate
         * Map<WorkerId, port>
         */
        Map<String, Integer> approvedIds = (Map<String, Integer>) localState
                .get(Common.LS_APPROVED_WORKERS);
        /**
         * Get Map<workerId, WorkerHeartbeat> from
         * local_dir/worker/ids/heartbeat
         */
        Map<String, WorkerHeartbeat> idToHeartbeat = readWorkerHeartbeats(conf);
        if (idToHeartbeat != null) {
            workeridHbstate = new HashMap<String, StateHeartbeat>();
            for (Map.Entry<String, WorkerHeartbeat> entry : idToHeartbeat
                    .entrySet()) {
                String workerid = entry.getKey();
                WorkerHeartbeat whb = entry.getValue();
                State state = null;
                if (whb == null) {
                    // worker directory exists but no heartbeat written yet
                    state = State.notStarted;
                } else if (approvedIds == null
                        || approvedIds.containsKey(workerid) == false
                        || matchesAssignment(whb, assignedTasks) == false) {
                    // workerId isn't approved or isn't assigned task
                    state = State.disallowed;
                } else if ((now - whb.getTimeSecs()) > (Integer) conf
                        .get(Config.SUPERVISOR_WORKER_TIMEOUT_SECS)) {
                    // heartbeat too old
                    state = State.timedOut;
                } else {
                    state = State.valid;
                }
                LOG.debug("Worker:" + workerid + " state:" + state
                        + " WorkerHeartbeat: " + whb
                        + " at supervisor time-secs " + now);
                workeridHbstate.put(workerid, new StateHeartbeat(state, whb));
            }
        }
        return workeridHbstate;
    }

    /**
     * check whether the worker heartbeat matches the assignment on its port:
     * same topology id and same task ids
     *
     * @param whb
     *            : WorkerHeartbeat
     * @param assignedTasks
     * @return true if the assignment (LS_LOCAL_ASSIGNMENTS) matches the
     *         worker heartbeat, false otherwise
     */
    public boolean matchesAssignment(WorkerHeartbeat whb,
            Map<Integer, LocalAssignment> assignedTasks) {
        boolean isMatch = true;
        LocalAssignment localAssignment = assignedTasks.get(whb.getPort());
        if (localAssignment == null) {
            isMatch = false;
        } else if (!whb.getTopologyId().equals(localAssignment.getTopologyId())) {
            // topology id not equal
            LOG.info("topology id not equal whb=" + whb.getTopologyId()
                    + ",localAssignment=" + localAssignment.getTopologyId());
            isMatch = false;
        } else if (!(whb.getTaskIds().equals(localAssignment.getTaskIds()))) {
            // task-id isn't equal
            LOG.info("task-id isn't equal whb=" + whb.getTaskIds()
                    + ",localAssignment=" + localAssignment.getTaskIds());
            isMatch = false;
        }
        return isMatch;
    }

    /**
     * get all workers' heartbeats of the supervisor
     *
     * @param conf
     * @return Map<workerId, WorkerHeartbeat>; a value may be null when a
     *         worker directory exists but no heartbeat has been written
     * @throws IOException
     */
    public Map<String, WorkerHeartbeat> readWorkerHeartbeats(Map conf)
            throws IOException {
        Map<String, WorkerHeartbeat> workerHeartbeats = null;
        // get the path: STORM-LOCAL-DIR/workers
        String path = StormConfig.worker_root(conf);
        List<String> workerIds = PathUtils.read_dir_contents(path);
        if (workerIds != null) {
            workerHeartbeats = new HashMap<String, WorkerHeartbeat>();
            for (String workerId : workerIds) {
                WorkerHeartbeat whb = readWorkerHeartbeat(conf, workerId);
                // this place whb can be null
                workerHeartbeats.put(workerId, whb);
            }
        }
        return workerHeartbeats;
    }

    /**
     * get worker heartbeat by workerid
     *
     * @param conf
     * @param workerId
     * @return WorkerHeartbeat, or null when none has been written yet
     * @throws IOException
     */
    public WorkerHeartbeat readWorkerHeartbeat(Map conf, String workerId)
            throws IOException {
        LocalState ls = StormConfig.worker_state(conf, workerId);
        return (WorkerHeartbeat) ls.get(Common.LS_WORKER_HEARTBEAT);
    }

    /**
     * launch a worker in local mode: the worker runs as threads inside this
     * JVM and is tracked through ProcessSimulator
     *
     * @param conf
     * @param sharedcontext
     * @param stormId
     * @param supervisorId
     * @param port
     * @param workerId
     * @param workerThreadPidsAtom
     * @pdOid 405f44c7-bc1b-4e16-85cc-b59352b6ff5d
     */
    @Deprecated
    public void launchWorker(Map conf, ZMQContext sharedcontext,
            String stormId, String supervisorId, Integer port, String workerId,
            ConcurrentHashMap<String, String> workerThreadPidsAtom)
            throws Exception {
        String pid = UUID.randomUUID().toString();
        WorkerShutdown worker = Worker.mk_worker(conf, sharedcontext, stormId,
                supervisorId, port, workerId);
        ProcessSimulator.registerProcess(pid, worker);
        workerThreadPidsAtom.put(workerId, pid);
    }

    /**
     * launch a worker in distributed mode: build the java command line
     * (classpath, child opts, log file), kill any stale process still holding
     * the port, then fork the worker process
     *
     * @param conf
     * @param sharedcontext
     * @param topologyId
     * @param supervisorId
     * @param port
     * @param workerId
     * @throws IOException
     * @pdOid 6ea369dd-5ce2-4212-864b-1f8b2ed94abb
     */
    public void launchWorker(Map conf, ZMQContext sharedcontext,
            String topologyId, String supervisorId, Integer port,
            String workerId) throws IOException {
        // STORM-LOCAL-DIR/supervisor/stormdist/topologyId
        String stormroot = StormConfig.supervisor_stormdist_root(conf,
                topologyId);
        // STORM-LOCAL-DIR/supervisor/stormdist/topologyId/stormjar.jar
        String stormjar = StormConfig.supervisor_stormjar_path(stormroot);
        // get supervisor conf
        Map stormConf = StormConfig
                .read_supervisor_storm_conf(conf, topologyId);
        // build the classpath: current JVM classpath plus the topology jar,
        // minus any entries matching the configured exclude regex
        String[] classpath = (StormUtils.current_classpath() + ":" + stormjar)
                .split(":");
        String excludeRegex = (String) stormConf
                .get("worker.classpath.exclude");
        ArrayList<String> finalclasspath = new ArrayList<String>();
        for (String s : classpath) {
            if (excludeRegex == null || !s.matches(excludeRegex)) {
                finalclasspath.add(s);
            }
        }
        StringBuilder classpathBuffer = new StringBuilder();
        String joinchar = "";
        for (String s : finalclasspath) {
            classpathBuffer.append(joinchar);
            classpathBuffer.append(s);
            joinchar = ":";
        }
        // get child process parameters; each later, more specific setting
        // replaces the previous one (topology-level wins over cluster-level,
        // per-port wins over general).
        // NOTE(review): the " " prefix below suggests appending may have been
        // intended, but the code overrides - confirm before changing
        String childopts = "";
        if (conf.get(Config.WORKER_CHILDOPTS) != null) {
            childopts = "" + conf.get(Config.WORKER_CHILDOPTS);
        }
        if (conf.get(Config.WORKER_CHILDOPTS + "." + port) != null) {
            childopts = "" + conf.get(Config.WORKER_CHILDOPTS + "." + port);
        }
        if (stormConf.get(Config.TOPOLOGY_WORKER_CHILDOPTS) != null) {
            childopts = " " + stormConf.get(Config.TOPOLOGY_WORKER_CHILDOPTS);
        }
        if (stormConf.get(Config.TOPOLOGY_WORKER_CHILDOPTS + "." + port) != null) {
            childopts = " "
                    + stormConf.get(Config.TOPOLOGY_WORKER_CHILDOPTS + "."
                            + port);
        }
        String stormhome = System.getProperty("storm.home");
        if (stormhome == null) {
            stormhome = ".";
        }
        // substitute the placeholders (%ID%, %port%, %storm.home%) supported
        // in child opts
        childopts = childopts.replace("%ID%", port.toString());
        childopts = childopts.replaceAll("%port%", port.toString());
        childopts = childopts.replaceAll("%storm.home%", stormhome);
        String logFileName = "worker-" + port + ".log";
        StringBuilder commandSB = new StringBuilder();
        // the command string is later split on whitespace by the launcher
        commandSB.append("java -server ");
        commandSB.append(childopts);
        commandSB.append(" -Djava.library.path=");
        commandSB.append((String) conf.get(Config.JAVA_LIBRARY_PATH));
        commandSB.append(" -Dlogfile.name=");
        commandSB.append(logFileName);
        commandSB.append(" -Dstorm.home=");
        commandSB.append(stormhome);
        commandSB.append(" -Dlog4j.configuration=storm.log.properties");
        commandSB.append(" -cp ");
        commandSB.append(classpathBuffer.toString());
        commandSB.append(" com.alipay.bluewhale.core.work.Worker ");
        commandSB.append(topologyId);
        commandSB.append(" ");
        commandSB.append(supervisorId);
        commandSB.append(" ");
        commandSB.append(port);
        commandSB.append(" ");
        commandSB.append(workerId);
        LOG.info("Launching worker with command: " + commandSB);
        Map<String, String> environment = new HashMap<String, String>();
        environment.put("LD_LIBRARY_PATH",
                (String) conf.get(Config.JAVA_LIBRARY_PATH));
        // best effort: kill any stale processes still bound to the worker
        // port before launching; never let cleanup failure block the launch
        try {
            ArrayList<String> killlist = findByJavaPort.findProcess(port);
            if (killlist != null) {
                for (int i = 0; i < 3; i++) {
                    for (String pid : killlist) {
                        StormUtils.ensure_process_killed(Integer.parseInt(pid));
                    }
                    Thread.sleep(300);
                }
            }
        } catch (Throwable e) {
            LOG.error("killlist", e);
        }
        StormUtils.launch_work_process(commandSB.toString(), environment);
    }
}