/**
* CopyRight by Chinamobile
*
* WorkerManager.java
*/
package com.chinamobile.bcbsp.workermanager;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSError;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.RunJar;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import com.chinamobile.bcbsp.Constants;
import com.chinamobile.bcbsp.BSPConfiguration;
import com.chinamobile.bcbsp.fault.storage.Fault;
import com.chinamobile.bcbsp.fault.storage.Fault.Level;
import org.apache.log4j.LogManager;
import com.chinamobile.bcbsp.rpc.ControllerProtocol;
import com.chinamobile.bcbsp.rpc.WorkerManagerProtocol;
import com.chinamobile.bcbsp.sync.SuperStepReportContainer;
import com.chinamobile.bcbsp.util.StaffAttemptID;
import com.chinamobile.bcbsp.util.BSPJobID;
import com.chinamobile.bcbsp.util.BSPJob;
import com.chinamobile.bcbsp.util.StaffStatus;
import com.chinamobile.bcbsp.util.ClassLoaderUtil;
import com.chinamobile.bcbsp.bspcontroller.BSPController;
import com.chinamobile.bcbsp.action.*;
import com.chinamobile.bcbsp.bspstaff.BSPStaffRunner;
import com.chinamobile.bcbsp.bspstaff.Staff;
import com.chinamobile.bcbsp.bspstaff.BSPStaff.WorkerAgentForStaffInterface;
/**
* A WorkerManager is a process that manages staffs assigned by the
* BSPController. Each WorkerManager contacts the BSPController, and it takes
* assigned staffs and reports its status by means of periodical heart beats
* with BSPController. Each WorkerManager is designed to run with HDFS or other
* distributed storages. Basically, a WorkerManager and a data node should be
* run on one physical node.
*
* @author
* @version
*/
public class WorkerManager implements Runnable, WorkerManagerProtocol,
WorkerAgentProtocol {
// Logger shared by the WorkerManager and its nested classes.
private static final Log LOG = LogFactory.getLog(WorkerManager.class);
// Pause, in milliseconds, between two status reports in offerService();
// read from Constants.HEART_BEAT_INTERVAL in initialize() (fallback 1000).
private volatile static int HEART_BEAT_INTERVAL;
// Size above which the finishedStaffs / finishedJobs caches are cleared.
private static int CACHE_QUEUE_LENGTH = 20;
// Configuration handed to the constructor; initialize() also writes the
// resolved host name back into it.
private Configuration conf;
// Constants
// Result type of offerService(). In this file offerService() only ever
// returns NORMAL, while run() additionally checks for STALE and DENIED.
static enum State {
NORMAL, COMPUTE, SYNC, BARRIER, STALE, INTERRUPTED, DENIED
};
// Running States and its related things
// Set at the end of initialize(), cleared by close().
volatile boolean initialized = false;
// Loop condition of run() and offerService(); cleared by close().
volatile boolean running = true;
// Set by shutdown() so that run() returns instead of re-initializing.
volatile boolean shuttingDown = false;
// True until initFileSystem() has completed once without throwing.
private boolean justInited = true;
// Attributes
// Host name of this worker: the configured RPC host or, failing that, the
// default host resolved through DNS in initialize().
private String workerManagerName;
// Address of the BSPController; stays null when the configured controller
// address is "local".
private InetSocketAddress bspControllerAddr;
// FileSystem
// System directory reported by the controller and the file system it lives
// on; both are set in initFileSystem().
private Path systemDirectory = null;
private FileSystem systemFS = null;
// Job
// Sent to the controller in every WorkerManagerStatus; not modified anywhere
// in the visible code.
private int failures;
// Read from Constants.BC_BSP_WORKERMANAGER_MAXSTAFFS in initialize() (fallback 1).
private int maxStaffsCount = 0;
// Incremented in StaffInProgress.launchStaff(), decremented in
// updateStaffStatistics().
private Integer currentStaffsCount = 0;
// Incremented once per call of updateStaffStatistics().
private int finishedStaffsCount = 0;
// Faults sent with each report; cleared after the report in offerService().
private List<Fault> workerFaultList = null;
// Statuses collected for the next report; rebuilt on every offerService() iteration.
private List<StaffStatus> reportStaffStatusList = null;
// Staffs launched on this worker keyed by attempt id; offerService() moves
// finished entries into finishedStaffs.
private Map<StaffAttemptID, StaffInProgress> runningStaffs = null;
private Map<StaffAttemptID, StaffInProgress> finishedStaffs = null;
// Jobs with staffs on this worker; updateStaffStatistics() moves a job into
// finishedJobs once its staff counter reaches 0.
private Map<BSPJobID, RunningJob> runningJobs = null;
private Map<BSPJobID, RunningJob> finishedJobs = null;
// One WorkerAgentForJob per running job; created in addStaffToJob().
private Map<BSPJobID, WorkerAgentForJob> runningJobtoWorkerAgent = null;
// "host:port" of workerServer, as sent to the controller.
private String rpcServer;
// RPC server started in initialize() on the BC_BSP_WORKERMANAGER_RPC_HOST/PORT address.
private Server workerServer;
// Proxy to the BSPController used for register(), report() and getSystemDir().
private ControllerProtocol controllerClient;
// Listener address of staffReportServer.
private InetSocketAddress staffReportAddress;
// RPC server started in initialize() on the BC_BSP_WORKERMANAGER_REPORT_ADDRESS address.
private Server staffReportServer = null;
// Job ids added through addFailedJob() and dropped by clearFailedJobList().
private ArrayList<BSPJobID> failedJobList = new ArrayList<BSPJobID>();
// For current free port counter. It will travel around 60001~65535
private int currentFreePort = 60000;
/**
 * Creates a WorkerManager that works on the given configuration.
 * Unless the configured controller address is the literal "local", the
 * address of the BSPController is resolved right away.
 *
 * @param conf configuration of this worker
 * @throws IOException declared for callers; see BSPController.getAddress
 */
public WorkerManager(Configuration conf) throws IOException {
    LOG.info("worker start");
    this.conf = conf;
    final String controllerAddress = conf
            .get(Constants.BC_BSP_CONTROLLER_ADDRESS);
    if (controllerAddress.equals("local")) {
        // local mode: no controller address to resolve
        return;
    }
    this.bspControllerAddr = BSPController.getAddress(conf);
}
/**
 * Brings this worker into service: resolves its host name, verifies and
 * cleans the local directories, creates the bookkeeping collections, starts
 * the two RPC servers, connects to the BSPController and registers there.
 *
 * @throws IOException if no local directory is usable (DiskErrorException)
 *         or the controller refuses the registration
 * @throws IllegalArgumentException if the worker RPC address is unusable
 */
@SuppressWarnings("static-access")
public synchronized void initialize() throws IOException {
// Prefer the explicitly configured RPC host as the worker name ...
if (this.conf.get(Constants.BC_BSP_WORKERMANAGER_RPC_HOST) != null) {
this.workerManagerName = conf
.get(Constants.BC_BSP_WORKERMANAGER_RPC_HOST);
}
// ... and fall back to the default host resolved through DNS.
if (this.workerManagerName == null) {
this.workerManagerName = DNS.getDefaultHost(
conf.get("bsp.dns.interface", "default"),
conf.get("bsp.dns.nameserver", "default"));
}
// check local disk
checkLocalDirs(conf.getStrings(Constants.BC_BSP_LOCAL_DIRECTORY));
// remove the "workerManager" sub-directory from every local directory
deleteLocalFiles("workerManager");
// (re)create all bookkeeping collections
this.workerFaultList = new ArrayList<Fault>();
this.reportStaffStatusList = new ArrayList<StaffStatus>();
this.runningStaffs = new ConcurrentHashMap<StaffAttemptID, StaffInProgress>();
this.finishedStaffs = new ConcurrentHashMap<StaffAttemptID, StaffInProgress>();
this.runningJobs = new ConcurrentHashMap<BSPJobID, RunningJob>();
this.finishedJobs = new ConcurrentHashMap<BSPJobID, RunningJob>();
this.runningJobtoWorkerAgent = new ConcurrentHashMap<BSPJobID, WorkerAgentForJob>();
// publish the resolved host name through the configuration
this.conf
.set(Constants.BC_BSP_WORKERAGENT_HOST, this.workerManagerName);
this.conf.set(Constants.BC_BSP_WORKERMANAGER_RPC_HOST,
this.workerManagerName);
this.maxStaffsCount = conf.getInt(
Constants.BC_BSP_WORKERMANAGER_MAXSTAFFS, 1);
this.HEART_BEAT_INTERVAL = conf.getInt(Constants.HEART_BEAT_INTERVAL,
1000);
LOG.info("The max number of staffs is : " + this.maxStaffsCount);
int rpcPort = -1;
String rpcAddr = null;
// The worker RPC server is only created while not yet initialized.
if (false == this.initialized) {
rpcAddr = conf.get(Constants.BC_BSP_WORKERMANAGER_RPC_HOST,
Constants.DEFAULT_BC_BSP_WORKERMANAGER_RPC_HOST);
rpcPort = conf
.getInt(Constants.BC_BSP_WORKERMANAGER_RPC_PORT, 5000);
if (-1 == rpcPort || null == rpcAddr)
throw new IllegalArgumentException("Error rpc address "
+ rpcAddr + " port" + rpcPort);
this.workerServer = RPC.getServer(this, rpcAddr, rpcPort, conf);
this.workerServer.start();
this.rpcServer = rpcAddr + ":" + rpcPort;
LOG.info("Worker rpc server --> " + rpcServer);
}
String address = conf
.get(Constants.BC_BSP_WORKERMANAGER_REPORT_ADDRESS);
InetSocketAddress socAddr = NetUtils.createSocketAddr(address);
String bindAddress = socAddr.getHostName();
int tmpPort = socAddr.getPort();
// RPC initialization
this.staffReportServer = RPC.getServer(this, bindAddress, tmpPort, 10,
false, this.conf);
this.staffReportServer.start();
// get the assigned address
this.staffReportAddress = staffReportServer.getListenerAddress();
LOG.info("WorkerManager up at: " + this.staffReportAddress);
DistributedCache.purgeCache(this.conf);
// establish the communication link to bsp master
this.controllerClient = ( ControllerProtocol ) RPC.waitForProxy(
ControllerProtocol.class, ControllerProtocol.versionID,
bspControllerAddr, conf);
// enroll in bsp controller
// NOTE(review): when this method runs while 'initialized' is still true,
// rpcPort and rpcAddr keep their -1 / null defaults from above, so this
// check throws. Confirm whether re-initialization is meant to be supported.
if (-1 == rpcPort || null == rpcAddr) {
throw new IllegalArgumentException("Error rpc address " + rpcAddr
+ " port" + rpcPort);
}
if (!this.controllerClient.register(new WorkerManagerStatus(
workerManagerName, cloneAndResetRunningStaffStatuses(),
maxStaffsCount, currentStaffsCount, finishedStaffsCount,
failures, this.rpcServer))) {
LOG.error("There is a problem in establishing communication"
+ " link with BSPController");
throw new IOException("There is a problem in establishing"
+ " communication link with BSPController.");
}
this.running = true;
this.initialized = true;
}
/**
 * Returns the listener address of the staff report RPC server.
 *
 * @return the address stored by initialize(), or null before that
 */
public synchronized InetSocketAddress getStaffTrackerReportAddress() {
    return this.staffReportAddress;
}
/**
 * Executes the actions of a directive sent by the BSPController.
 * <p>
 * The method returns as soon as one action has been handled without an
 * IOException: {@code true} after a launch action, {@code false} after a
 * kill action. When an action fails with an IOException, the failure is
 * logged (and, for a launch action whose staff is already registered,
 * recorded as a DISK fault) and the next action is tried.
 *
 * @param jobId id of the job the directive belongs to (not used here)
 * @param directive directive carrying the actions
 * @param recovery whether the staff is launched for recovery; in that case
 *        the local directory of the attempt is removed first
 * @param changeWorkerState forwarded to the new staff
 * @param failCounter forwarded to the new staff
 * @return {@code true} if a staff launch was started, {@code false} otherwise
 */
@Override
public boolean dispatch(BSPJobID jobId, Directive directive,
        boolean recovery, boolean changeWorkerState, int failCounter) {
    // update tasks status
    WorkerManagerAction[] actions = directive.getActions();
    LOG.info("Got Response from BSPController with "
            + ((actions != null) ? actions.length : 0) + " actions");
    if (actions == null) {
        return false;
    }
    // perform actions
    for (WorkerManagerAction action : actions) {
        try {
            if (action instanceof LaunchStaffAction) {
                LaunchStaffAction launchAction = (LaunchStaffAction) action;
                if (recovery) {
                    String localPath = conf
                            .get(Constants.BC_BSP_LOCAL_DIRECTORY) // /tmp/bcbsp/local
                            + "/workerManager";
                    LOG.info("if(recovery == true)" + " " + localPath);
                    // drop what a previous run of this attempt left behind
                    Path staleDir = new Path(localPath, launchAction
                            .getStaff().getStaffAttemptId().toString());
                    FileSystem localFs = FileSystem.getLocal(conf);
                    if (localFs.exists(staleDir)) {
                        localFs.delete(staleDir, true);
                    }
                }
                startNewStaff(launchAction, directive, recovery,
                        changeWorkerState, failCounter);
                return true;
            } else {
                KillStaffAction killAction = (KillStaffAction) action;
                // single lookup: the entry may be removed concurrently by
                // offerService() between a containsKey() and a get()
                StaffInProgress sip = runningStaffs.get(killAction
                        .getStaffID());
                if (sip != null) {
                    sip.staffStatus.setRunState(StaffStatus.State.KILLED);
                    sip.killAndCleanup(true);
                } else {
                    LOG.warn(killAction.getStaffID()
                            + " is not in the runningStaffs "
                            + "and the kill action is invalid.");
                }
                return false;
            }
        } catch (IOException e) {
            LOG.error("Exception has been catched in WorkerManager--dispatch !", e);
            if (!(action instanceof LaunchStaffAction)) {
                // a failed kill has no launch action to report a fault for
                continue;
            }
            StaffAttemptID attemptId = ((LaunchStaffAction) action)
                    .getStaff().getStaffAttemptId();
            StaffInProgress sip = runningStaffs.get(attemptId);
            if (sip == null) {
                // the staff never got registered, nothing to mark as faulty
                LOG.error(attemptId
                        + " is not in the runningStaffs, its fault cannot be recorded");
                continue;
            }
            sip.getStatus().setStage(0); // convenient for the call in controller
            sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT,
                    new Fault(Fault.Type.DISK, Level.WARNING, sip
                            .getStatus().getGroomServer(),
                            "IOException happened", sip.getStatus()
                                    .getJobId().toString(), sip
                                    .getStatus().getStaffId().toString()));
        }
    }
    return false;
}
/**
 * Verifies the configured local directories with DiskChecker.
 * Directories that fail the check are logged and skipped.
 *
 * @param localDirs directories to check, may be null
 * @throws DiskErrorException if not a single directory passed the check
 */
private static void checkLocalDirs(String[] localDirs)
        throws DiskErrorException {
    boolean anyUsable = false;
    if (localDirs != null) {
        for (String dir : localDirs) {
            try {
                DiskChecker.checkDir(new File(dir));
                LOG.info("Local System is Normal : " + dir);
                anyUsable = true;
            } catch (DiskErrorException e) {
                LOG.error("BSP Processor local", e);
            }
        }
    }
    if (anyUsable) {
        return;
    }
    throw new DiskErrorException("all local directories are not writable");
}
/**
 * Reads the local directories from the configuration.
 *
 * @return the values of Constants.BC_BSP_LOCAL_DIRECTORY as returned by
 *         Configuration.getStrings
 */
public String[] getLocalDirs() {
    final String[] configuredDirs = this.conf
            .getStrings(Constants.BC_BSP_LOCAL_DIRECTORY);
    return configuredDirs;
}
/**
 * Recursively deletes every configured local directory.
 * Does nothing when no local directory is configured.
 *
 * @throws IOException declared for callers; not thrown by this method itself
 */
public void deleteLocalFiles() throws IOException {
    String[] localDirs = getLocalDirs();
    if (localDirs == null) {
        // nothing configured, so nothing to clean up
        return;
    }
    for (String localDir : localDirs) {
        deleteLocalDir(new File(localDir));
    }
}
/**
 * Recursively deletes a directory and everything below it.
 * Calls with {@code null}, a missing path or a regular file are ignored.
 * Entries that cannot be removed are logged and skipped.
 *
 * @param dir directory to delete
 */
public void deleteLocalDir(File dir) {
    if (dir == null || !dir.exists() || !dir.isDirectory()) {
        return;
    }
    // listFiles() returns null when the directory cannot be read
    File[] children = dir.listFiles();
    if (children != null) {
        for (File child : children) {
            if (child.isFile()) {
                if (!child.delete()) { // delete the file
                    LOG.warn("Failed to delete local file " + child);
                }
            } else if (child.isDirectory()) {
                deleteLocalDir(child); // recursive delete the subdir
            }
        }
    }
    if (!dir.delete()) { // delete the root dir
        LOG.warn("Failed to delete local directory " + dir);
    }
}
/**
 * Recursively deletes the given sub-directory from every configured local
 * directory. When no local directory is configured, this is logged and
 * nothing is deleted.
 *
 * @param subdir name of the sub-directory to remove
 * @throws IOException if the local file system fails to delete a path
 */
public void deleteLocalFiles(String subdir) throws IOException {
    String[] localDirs = getLocalDirs();
    if (localDirs == null) {
        // explicit check instead of catching the NullPointerException
        LOG.error("[deleteLocalFiles] no local directory is configured");
        return;
    }
    FileSystem localFs = FileSystem.getLocal(this.conf);
    for (String localDir : localDirs) {
        localFs.delete(new Path(localDir, subdir), true);
    }
}
/**
 * Removes the local working data of this worker by delegating to
 * {@link #deleteLocalFiles()}.
 *
 * @throws IOException if the deletion fails
 */
public void cleanupStorage() throws IOException {
    this.deleteLocalFiles();
}
/**
 * Hook called once from run() after the file system is set up.
 * Currently no cleanup thread is started.
 *
 * @throws IOException never thrown by the current implementation
 */
private void startCleanupThreads() throws IOException {
    // intentionally empty
}
/**
 * Updates the counters after a staff of the given job left the running
 * state. When the last staff of a job is gone, the job is moved from
 * runningJobs to finishedJobs and its WorkerAgentForJob is closed.
 * <p>
 * A job that is unknown to runningJobs is logged and skipped, so that the
 * caller can still remove the staff from its own bookkeeping.
 *
 * @param jobId id of the job the finished staff belongs to
 * @throws Exception if closing the WorkerAgentForJob fails
 */
public void updateStaffStatistics(BSPJobID jobId) throws Exception {
    // Lock on the manager: the counter itself is a boxed Integer whose
    // identity changes with every update, so it cannot serve as a monitor.
    synchronized (this) {
        currentStaffsCount--;
    }
    finishedStaffsCount++;
    if (finishedStaffs.size() > CACHE_QUEUE_LENGTH) {
        finishedStaffs.clear();
    }
    synchronized (runningJobs) {
        RunningJob rJob = runningJobs.get(jobId);
        if (rJob == null) {
            LOG.warn(jobId + " is not in the runningJobs, skip its staff counter");
            return;
        }
        int counter = rJob.getStaffCounter();
        if (counter > 0) {
            rJob.setStaffCounter(counter - 1);
        }
        if (rJob.getStaffCounter() == 0) {
            if (finishedJobs.size() > CACHE_QUEUE_LENGTH) {
                finishedJobs.clear();
            }
            finishedJobs.put(jobId, runningJobs.remove(jobId));
            // the agent is missing when its creation failed in addStaffToJob()
            WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobId);
            if (agent != null) {
                agent.close();
                runningJobtoWorkerAgent.remove(jobId);
            }
        }
    }
}
/**
 * Heartbeat loop of the worker. While the worker is running and not shutting
 * down, every iteration walks over runningStaffs, retires staffs in a
 * terminal state, reports the collected statuses and pending faults to the
 * BSPController and then sleeps for HEART_BEAT_INTERVAL milliseconds.
 *
 * @return always State.NORMAL, once the loop condition no longer holds
 * @throws Exception whatever updateStaffStatistics() throws
 */
public State offerService() throws Exception {
while (running && !shuttingDown) {// && !upRecoveryThreshold
try {
// rebuild the list of statuses to report from scratch
this.reportStaffStatusList.clear();
Iterator<Entry<StaffAttemptID, StaffInProgress>> runningStaffsIt = runningStaffs
.entrySet().iterator();
Entry<StaffAttemptID, StaffInProgress> entry;
while (runningStaffsIt.hasNext()) {
entry = runningStaffsIt.next();
switch (entry.getValue().getStatus().getRunState()) {
case COMMIT_PENDING:
case UNASSIGNED:
// TODO : Do nothing now.
break;
case RUNNING:
// only report, the staff stays in runningStaffs
this.reportStaffStatusList.add(entry.getValue()
.getStatus());
break;
case SUCCEEDED:
// not reported; just moved to finishedStaffs
updateStaffStatistics(entry.getValue().getStatus()
.getJobId());
runningStaffsIt.remove();
finishedStaffs
.put(entry.getKey(), entry.getValue());
LOG.info(entry.getKey()
+ " has succeed and been removed from the runningStaffs");
break;
case FAULT:
// stop a still running process before reporting the fault
if (entry.getValue().runner.isAlive()) {
entry.getValue().getStatus()
.setPhase(StaffStatus.Phase.CLEANUP);
entry.getValue().runner.kill();
}
this.reportStaffStatusList.add(entry.getValue()
.getStatus());
updateStaffStatistics(entry.getValue().getStatus()
.getJobId());
runningStaffsIt.remove();
finishedStaffs
.put(entry.getKey(), entry.getValue());
LOG.error(entry.getKey()
+ " is fault and has been removed from the runningStaffs");
break;
case STAFF_RECOVERY:
// TODO : Do nothing now.
break;
case WORKER_RECOVERY:
// TODO : Do nothing now.
break;
case FAILED:
// TODO : Do nothing now.
break;
case KILLED:
updateStaffStatistics(entry.getValue().getStatus()
.getJobId());
runningStaffsIt.remove();
finishedStaffs
.put(entry.getKey(), entry.getValue());
LOG.warn(entry.getKey()
+ " has been killed manually and removed from the runningStaffs");
break;
case FAILED_UNCLEAN:
// TODO : Do nothing now.
break;
case KILLED_UNCLEAN:
// TODO : This staff should be report and request
// the cleanup task in the future.
updateStaffStatistics(entry.getValue().getStatus()
.getJobId());
runningStaffsIt.remove();
finishedStaffs
.put(entry.getKey(), entry.getValue());
LOG.warn(entry.getKey()
+ " has been killed manually and removed from the runningStaffs");
break;
default:
LOG.error("Unknown StaffStatus.State: "
+ entry.getValue().getStatus()
.getRunState());
}
}
// snapshot of this worker for the controller
WorkerManagerStatus gss = new WorkerManagerStatus(
this.workerManagerName, this.reportStaffStatusList,
maxStaffsCount, currentStaffsCount,
finishedStaffsCount, failures, this.rpcServer,
workerFaultList);
try {
boolean ret = controllerClient.report(new Directive(gss));
// the faults are dropped once report() has returned, whatever it returned
synchronized (this) {
workerFaultList.clear();
}// list.add() need synchronize
if (!ret) {
LOG.error("fail to update");
}
} catch (Exception ioe) {
// a failed report is only logged; the next iteration reports again
LOG.error(
"Fail to communicate with BSPController for reporting.",
ioe);
}
Thread.sleep(HEART_BEAT_INTERVAL);
} catch (InterruptedException ie) {
// NOTE(review): the interrupt is logged and the loop simply goes on
LOG.error("[offerService]", ie);
}
}
return State.NORMAL;
}
/**
 * Creates the StaffInProgress for a launch action and starts its
 * localization, which in turn launches the staff.
 * <p>
 * IOExceptions are logged and recorded as DISK faults. The staff is only put
 * into runningStaffs by StaffInProgress.launchStaff(), so the lookups done
 * while handling a failure may well find nothing; that case is handled
 * instead of ending in a NullPointerException.
 *
 * @param action launch action carrying the staff
 * @param directive directive the action came with
 * @param recovery whether the staff is started for recovery
 * @param changeWorkerState forwarded to the StaffInProgress
 * @param failCounter forwarded to the StaffInProgress
 */
private void startNewStaff(LaunchStaffAction action, Directive directive,
        boolean recovery, boolean changeWorkerState, int failCounter) {
    Staff s = action.getStaff();
    StaffAttemptID attemptId = s.getStaffAttemptId();
    BSPJob jobConf = null;
    try {
        jobConf = new BSPJob(s.getJobID(), s.getJobFile());
        jobConf.setInt("staff.fault.superstep", directive.getFaultSSStep());
    } catch (IOException e1) {
        LOG.error("Exception has been catched in WorkerManager--startNewStaff-jobConf", e1);
        markDiskFault(runningStaffs.get(attemptId), attemptId);
    }
    StaffInProgress sip = new StaffInProgress(s, jobConf,
            this.workerManagerName);
    sip.setFailCounter(failCounter);
    if (recovery) {
        sip.getStatus().setRecovery(true);
    }
    if (changeWorkerState) {
        sip.setChangeWorkerState(true);
    }
    try {
        localizeJob(sip, directive);
    } catch (IOException e) {
        LOG.error("Exception has been catched in WorkerManager--startNewStaff-localizeJob", e);
        // prefer the registered instance, fall back to the one just created
        StaffInProgress registered = runningStaffs.get(attemptId);
        markDiskFault(registered != null ? registered : sip, attemptId);
    }
}

/** Records an IOException as DISK fault on the staff, if there is one. */
private void markDiskFault(StaffInProgress sip, StaffAttemptID attemptId) {
    if (sip == null) {
        LOG.error(attemptId
                + " is not in the runningStaffs, its fault cannot be recorded");
        return;
    }
    sip.getStatus().setStage(0); // convenient for the call in controller
    sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT, new Fault(
            Fault.Type.DISK, Level.WARNING, sip.getStatus()
                    .getGroomServer(), "IOException happened", sip
                    .getStatus().getJobId().toString(), sip.getStatus()
                    .getStaffId().toString()));
}
/**
 * Copies the job description and, if the job has one, the job jar to the
 * local disk, unpacks the jar into a "work" directory, adds the local jar
 * to the class path, registers the staff with its job and launches it.
 *
 * @param sip the staff to localize and launch
 * @param directive directive the launch action came with
 * @throws IOException if copying or unpacking fails
 */
private void localizeJob(StaffInProgress sip, Directive directive)
        throws IOException {
    Staff staff = sip.getStaff();
    this.conf.addResource(staff.getJobFile());
    BSPJob defaultJobConf = new BSPJob((BSPConfiguration) this.conf);

    // local targets: <workerManager subdir>/<staff id>/job.{xml,jar}
    Path localJobFile = defaultJobConf
            .getLocalPath(Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/"
                    + staff.getStaffID() + "/" + "job.xml");
    Path localJarFile = defaultJobConf
            .getLocalPath(Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER + "/"
                    + staff.getStaffID() + "/" + "job.jar");

    systemFS.copyToLocalFile(new Path(staff.getJobFile()), localJobFile);

    // job configuration built from the local copy only
    BSPConfiguration localConf = new BSPConfiguration();
    localConf.addResource(localJobFile);
    BSPJob jobConf = new BSPJob(localConf, staff.getJobID().toString());

    // remember the remote jar before the configuration is pointed to the local one
    Path remoteJar = null;
    if (jobConf.getJar() != null) {
        remoteJar = new Path(jobConf.getJar());
    }
    jobConf.setJar(localJarFile.toString());

    if (remoteJar != null) {
        systemFS.copyToLocalFile(remoteJar, localJarFile);
        // also unjar the job.jar files in workdir
        File jobDir = new File(localJobFile.toString());
        File workDir = new File(jobDir.getParent(), "work");
        boolean created = workDir.mkdirs();
        if (!created && !workDir.isDirectory()) {
            throw new IOException("Mkdirs failed to create "
                    + workDir.toString());
        }
        RunJar.unJar(new File(localJarFile.toString()), workDir);
    }

    /** Add the user program jar to the system's classpath. */
    ClassLoaderUtil.addClassPath(localJarFile.toString());

    RunningJob rjob = addStaffToJob(staff.getJobID(), localJobFile, sip,
            directive, jobConf);
    rjob.localized = true;
    sip.setFaultSSStep(directive.getFaultSSStep());
    launchStaffForJob(sip, jobConf);
}
/**
 * Hands the job configuration to the staff and launches it. An IOException
 * during the launch is logged and stored on the staff as a SYSTEMSERVICE
 * fault.
 *
 * @param sip the staff to launch
 * @param jobConf localized job configuration
 */
private void launchStaffForJob(StaffInProgress sip, BSPJob jobConf) {
    try {
        sip.setJobConf(jobConf);
        sip.launchStaff();
    } catch (IOException ioe) {
        LOG.error("Exception has been catched in WorkerManager--launchStaffForJob", ioe);
        sip.staffStatus.setRunState(StaffStatus.State.FAILED);
        sip.getStatus().setStage(0); // convenient for the call in
                                     // controller
        Fault launchFault = new Fault(Fault.Type.SYSTEMSERVICE,
                Fault.Level.INDETERMINATE,
                sip.getStatus().getGroomServer(),
                ioe.toString(),
                sip.getStatus().getJobId().toString(),
                sip.getStatus().getStaffId().toString());
        sip.setStaffStatus(Constants.SATAFF_STATUS.FAULT, launchFault);
    }
}
/**
 * Registers a staff with its job. For the first staff of a job, the
 * RunningJob entry and the WorkerAgentForJob of that job are created.
 *
 * @param jobId id of the job
 * @param localJobFile local copy of the job description
 * @param sip the staff to add
 * @param directive directive the launch action came with (not used here)
 * @param job localized job configuration handed to the WorkerAgentForJob
 * @return the RunningJob the staff was added to
 */
private RunningJob addStaffToJob(BSPJobID jobId, Path localJobFile,
        StaffInProgress sip, Directive directive, BSPJob job) {
    synchronized (runningJobs) {
        RunningJob rJob = runningJobs.get(jobId);
        if (rJob == null) {
            rJob = new RunningJob(jobId, localJobFile);
            rJob.localized = false;
            rJob.staffs = new HashSet<StaffInProgress>();
            rJob.jobFile = localJobFile;
            runningJobs.put(jobId, rJob);
            // Create a new WorkerAgentForJob for a new job
            try {
                WorkerAgentForJob bspPeerForJob = new WorkerAgentForJob(
                        conf, jobId, job, this);
                runningJobtoWorkerAgent.put(jobId, bspPeerForJob);
            } catch (IOException e) {
                // keep the cause, it is the only hint why the agent is missing
                LOG.error("Failed to create a WorkerAgentForJob for a new job"
                        + jobId.toString(), e);
            }
        }
        rJob.staffs.add(sip);
        int counter = rJob.getStaffCounter();
        rJob.setStaffCounter(counter + 1);
        return rJob;
    }
}
/**
 * Bookkeeping record for a job that has staffs on this worker: the job id,
 * the local job file, the staffs added so far and a counter of staffs that
 * are still accounted to the job.
 */
static class RunningJob {
    private BSPJobID jobid;
    private Path jobFile;
    // keep this for later use
    Set<StaffInProgress> staffs;
    private int staffCounter = 0;
    boolean localized;
    boolean keepJobFiles;

    RunningJob(BSPJobID jobid, Path jobFile) {
        this.jobid = jobid;
        this.jobFile = jobFile;
        this.staffs = new HashSet<StaffInProgress>();
        this.localized = false;
        this.keepJobFiles = false;
    }

    /** @return the local job file */
    Path getJobFile() {
        return this.jobFile;
    }

    /** @return the id of the job */
    BSPJobID getJobId() {
        return this.jobid;
    }

    /** @param counter new value of the staff counter */
    public void setStaffCounter(int counter) {
        this.staffCounter = counter;
    }

    /** @return current value of the staff counter */
    public int getStaffCounter() {
        return this.staffCounter;
    }
}
/**
 * Builds a list with a clone of the status of every running staff.
 * Despite its name the method does not reset anything.
 *
 * @return cloned statuses, one per entry of runningStaffs
 */
private synchronized List<StaffStatus> cloneAndResetRunningStaffStatuses() {
    List<StaffStatus> copies = new ArrayList<StaffStatus>(
            runningStaffs.size());
    Iterator<StaffInProgress> staffs = runningStaffs.values().iterator();
    while (staffs.hasNext()) {
        StaffStatus current = staffs.next().getStatus();
        copies.add((StaffStatus) current.clone());
    }
    return copies;
}
/**
 * Asks the controller for the system directory and opens its file system.
 * Only the first successful call does the work; later calls return at once.
 *
 * @throws Exception if the controller reports no system directory
 *         (IOException) or the lookup itself fails
 */
public void initFileSystem() throws Exception {
    if (!justInited) {
        return;
    }
    String dir = controllerClient.getSystemDir();
    if (dir == null) {
        LOG.error("Fail to get system directory.");
        throw new IOException("Fail to get system directory.");
    }
    systemDirectory = new Path(dir);
    systemFS = systemDirectory.getFileSystem(conf);
    justInited = false;
}
/**
 * Thread body of the worker: initializes it, then keeps offerService()
 * running until the worker is stopped, shut down or denied. Any exception
 * that escapes the outer try is treated as fatal and closes the worker.
 */
public void run() {
try {
initialize();
initFileSystem();
startCleanupThreads();
boolean denied = false;
// outer loop: one pass per (re)initialization
while (running && !shuttingDown && !denied) {
boolean staleState = false;
try {
// inner loop: call offerService() again after every failure
while (running && !staleState && !shuttingDown && !denied) {
try {
State osState = offerService();
if (osState == State.STALE) {
staleState = true;
} else if (osState == State.DENIED) {
denied = true;
}
} catch (Exception e) {
// every exception from offerService() is reported as a lost
// connection; wait five seconds before the next attempt
if (!shuttingDown) {
LOG.warn(
"Lost connection to BSP Controller ["
+ bspControllerAddr
+ "]. Retrying...", e);
try {
Thread.sleep(5000);
} catch (InterruptedException ie) {
LOG.error("[run]", ie);
}
}
}
}
} catch (Exception e) {
LOG.error("[run]", e);
}
if (shuttingDown) {
return;
}
// reached when the inner loop ended without a shutdown
LOG.warn("Reinitializing local state");
initialize();
initFileSystem();
}
} catch (Exception ioe) {
LOG.error("Got fatal exception in WorkerManager: "
+ StringUtils.stringifyException(ioe));
LOG.error("WorkerManager will quit abnormally!");
close();
return;
}
}
/**
 * Marks the worker as shutting down and releases everything through
 * {@link #close()}.
 *
 * @throws IOException declared for callers; close() logs its own failures
 */
public synchronized void shutdown() throws IOException {
    LOG.info("Prepare to shutdown the WorkerManager");
    this.shuttingDown = true;
    this.close();
}
/**
 * Stops the worker: kills the staffs that are still alive, closes all
 * WorkerAgentForJob instances, stops both RPC servers and the controller
 * proxy and finally removes the local files.
 * <p>
 * run() calls this method also after a failed initialize(), so every
 * resource is checked for null before it is released. Failures are logged,
 * never thrown.
 */
public synchronized void close() {
    this.running = false;
    this.initialized = false;
    try {
        if (runningStaffs != null) {
            for (StaffInProgress sip : runningStaffs.values()) {
                // the runner is only set at the end of launchStaff()
                if (sip.runner != null && sip.runner.isAlive()) {
                    sip.killAndCleanup(true);
                    LOG.info(sip.getStatus().getStaffId()
                            + " has been killed by system");
                }
            }
        }
        LOG.info("Succeed to stop all Staff Process");
        if (runningJobtoWorkerAgent != null) {
            for (Map.Entry<BSPJobID, WorkerAgentForJob> e : runningJobtoWorkerAgent
                    .entrySet()) {
                e.getValue().close();
            }
        }
        LOG.info("Succeed to stop all WorkerAgentForJob");
        if (this.workerServer != null) {
            this.workerServer.stop();
        }
        if (controllerClient != null) {
            RPC.stopProxy(controllerClient);
        }
        if (staffReportServer != null) {
            staffReportServer.stop();
            staffReportServer = null;
        }
        LOG.info("Succeed to stop all RPC Server");
        cleanupStorage();
        LOG.info("Succeed to cleanup temporary files on the local disk");
    } catch (Exception e) {
        LOG.error("Failed to execute the close()", e);
    }
}
/**
 * Starts the given WorkerManager in a new thread whose name is
 * "regionserver" followed by the worker manager name.
 *
 * @param hrs the WorkerManager to run
 * @return the started thread
 */
public static Thread startWorkerManager(final WorkerManager hrs) {
    final String threadName = "regionserver" + hrs.workerManagerName;
    return startWorkerManager(hrs, threadName);
}
/**
 * Starts the given WorkerManager in a new thread with the given name.
 *
 * @param hrs the WorkerManager to run
 * @param name name of the new thread
 * @return the started thread
 */
public static Thread startWorkerManager(final WorkerManager hrs,
        final String name) {
    Thread worker = new Thread(hrs);
    worker.setName(name);
    worker.start();
    return worker;
}
/**
 * StaffInProgress maintains all the info for a Staff that lives at this
 * WorkerManager. It maintains the Staff object, its StaffStatus, and the
 * BSPStaffRunner.
 * <p>
 * The runner is only created at the end of {@link #launchStaff()}, while the
 * instance is put into runningStaffs earlier in that method. Code that looks
 * an instance up there must therefore expect {@code runner} to be null.
 *
 * @author
 * @version
 */
class StaffInProgress {
    Staff staff;
    // RPC proxy to the staff process, set through setStaffAgent()
    WorkerAgentForStaffInterface staffAgent;
    // job configuration handed in by the WorkerManager
    public BSPJob jobConf;
    // job configuration built from the local job file in localizeStaff()
    BSPJob localJobConf;
    BSPStaffRunner runner;
    volatile boolean done = false;
    volatile boolean wasKilled = false;
    private StaffStatus staffStatus;
    private String error = "no";
    private int faultSSStep = 0;
    private boolean changeWorkerState = false;
    private int failCounter = 0;

    /**
     * Creates the bookkeeping entry of a staff; its status starts as
     * UNASSIGNED in phase STARTING.
     *
     * @param staff the staff to manage
     * @param jobConf job configuration of the staff, may be replaced later
     *        through {@link #setJobConf(BSPJob)}
     * @param workerManagerName name of the worker the staff runs on
     */
    public StaffInProgress(Staff staff, BSPJob jobConf, String workerManagerName) {
        this.staff = staff;
        this.jobConf = jobConf;
        this.localJobConf = null;
        this.staffStatus = new StaffStatus(staff.getJobID(),
                staff.getStaffID(), 0, StaffStatus.State.UNASSIGNED,
                "running", workerManagerName, StaffStatus.Phase.STARTING);
    }

    /**
     * Maps a Constants.SATAFF_STATUS value to the run state of the staff.
     *
     * @param stateStatus one of RUNNING, SUCCEED or FAULT; other values are
     *        logged and ignored
     * @param fault fault to store; only used for FAULT
     */
    public void setStaffStatus(int stateStatus, Fault fault) {
        switch (stateStatus) {
        case Constants.SATAFF_STATUS.RUNNING:
            this.staffStatus.setRunState(StaffStatus.State.RUNNING);
            break;
        case Constants.SATAFF_STATUS.SUCCEED:
            this.staffStatus.setRunState(StaffStatus.State.SUCCEEDED);
            break;
        case Constants.SATAFF_STATUS.FAULT:
            this.staffStatus.setRunState(StaffStatus.State.FAULT);
            this.staffStatus.setFault(fault);
            break;
        default:
            LOG.error("Unknown StaffStatus.State: <Constants.SATAFF_STATUS>"
                    + stateStatus);
        }
    }

    public boolean getChangeWorkerState() {
        return changeWorkerState;
    }

    public void setChangeWorkerState(boolean changeWorkerState) {
        this.changeWorkerState = changeWorkerState;
    }

    public String getError() {
        return this.error;
    }

    public int getFaultSSStep() {
        return faultSSStep;
    }

    public void setFaultSSStep(int faultSSStep) {
        this.faultSSStep = faultSSStep;
    }

    public void setFailCounter(int failCounter) {
        this.failCounter = failCounter;
    }

    public int getFailCounter() {
        return this.failCounter;
    }

    /**
     * Copies the job file and, if the job has one, the job jar of the staff
     * to the local disk and points the staff to the local copies.
     *
     * @param task the staff to localize
     * @throws IOException if copying fails
     */
    private void localizeStaff(Staff task) throws IOException {
        Path localJobFile = this.jobConf
                .getLocalPath(Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER
                        + "/" + task.getStaffID() + "/job.xml");
        Path localJarFile = this.jobConf
                .getLocalPath(Constants.BC_BSP_LOCAL_SUBDIR_WORKERMANAGER
                        + "/" + task.getStaffID() + "/job.jar");
        String jobFile = task.getJobFile();
        systemFS.copyToLocalFile(new Path(jobFile), localJobFile);
        task.setJobFile(localJobFile.toString());
        localJobConf = new BSPJob(task.getJobID(), localJobFile.toString());
        localJobConf.set("bsp.task.id", task.getStaffID().toString());
        String jarFile = localJobConf.getJar();
        if (jarFile != null) {
            systemFS.copyToLocalFile(new Path(jarFile), localJarFile);
            localJobConf.setJar(localJarFile.toString());
        }
        LOG.debug("localizeStaff : " + localJobConf.getJar());
        LOG.debug("localizeStaff : " + localJobFile.toString());
        task.setConf(localJobConf);
    }

    public synchronized void setJobConf(BSPJob jobConf) {
        this.jobConf = jobConf;
    }

    /**
     * @return the job configuration built from the local job file (not the
     *         one passed to {@link #setJobConf(BSPJob)}); null before
     *         launchStaff() has localized the staff
     */
    public synchronized BSPJob getJobConf() {
        return localJobConf;
    }

    /**
     * Localizes the staff, registers it with the WorkerAgentForJob of its
     * job and with runningStaffs, and starts its runner.
     *
     * @throws IOException if localization fails or no WorkerAgentForJob
     *         exists for the job
     */
    public void launchStaff() throws IOException {
        localizeStaff(staff);
        staffStatus.setRunState(StaffStatus.State.RUNNING);
        BSPJobID jobID = localJobConf.getJobID();
        // the agent is missing when its creation failed in addStaffToJob()
        WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobID);
        if (agent == null) {
            throw new IOException("No WorkerAgentForJob is registered for "
                    + jobID);
        }
        agent.addStaffCounter(staff.getStaffAttemptId());
        agent.setJobConf(jobConf);
        runningStaffs.put(staff.getStaffAttemptId(), this);
        // Lock on the manager: the counter is a boxed Integer whose identity
        // changes with every update, so it cannot serve as a monitor.
        synchronized (WorkerManager.this) {
            currentStaffsCount++;
        }
        this.runner = staff.createRunner(WorkerManager.this);
        this.runner.setFaultSSStep(this.faultSSStep);
        this.runner.start();
    }

    /**
     * Notifies the staff agent, if one is connected, and kills the runner.
     * A staff whose runner has not been created yet is only notified.
     *
     * @param wasFailure not used by the current implementation
     * @throws IOException declared for callers
     */
    public synchronized void killAndCleanup(boolean wasFailure)
            throws IOException {
        onKillStaff();
        if (runner == null) {
            LOG.warn(staff.getStaffAttemptId()
                    + " has no runner yet, nothing to kill");
            return;
        }
        runner.kill();
    }

    private void onKillStaff() {
        if (this.staffAgent != null) {
            this.staffAgent.onKillStaff();
        }
    }

    public Staff getStaff() {
        return staff;
    }

    public synchronized StaffStatus getStatus() {
        return staffStatus;
    }

    public StaffStatus.State getRunState() {
        return staffStatus.getRunState();
    }

    public boolean wasKilled() {
        return wasKilled;
    }

    /** Two instances are equal when their staffs have equal staff ids. */
    @Override
    public boolean equals(Object obj) {
        return (obj instanceof StaffInProgress)
                && staff.getStaffID().equals(
                        ((StaffInProgress) obj).getStaff().getStaffID());
    }

    @Override
    public int hashCode() {
        return staff.getStaffID().hashCode();
    }

    public void setStaffAgent(WorkerAgentForStaffInterface staffAgent) {
        this.staffAgent = staffAgent;
    }
}
/**
 * @return the current value of the running flag, which close() clears
 */
public boolean isRunning() {
    return this.running;
}
/**
 * Creates a WorkerManager of the given class through its
 * {@code (Configuration)} constructor.
 *
 * @param workerManagerClass class to instantiate
 * @param conf2 configuration passed to the constructor
 * @return the new instance
 * @throws RuntimeException wrapping whatever the reflective call threw
 */
public static WorkerManager constructWorkerManager(
        Class<? extends WorkerManager> workerManagerClass,
        final Configuration conf2) {
    try {
        Constructor<? extends WorkerManager> ctor = workerManagerClass
                .getConstructor(Configuration.class);
        WorkerManager instance = ctor.newInstance(conf2);
        return instance;
    } catch (Exception e) {
        String message = "Failed construction of " + "WorkerManager: "
                + workerManagerClass.toString();
        throw new RuntimeException(message, e);
    }
}
/**
 * Returns the version id of one of the two RPC protocols served here.
 *
 * @param protocol class name of the requested protocol
 * @param clientVersion version of the client (not checked)
 * @return versionID of WorkerManagerProtocol or WorkerAgentProtocol
 * @throws IOException if the protocol is neither of the two
 */
@Override
public long getProtocolVersion(String protocol, long clientVersion)
        throws IOException {
    if (protocol.equals(WorkerManagerProtocol.class.getName())) {
        return WorkerManagerProtocol.versionID;
    }
    if (protocol.equals(WorkerAgentProtocol.class.getName())) {
        return WorkerAgentProtocol.versionID;
    }
    throw new IOException("Unknown protocol to WorkerManager: "
            + protocol);
}
/**
* The main() for child processes.
*
* @author
* @version
*/
public static class Child {
public static void main(String[] args) {
BSPConfiguration defaultConf = new BSPConfiguration();
// report address
String host = args[0];
int port = Integer.parseInt(args[1]);
InetSocketAddress address = new InetSocketAddress(host, port);
StaffAttemptID staffid = StaffAttemptID.forName(args[2]);
int faultSSStep = Integer.parseInt(args[3]);
String hostName = args[4];
LOG.info(staffid + ": Child Starts");
LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*");
WorkerAgentProtocol umbilical = null;
Staff staff = null;
BSPJob job = null;
try {
umbilical = ( WorkerAgentProtocol ) RPC.getProxy(
WorkerAgentProtocol.class,
WorkerAgentProtocol.versionID, address, defaultConf);
staff = umbilical.getStaff(staffid);
defaultConf.addResource(new Path(staff.getJobFile()));
job = new BSPJob(staff.getJobID(), staff.getJobFile());
// use job-specified working directory
FileSystem.get(job.getConf()).setWorkingDirectory(
job.getWorkingDirectory());
boolean recovery = umbilical.getStaffRecoveryState(staffid);
boolean changeWorkerState = umbilical
.getStaffChangeWorkerState(staffid);
int failCounter = umbilical.getFailCounter(staffid);
job.setInt("staff.fault.superstep", faultSSStep);
staff.run(job, staff, umbilical, recovery, changeWorkerState, failCounter,
hostName); // run the task
} catch (ClassNotFoundException cnfE) {
LOG.error("Exception has been catched in WorkerManager--Error running child", cnfE);
// Report back any failures, for diagnostic purposes
ByteArrayOutputStream baos = new ByteArrayOutputStream();
cnfE.printStackTrace(new PrintStream(baos));
umbilical.setStaffStatus(
staffid,
Constants.SATAFF_STATUS.FAULT,
new Fault(Fault.Type.SYSTEMSERVICE,
Fault.Level.CRITICAL, umbilical
.getWorkerManagerName(job.getJobID(),
staffid), cnfE.toString(),
job.toString(), staffid.toString()), 0);
} catch (FSError e) {
LOG.error("Exception has been catched in WorkerManager--FSError from child", e);
umbilical.setStaffStatus(
staffid,
Constants.SATAFF_STATUS.FAULT,
new Fault(Fault.Type.SYSTEMSERVICE,
Fault.Level.CRITICAL, umbilical
.getWorkerManagerName(job.getJobID(),
staffid), e.toString(), job
.toString(), staffid.toString()), 0);
} catch (Throwable throwable) {
LOG.error("Exception has been catched in WorkerManager--Error running child", throwable);
// Report back any failures, for diagnostic purposes
ByteArrayOutputStream baos = new ByteArrayOutputStream();
throwable.printStackTrace(new PrintStream(baos));
umbilical.setStaffStatus(
staffid,
Constants.SATAFF_STATUS.FAULT,
new Fault(Fault.Type.SYSTEMSERVICE,
Fault.Level.CRITICAL, umbilical
.getWorkerManagerName(job.getJobID(),
staffid), throwable.toString(),
job.toString(), staffid.toString()), 0);
} finally {
RPC.stopProxy(umbilical);
MetricsContext metricsContext = MetricsUtil
.getContext("mapred");
metricsContext.close();
// Shutting down log4j of the child-vm...
// This assumes that on return from Staff.run()
// there is no more logging done.
LogManager.shutdown();
}
}
}
/**
 * Looks up the staff of a running attempt.
 *
 * @param staffid attempt id of the staff
 * @return the staff, or null (with a warning in the log) if the attempt is
 *         not in runningStaffs
 * @throws IOException declared by the protocol
 */
@Override
public Staff getStaff(StaffAttemptID staffid) throws IOException {
    StaffInProgress sip = runningStaffs.get(staffid);
    if (sip == null) {
        LOG.warn(staffid + " is not in the runningStaffs");
        return null;
    }
    return sip.getStaff();
}
/**
 * @param staffId attempt id of a staff in runningStaffs
 * @return the recovery flag stored in the status of that staff
 */
@Override
public boolean getStaffRecoveryState(StaffAttemptID staffId) {
    StaffInProgress sip = runningStaffs.get(staffId);
    return sip.getStatus().isRecovery();
}
/**
 * @param staffId attempt id of a staff in runningStaffs
 * @return the changeWorkerState flag of that staff
 */
@Override
public boolean getStaffChangeWorkerState(StaffAttemptID staffId) {
    StaffInProgress sip = runningStaffs.get(staffId);
    return sip.getChangeWorkerState();
}
/**
 * @param staffId attempt id of a staff in runningStaffs
 * @return the fail counter of that staff
 */
@Override
public int getFailCounter(StaffAttemptID staffId) {
    StaffInProgress sip = this.runningStaffs.get(staffId);
    return sip.getFailCounter();
}
/**
 * Protocol method without an implementation yet.
 *
 * @param staffId attempt id of the calling staff (ignored)
 * @return always false
 * @throws IOException declared by the protocol
 */
@Override
public boolean ping(StaffAttemptID staffId) throws IOException {
    final boolean alive = false;
    return alive;
}
/**
 * Protocol method without an implementation yet; the call is ignored.
 *
 * @param staffId attempt id of the calling staff
 * @param shouldBePromoted ignored
 * @throws IOException declared by the protocol
 */
@Override
public void done(StaffAttemptID staffId, boolean shouldBePromoted)
        throws IOException {
    // TODO Auto-generated method stub
    return;
}
/**
 * Protocol method without an implementation yet; the call is ignored.
 *
 * @param staffId attempt id of the calling staff
 * @param message ignored
 * @throws IOException declared by the protocol
 */
@Override
public void fsError(StaffAttemptID staffId, String message)
        throws IOException {
    // TODO Auto-generated method stub
    return;
}
/**
 * Delegates to the WorkerAgentForJob of the given job.
 *
 * @param jobId id of a job with an agent in runningJobtoWorkerAgent
 * @param staffId attempt id passed through to the agent
 * @return the name returned by the agent
 */
@Override
public String getWorkerManagerName(BSPJobID jobId, StaffAttemptID staffId) {
    WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobId);
    return agent.getWorkerManagerName(jobId, staffId);
}
/**
 * Delegates the local barrier call to the WorkerAgentForJob of the job.
 *
 * @param jobId id of a job with an agent in runningJobtoWorkerAgent
 * @param staffId attempt id of the calling staff
 * @param superStepCounter passed through to the agent
 * @param ssrc passed through to the agent
 * @return the result of the agent
 */
@Override
public boolean localBarrier(BSPJobID jobId, StaffAttemptID staffId,
        int superStepCounter, SuperStepReportContainer ssrc) {
    WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobId);
    return agent.localBarrier(jobId, staffId, superStepCounter, ssrc);
}
/**
 * Delegates to the WorkerAgentForJob of the given job.
 *
 * @param jobId id of a job with an agent in runningJobtoWorkerAgent
 * @param staffId attempt id passed through to the agent
 * @return the number returned by the agent
 */
@Override
public int getNumberWorkers(BSPJobID jobId, StaffAttemptID staffId) {
    WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobId);
    return agent.getNumberWorkers(jobId, staffId);
}
/**
 * Delegates to the WorkerAgentForJob of the given job.
 *
 * @param jobId id of a job with an agent in runningJobtoWorkerAgent
 * @param staffId attempt id passed through to the agent
 * @param num number passed through to the agent
 */
@Override
public void setNumberWorkers(BSPJobID jobId, StaffAttemptID staffId, int num) {
    WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobId);
    agent.setNumberWorkers(jobId, staffId, num);
}
// nc
/**
 * Delegates to the WorkerAgentForJob of the given job.
 *
 * @param jobId id of a job with an agent in runningJobtoWorkerAgent
 */
@Override
public void addStaffReportCounter(BSPJobID jobId) {
    WorkerAgentForJob agent = runningJobtoWorkerAgent.get(jobId);
    agent.addStaffReportCounter();
}
/**
 * @return the name of this worker as resolved in initialize(), or null
 *         before that
 */
public String getWorkerManagerName() {
    final String name = this.workerManagerName;
    return name;
}
/**
 * The WorkerManager is not bound to a single job.
 *
 * @return always null
 */
@Override
public BSPJobID getBSPJobID() {
    final BSPJobID none = null;
    return none;
}
/**
 * Stores the status and the stage reported for a running staff.
 * A report for a staff that is no longer in runningStaffs (for example
 * because offerService() has already retired it) is logged and dropped.
 *
 * @param staffId attempt id of the reporting staff
 * @param staffStatus one of the Constants.SATAFF_STATUS values
 * @param fault fault that belongs to a FAULT status
 * @param stage stage to store in the status
 */
@Override
public void setStaffStatus(StaffAttemptID staffId, int staffStatus, Fault fault, int stage) {
    // one lookup only: the entry may vanish between two separate get() calls
    StaffInProgress sip = this.runningStaffs.get(staffId);
    if (sip == null) {
        LOG.warn(staffId + " is not in the runningStaffs, status "
                + staffStatus + " is dropped");
        return;
    }
    sip.setStaffStatus(staffStatus, fault);
    sip.getStatus().setStage(stage);
}
/**
 * @param staffId attempt id of a staff in runningStaffs
 * @return the status object of that staff
 */
public StaffStatus getStaffStatus(StaffAttemptID staffId) {
    StaffInProgress sip = this.runningStaffs.get(staffId);
    return sip.getStatus();
}
/**
 * Records which worker a partition lives on by handing the mapping to the
 * WorkerAgentForJob of the given job.
 *
 * @param jobId id of a job with an agent in runningJobtoWorkerAgent
 * @param partitionId id of the partition
 * @param hostName host name passed through to the agent
 */
public void setWorkerNametoPartitions(BSPJobID jobId, int partitionId,
        String hostName) {
    WorkerAgentForJob agent = this.runningJobtoWorkerAgent.get(jobId);
    agent.setWorkerNametoPartitions(jobId, partitionId, hostName);
}
/**
 * Get the host name of the workerManager.
 *
 * @return the value of Constants.BC_BSP_WORKERAGENT_HOST in the
 *         configuration, or Constants.DEFAULT_BC_BSP_WORKERAGENT_HOST if
 *         it is not set
 */
public String getHostName() {
    final String configuredHost = this.conf.get(
            Constants.BC_BSP_WORKERAGENT_HOST,
            Constants.DEFAULT_BC_BSP_WORKERAGENT_HOST);
    return configuredHost;
}
/**
 * Forgets all job ids collected through addFailedJob().
 */
@Override
public void clearFailedJobList() {
    failedJobList.clear();
}
/**
 * Appends a job id to the list of failed jobs; duplicates are kept.
 *
 * @param jobId id of the failed job
 */
@Override
public void addFailedJob(BSPJobID jobId) {
    failedJobList.add(jobId);
}
/**
 * @return number of entries in the list of failed jobs
 */
@Override
public int getFailedJobCounter() {
    final int failed = failedJobList.size();
    return failed;
}
/**
 * Searches for a port that can currently be bound, starting one above the
 * port handed out last and wrapping from 65535 back to 60001. A candidate
 * counts as free when a ServerSocket can be opened and closed on it.
 *
 * @return the free port, or 60000 when the search gave up
 */
@Override
public synchronized int getFreePort() {
    this.currentFreePort = this.currentFreePort + 1;
    int attempts = 0;
    while (this.currentFreePort <= 65536) {
        attempts++;
        if (attempts > 5535) {
            LOG.info("[WorkerManager: getFreePort()] attempts to get a free port over 5535 times!");
            return 60000;
        }
        // wrap around at the upper end of the range
        if (this.currentFreePort > 65535) {
            this.currentFreePort = 60001;
        }
        try {
            ServerSocket probe = new ServerSocket(this.currentFreePort);
            probe.close();
            return this.currentFreePort;
        } catch (IOException e) {
            LOG.error("[WokerManager] caught", e);
        }
        this.currentFreePort++;
    }
    return 60000;
}
/**
 * Connects this worker to the RPC server of a running staff and stores the
 * proxy on its StaffInProgress. Calls for staffs that are not in
 * runningStaffs are ignored. When the proxy cannot be created, the failure
 * is logged and the staff agent is set to null.
 *
 * @param staffID attempt id of the staff
 * @param addr address of the staff in the form "host:port"
 */
@Override
public void setStaffAgentAddress(StaffAttemptID staffID, String addr) {
    // one lookup only: offerService() may remove the entry between a
    // containsKey() and a get()
    StaffInProgress sip = this.runningStaffs.get(staffID);
    if (sip == null) {
        return;
    }
    String[] addrs = addr.split(":");
    InetSocketAddress address = new InetSocketAddress(addrs[0],
            Integer.parseInt(addrs[1]));
    WorkerAgentForStaffInterface staffAgent = null;
    try {
        staffAgent = (WorkerAgentForStaffInterface) RPC.getProxy(
                WorkerAgentForStaffInterface.class,
                WorkerAgentForStaffInterface.versionID, address,
                this.conf);
    } catch (IOException e) {
        LOG.error("[WorkerManager] caught: ", e);
    }
    sip.setStaffAgent(staffAgent);
}
}