/**
* CopyRight by Chinamobile
*
* SimpleStaffScheduler.java
*/
package com.chinamobile.bcbsp.bspcontroller;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.chinamobile.bcbsp.util.BSPJobID;
import com.chinamobile.bcbsp.util.JobStatus;
import com.chinamobile.bcbsp.util.StaffStatus;
import com.chinamobile.bcbsp.workermanager.WorkerManagerStatus;
import com.chinamobile.bcbsp.action.Directive;
import com.chinamobile.bcbsp.action.LaunchStaffAction;
import com.chinamobile.bcbsp.action.WorkerManagerAction;
import com.chinamobile.bcbsp.bspcontroller.JobInProgress;
import com.chinamobile.bcbsp.bspstaff.Staff;
import com.chinamobile.bcbsp.bspstaff.StaffInProgress;
import com.chinamobile.bcbsp.fault.storage.Fault;
import com.chinamobile.bcbsp.rpc.WorkerManagerProtocol;
import com.chinamobile.bcbsp.sync.SynchronizationServerInterface;
import com.chinamobile.bcbsp.sync.SynchronizationServer;
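/**
 * A simple staff scheduler.
 *
 * Jobs announced by the controller are initialized and placed in WAIT_QUEUE.
 * A background JobProcessor thread takes them from that queue one at a time,
 * waits until the cluster reports enough free staff slots, dispatches the
 * staffs of the job to WorkerManagers and then adds the job to
 * PROCESSING_QUEUE. Removed jobs are moved to FINISHED_QUEUE, or to
 * FAILED_QUEUE when they are in the RECOVERY state.
 *
 * @author
 * @version
 */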
class SimpleStaffScheduler extends StaffScheduler {
/** Logger shared by the scheduler and its inner classes. */
private static final Log LOG = LogFactory.getLog(SimpleStaffScheduler.class);
/** Name of the queue of jobs waiting to be scheduled; the constant is also used as a monitor in synchronized blocks. */
public static final String WAIT_QUEUE = "waitQueue";
/** Name of the queue of jobs that have been handed to the scheduler; also used as a monitor in normalSchedule. */
public static final String PROCESSING_QUEUE = "processingQueue";
/** Name of the queue that receives removed jobs which are not in the RECOVERY state. */
public static final String FINISHED_QUEUE = "finishedQueue";
/** Name of the queue that receives removed jobs which are in the RECOVERY state. */
public static final String FAILED_QUEUE = "failedQueue";
/** Once FINISHED_QUEUE or FAILED_QUEUE holds more jobs than this, jobRemoved drops the first job of that queue. */
private static final int CACHE_QUEUE_LENGTH = 20;
/** Owns the four job queues; created in start(). */
private QueueManager queueManager;
/** Set to true by start() and to false by stop(); the scheduler thread loops while it is true. */
private volatile boolean initialized;
/** Synchronization server started in start() and stopped in stop(). */
private SynchronizationServerInterface syncServer;
/** Listener registered with the controller to receive job added/removed callbacks. */
private JobListener jobListener;
/** The scheduler thread. */
private JobProcessor jobProcessor;
private class JobListener extends JobInProgressListener {
/**
 * Initializes a newly added job and appends it to the wait queue.
 *
 * The insertion and the subsequent re-sort run under the WAIT_QUEUE monitor,
 * the same monitor JobProcessor.run() holds while it looks the queue up.
 *
 * @param job the job that was added
 * @throws IOException declared by the listener contract
 */
@Override
public void jobAdded(JobInProgress job) throws IOException {
  // The staffs of the job are initialized before the job enters the queue.
  queueManager.initJob(job);
  final String waitQueueName = WAIT_QUEUE;
  synchronized (waitQueueName) {
    queueManager.addJob(waitQueueName, job);
    queueManager.resortWaitQueue(waitQueueName);
    final String message = "JobListener: " + job.getJobID()
        + " is added to the wait_queue";
    LOG.info(message);
  }
}
/**
 * Moves a removed job out of PROCESSING_QUEUE and trims the result queues.
 *
 * A job in the RECOVERY state goes to FAILED_QUEUE, any other job goes to
 * FINISHED_QUEUE. Afterwards, if FINISHED_QUEUE or FAILED_QUEUE holds more
 * than CACHE_QUEUE_LENGTH jobs, the first job of that queue is removed.
 *
 * Review history:
 * (1) 2011-11-30, reviewer Hongxu Zhang: if clients submit jobs continuously,
 *     FAILED_QUEUE and FINISHED_QUEUE grow too large to stay resident in
 *     memory and the BSPController crashes.
 * (1) 2011-12-04, fixed by Zhigang Wang: when FINISHED_QUEUE or FAILED_QUEUE
 *     is longer than CACHE_QUEUE_LENGTH, the job at the head of the queue is
 *     cleaned up.
 *
 * @param job the job that was removed
 * @return the IDs of the jobs that were dropped from the result queues
 *         (possibly empty)
 * @throws IOException declared by the listener contract
 */
@Override
public ArrayList<BSPJobID> jobRemoved(JobInProgress job) throws IOException {
  final boolean recovering = job.getStatus().getRunState() == JobStatus.RECOVERY;
  if (!recovering) {
    queueManager.moveJob(PROCESSING_QUEUE, FINISHED_QUEUE, job);
  } else {
    queueManager.moveJob(PROCESSING_QUEUE, FAILED_QUEUE, job);
    LOG.info("JobListener" + job.getJobID()
        + " is removed from the PROCESSING_QUEUE");
  }

  // Both snapshots are taken before either queue is trimmed.
  ArrayList<JobInProgress> finishedSnapshot =
      new ArrayList<JobInProgress>(queueManager.findQueue(FINISHED_QUEUE).getJobs());
  ArrayList<JobInProgress> failedSnapshot =
      new ArrayList<JobInProgress>(queueManager.findQueue(FAILED_QUEUE).getJobs());

  ArrayList<BSPJobID> dropped = new ArrayList<BSPJobID>();
  dropHeadIfOverLimit(FINISHED_QUEUE, finishedSnapshot, dropped);
  dropHeadIfOverLimit(FAILED_QUEUE, failedSnapshot, dropped);
  return dropped;
}

/**
 * Removes the first job of the given queue when the snapshot is longer than
 * CACHE_QUEUE_LENGTH and records its ID in {@code dropped}.
 */
private void dropHeadIfOverLimit(String queueName, List<JobInProgress> snapshot,
    List<BSPJobID> dropped) throws IOException {
  if (snapshot.size() <= CACHE_QUEUE_LENGTH) {
    return;
  }
  JobInProgress head = snapshot.get(0);
  dropped.add(head.getJobID());
  queueManager.removeJob(queueName, head);
}
}
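/**
 * JobProcessor
 *
 * The scheduler thread. It takes jobs from WAIT_QUEUE, dispatches their
 * staffs to WorkerManagers (normal or recovery scheduling, depending on the
 * run state of the job) and then adds the job to PROCESSING_QUEUE.
 *
 * @author
 * @version
 */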
private class JobProcessor extends Thread implements Schedulable {
/** Creates the scheduler thread with the thread name "JobProcess"; the thread is started in SimpleStaffScheduler.start(). */
JobProcessor() {
super("JobProcess");
}
/**
 * Scheduler thread body. Repeatedly takes a job from WAIT_QUEUE, waits until
 * the cluster reports enough free staff slots for it, schedules its staffs
 * through chooseScheduler(JobInProgress) and finally adds the job to
 * PROCESSING_QUEUE.
 *
 * The thread ends when the scheduler is stopped (initialized becomes false)
 * or when the thread is interrupted. A job that was already taken from
 * WAIT_QUEUE but could not be scheduled before shutdown is not scheduled.
 *
 * @throws IllegalStateException if the scheduler has not been started
 * @throws NullPointerException if WAIT_QUEUE does not exist
 */
@Override
public void run() {
  if (!initialized) {
    throw new IllegalStateException(
        "SimpleStaffScheduler initialization"
            + " is not yet finished!");
  }
  while (initialized) {
    Queue<JobInProgress> queue;
    // Same monitor as JobListener.jobAdded(), which inserts into this queue.
    synchronized (WAIT_QUEUE) {
      queue = queueManager.findQueue(WAIT_QUEUE);
    }
    if (queue == null) {
      throw new NullPointerException(WAIT_QUEUE
          + " does not exist.");
    }

    JobInProgress jip = queue.removeJob();
    if (jip == null) {
      // No job was handed out (for example while the scheduler is being
      // stopped). Pause for one poll interval and re-check the stop flag
      // instead of dereferencing null forever in the wait loop below.
      try {
        Thread.sleep(2000);
      } catch (InterruptedException ie) {
        Thread.currentThread().interrupt();
        LOG.info("JobProcessor is interrupted while idle; the scheduler thread exits");
        return;
      }
      continue;
    }

    // Poll the cluster status every two seconds until the job fits into the
    // free staff slots, the scheduler is stopped, or the thread is interrupted.
    boolean enoughSlots = false;
    while (initialized && !enoughSlots) {
      try {
        Thread.sleep(2000);
        ClusterStatus cs = controller.getClusterStatus(false);
        enoughSlots = jip.getNumBspStaff()
            <= (cs.getMaxClusterStaffs() - cs.getRunningClusterStaffs());
      } catch (InterruptedException ie) {
        // Restore the flag so that callers further up can see the interrupt.
        Thread.currentThread().interrupt();
        LOG.warn("JobProcessor is interrupted while waiting for free staff slots for "
            + jip.getJobID() + "; the scheduler thread exits");
        return;
      } catch (Exception e) {
        // Keep polling, but do not hide the reason any more.
        LOG.warn("Fail to check the cluster status for " + jip.getJobID()
            + "; will retry", e);
      }
    }
    if (!enoughSlots) {
      LOG.warn("The scheduler is stopped before " + jip.getJobID()
          + " could be scheduled");
      return;
    }

    // Schedule the job and add it to the PROCESSING_QUEUE.
    chooseScheduler(jip);
    queueManager.addJob(PROCESSING_QUEUE, jip);
  }
}
/**
 * Picks the scheduling routine that matches the run state of the job:
 * normalSchedule for RUNNING, recoverySchedule for RECOVERY. A job in any
 * other state is not scheduled and only a warning is logged.
 *
 * @param jip
 *            the job to schedule
 */
public void chooseScheduler(JobInProgress jip) {
  if (jip.getStatus().getRunState() == JobStatus.RUNNING) {
    normalSchedule(jip);
    return;
  }
  if (jip.getStatus().getRunState() == JobStatus.RECOVERY) {
    recoverySchedule(jip);
    return;
  }
  // Neither RUNNING nor RECOVERY: nothing is dispatched.
  LOG.warn("Currently master only shcedules job in running state or revovery state. "
      + "This may be refined in the future. JobId:" + jip.getJobID());
}
/**
 * Dispatches every staff of the job that is neither running nor complete to
 * a WorkerManager.
 *
 * A load factor is computed first: (staffs of this job + staffs of all jobs
 * in WAIT_QUEUE + staffs of all jobs in PROCESSING_QUEUE) divided by the
 * maximum number of staffs of the cluster. It is passed to
 * job.obtainNewStaff(...) for every staff. While the job is in the RUNNING
 * state each obtained staff is sent to its WorkerManager inside a
 * LaunchStaffAction directive; afterwards the staff status is updated and the
 * running-staff count of the cached WorkerManagerStatus is incremented.
 *
 * If an IOException is thrown before dispatch reported success, the worker is
 * added to the black list of the job, a failed job is recorded on the worker
 * (moving it from the white list to the gray list once its failed-job counter
 * exceeds the controller limit) and the same staff index is retried. If the
 * IOException is thrown after a successful dispatch, a fault is recorded,
 * recovery is requested and the job is killed.
 *
 * After the loop, job.getGssc().setCheckNumBase() and job.getGssc().start()
 * are called.
 *
 * Review comments:
 * (1)The names of the variables are not coherent. For example, I think
 * "groomServerManager" should be "controller", and "tasksLoadFactor" should
 * be "staffsLoadFactor".
 * Review time: 2011-11-30;
 * Reviewer: Hongxu Zhang.
 *
 * Fix log:
 * (1)The conflicting names of the variables have been repaired.
 * Fix time: 2011-12-04;
 * Programmer: Zhigang Wang.
 *
 * @param job the job whose staffs are scheduled
 */
public void normalSchedule(JobInProgress job) {
List<JobInProgress> jip_wait, jip_process;
// Staffs still to be placed: this job plus every job in WAIT_QUEUE.
int remainingStaffsLoad = job.getNumBspStaff();
synchronized (WAIT_QUEUE) {
jip_wait = new ArrayList<JobInProgress>(queueManager.findQueue(WAIT_QUEUE).getJobs());
for (JobInProgress jip : jip_wait) {
remainingStaffsLoad += jip.getNumBspStaff();
}
}
// Staffs of every job in PROCESSING_QUEUE.
int runningStaffLoad = 0;
synchronized (PROCESSING_QUEUE) {
jip_process = new ArrayList<JobInProgress>(queueManager.findQueue(PROCESSING_QUEUE).getJobs());
for (JobInProgress jip : jip_process) {
runningStaffLoad += jip.getNumBspStaff();
}
}
ClusterStatus clusterStatus = controller.getClusterStatus(false);
// Floating-point division: a maximum of 0 yields Infinity/NaN rather than an exception.
double staffsLoadFactor = (( double ) (remainingStaffsLoad + runningStaffLoad))
/ clusterStatus.getMaxClusterStaffs();
// begin scheduling all staff(s) for the chosen job
StaffInProgress[] staffs = job.getStaffInProgress();
for (int i = 0; i < staffs.length; i++) {
if (!staffs[i].isRunning() && !staffs[i].isComplete()) {
// Fresh snapshot of the worker statuses for every staff (and every retry).
Collection<WorkerManagerStatus> glist = controller.workerServerStatusKeySet();
WorkerManagerStatus[] gss = ( WorkerManagerStatus[] ) glist
.toArray(new WorkerManagerStatus[glist.size()]);
// choose a reasonable worker for the chosen staff of the job
Staff t = job.obtainNewStaff(gss, i, staffsLoadFactor);
if (job.getStatus().getRunState() == JobStatus.RUNNING) {
// NOTE(review): worker is dereferenced below without a null check; a null
// here would raise a NullPointerException that the IOException handler does
// not catch -- confirm findWorkerManager never returns null.
WorkerManagerProtocol worker = controller
.findWorkerManager(staffs[i]
.getWorkerManagerStatus());
// Stays false until dispatch has returned; the catch block uses it to tell
// a failed dispatch from a failure after dispatch.
boolean success = false;
try {
// dispatch the staff to the worker
Directive d = new Directive(
controller
.getActiveWorkerManagersName(),
new WorkerManagerAction[] { new LaunchStaffAction(
t) });
success = worker.dispatch(t.getJobID(), d, false, false, job.getNumAttemptRecovery());
job.updateStaffStatus(staffs[i], new StaffStatus(job.getJobID(), staffs[i].getStaffID(), 0,
StaffStatus.State.UNASSIGNED, "running", "groomServer",
StaffStatus.Phase.STARTING));
// update the WorkerManagerStatus Cache
WorkerManagerStatus new_gss = staffs[i]
.getWorkerManagerStatus();
int currentStaffsCount = new_gss
.getRunningStaffsCount();
new_gss
.setRunningStaffsCount((currentStaffsCount + 1));
controller
.updateWhiteWorkerManagersKey(staffs[i]
.getWorkerManagerStatus(), new_gss);
LOG.info(t.getStaffAttemptId()
+ " is divided to the "
+ new_gss.getWorkerManagerName());
} catch (IOException ioe) {
WorkerManagerStatus wms = staffs[i].getWorkerManagerStatus();
LOG.error("Fail to assign staff-" + staffs[i].getStaffId() + " to "
+ wms.getWorkerManagerName());
if (!success) {
// Dispatch did not report success: black-list the worker for this job and
// record the failure on the worker.
job.addBlackListWorker(staffs[i].getWorkerManagerStatus());
worker.addFailedJob(job.getJobID());
if (worker.getFailedJobCounter() > controller.getMaxFailedJobOnWorker()) {
controller.removeWorkerFromWhite(wms);//white
wms.setPauseTime(System.currentTimeMillis());
controller.addWorkerToGray(wms, worker);//gray
LOG.info(wms.getWorkerManagerName()
+ " will be transferred from [WhiteList] to [GrayList]");
}
// Step back so that the loop increment revisits the same staff index.
// NOTE(review): there is no retry limit here; presumably the black list
// makes obtainNewStaff choose another worker -- confirm.
i--;
} else {
// The exception was thrown after dispatch had already reported success:
// record a fault, request recovery and kill the job.
LOG.error("Exception has been catched in SimpleStaffScheduler--normalSchedule !", ioe);
Fault f = new Fault(Fault.Type.DISK, Fault.Level.WARNING, job.getJobID(), ioe.toString());
job.getController().recordFault(f);
job.getController().recovery(job.getJobID());
try {
job.getController().killJob(job.getJobID());
} catch (IOException e) {
LOG.error("Kill Job", e);
}
}
}
} else {
LOG.warn("Currently master only shcedules job in running state. "
+ "This may be refined in the future. JobId:"
+ job.getJobID());
}// if-else
}// if
}// for
job.getGssc().setCheckNumBase();
job.getGssc().start();
}// schedule
/**
 * Re-dispatches the staffs of a job that is in the RECOVERY state.
 *
 * Every staff whose run state is WORKER_RECOVERY or STAFF_RECOVERY gets a new
 * assignment through job.obtainNewStaff(gss, i, 1.0, true) and is sent to the
 * chosen WorkerManager inside a LaunchStaffAction directive that carries the
 * fault super step of the job. Afterwards the job status is marked as
 * recovering, the recovery barrier is entered and job.isCommandBarrier() is
 * polled once per second until it returns true.
 *
 * Failure handling while dispatching a staff:
 * - dispatch did not succeed and the worker is known: the worker is added to
 *   the black list of the job, a failed job is recorded on it (moving it to
 *   the gray list once its failed-job counter exceeds the controller limit)
 *   and the same staff is retried;
 * - otherwise (failure after a successful dispatch, or no worker could be
 *   resolved at all): a fault is recorded, recovery is requested and the job
 *   is killed.
 *
 * A failure while waiting for the command barrier is handled the same way
 * (fault, recovery, kill) and ends the wait.
 *
 * @param job the job to recover
 */
public void recoverySchedule(JobInProgress job) {
  Collection<WorkerManagerStatus> glist = controller.workerServerStatusKeySet();
  LOG.info("recoverySchedule--glist.size(): " + glist.size());
  WorkerManagerStatus[] gss = glist.toArray(new WorkerManagerStatus[glist.size()]);
  // The array may be empty (no worker registered); do not index it blindly.
  String firstWorkerName = (gss.length > 0 && gss[0] != null)
      ? String.valueOf(gss[0].getWorkerManagerName()) : "<none>";
  LOG.info("recoverySchedule-- WorkerManagerStatus[] gss.size: "
      + gss.length + "gss[0]: " + firstWorkerName);

  StaffInProgress[] staffs = job.getStaffInProgress();
  for (int i = 0; i < staffs.length; i++) {
    WorkerManagerProtocol worker = null;
    // Stays false until dispatch has returned; used by the catch block.
    boolean success = false;
    try {
      boolean workerRecovery =
          staffs[i].getStaffStatus(staffs[i].getStaffID()).getRunState()
              == StaffStatus.State.WORKER_RECOVERY;
      boolean staffRecovery = !workerRecovery
          && staffs[i].getStaffStatus(staffs[i].getStaffID()).getRunState()
              == StaffStatus.State.STAFF_RECOVERY;

      if (workerRecovery) {
        LOG.info("recoverySchedule ----WORKER_RECOVERY");
      } else if (staffRecovery) {
        LOG.info("recoverySchedule ----STAFF_RECOVERY");
        Map<String, Integer> workerManagerToTimes =
            job.getStaffToWMTimes().get(staffs[i].getStaffID());
        if (workerManagerToTimes != null) {
          // Only called for its log output; the result is not used.
          getTheLastWMName(workerManagerToTimes);
        }
      }

      if (workerRecovery || staffRecovery) {
        job.obtainNewStaff(gss, i, 1.0, true);
        worker = controller.findWorkerManager(staffs[i].getWorkerManagerStatus());
        Directive d = new Directive(controller.getActiveWorkerManagersName(),
            new WorkerManagerAction[] { new LaunchStaffAction(staffs[i].getS()) });
        d.setFaultSSStep(job.getFaultSSStep());
        boolean changeWorker = staffs[i].getChangeWorkerState();
        success = worker.dispatch(staffs[i].getS().getJobID(), d,
            true, changeWorker, job.getNumAttemptRecovery());

        // update the WorkerManagerStatus Cache
        WorkerManagerStatus new_gss = staffs[i].getWorkerManagerStatus();
        int currentStaffsCount = new_gss.getRunningStaffsCount();
        new_gss.setRunningStaffsCount(currentStaffsCount + 1);
        controller.updateWhiteWorkerManagersKey(staffs[i].getWorkerManagerStatus(), new_gss);
        LOG.info(staffs[i].getS().getStaffAttemptId()
            + " is divided to the " + new_gss.getWorkerManagerName());
      }
    } catch (Exception e) {
      WorkerManagerStatus wms = staffs[i].getWorkerManagerStatus();
      LOG.error("Fail to assign staff-" + staffs[i].getStaffId() + " to "
          + (wms == null ? "<unknown>" : String.valueOf(wms.getWorkerManagerName())), e);
      if (!success && worker != null && wms != null) {
        // The dispatch itself failed: blame the worker and retry this staff.
        job.addBlackListWorker(wms);
        worker.addFailedJob(job.getJobID());
        if (worker.getFailedJobCounter() > controller.getMaxFailedJobOnWorker()) {
          controller.removeWorkerFromWhite(wms); // white
          wms.setPauseTime(System.currentTimeMillis());
          controller.addWorkerToGray(wms, worker); // gray
          LOG.info(wms.getWorkerManagerName()
              + " will be transferred from [WhiteList] to [GrayList]");
        }
        // Step back so that the loop increment revisits the same staff index.
        i--;
      } else {
        // Either the failure happened after a successful dispatch, or no
        // worker could be resolved, so there is nothing to blame or retry.
        // Dereferencing the missing worker here would throw from inside the
        // handler and end the scheduler thread.
        LOG.error("Exception has been catched in SimpleStaffScheduler--recoverySchedule !", e);
        abortJob(job, new Fault(Fault.Type.DISK, Fault.Level.WARNING,
            job.getJobID(), e.toString()), "Kill Job");
      }
    }
  }

  job.getStatus().setRecovery(true);
  job.getRecoveryBarrier(job.getWMNames());
  LOG.warn("leave---jip.getRecoveryBarrier(ts, WMNames.size());");
  while (true) {
    try {
      if (job.isCommandBarrier()) {
        LOG.info("[recoverySchedule] quit");
        break;
      }
      Thread.sleep(1000);
    } catch (Exception e) {
      abortJob(job, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.INDETERMINATE,
          job.getJobID(), e.toString()), "[Kill Job Exception]");
      if (e instanceof InterruptedException) {
        // Keep the interrupt visible to the caller.
        Thread.currentThread().interrupt();
      }
      // The job has been killed; waiting for its barrier any longer would
      // only repeat the fault handling on every iteration.
      break;
    }
  }
}

/**
 * Records the fault, requests recovery of the job and kills it; a failure to
 * kill the job is logged with the given message.
 */
private void abortJob(JobInProgress job, Fault fault, String killFailureMessage) {
  job.getController().recordFault(fault);
  job.getController().recovery(job.getJobID());
  try {
    job.getController().killJob(job.getJobID());
  } catch (IOException ioe) {
    LOG.error(killFailureMessage, ioe);
  }
}
/**
 * Returns the key that comes last in the iteration order of the key set of
 * the map, and logs it.
 *
 * @param map the map whose last key is wanted; may be null
 * @return the last key in iteration order, or null when the map is null or
 *         empty
 */
private String getTheLastWMName(Map<String, Integer> map) {
  String lastLWName = null;
  if (map != null) {
    // One pass is enough: after the loop the variable holds the last key.
    for (String key : map.keySet()) {
      lastLWName = key;
    }
  }
  LOG.info("last---getTheLastWMName(Map<String, Integer> map:)" + " " + lastLWName);
  return lastLWName;
}
}// JobProcessor
/**
 * Creates the synchronization server, the job listener and the scheduler
 * thread. Nothing is started here; see start().
 */
public SimpleStaffScheduler() {
  syncServer = new SynchronizationServer();
  jobListener = new JobListener();
  jobProcessor = new JobProcessor();
}
/**
 * Starts the scheduler: creates the queue manager with its four FCFS queues,
 * registers the job listener with the controller, marks the scheduler as
 * initialized, and then starts the synchronization server and the scheduler
 * thread.
 */
@Override
public void start() {
  queueManager = new QueueManager(getConf());
  final String[] queueNames = {
      WAIT_QUEUE, PROCESSING_QUEUE, FINISHED_QUEUE, FAILED_QUEUE
  };
  for (String queueName : queueNames) {
    queueManager.createFCFSQueue(queueName);
  }
  controller.addJobInProgressListener(jobListener);
  // Must be set before the scheduler thread starts: run() refuses to run
  // while the flag is false.
  initialized = true;
  // start the Synchronization Server and Scheduler Server.
  syncServer.startServer();
  jobProcessor.start();
}
/**
 * Stops the scheduler: clears the initialized flag, stops the scheduler
 * thread, stops the synchronization server (logging whether that succeeded)
 * and removes the job listener from the controller.
 *
 * Thread.stop() is kept for JVMs that still support it. On JVMs where it
 * throws UnsupportedOperationException the thread is interrupted instead, so
 * that the remaining cleanup steps are still executed.
 */
@SuppressWarnings("deprecation")
@Override
public void stop() {
  this.initialized = false;
  try {
    this.jobProcessor.stop();
  } catch (UnsupportedOperationException e) {
    LOG.warn("Thread.stop() is not supported by this JVM; "
        + "interrupting the scheduler thread instead", e);
    this.jobProcessor.interrupt();
  }
  boolean isSuccess = this.syncServer.stopServer();
  if (isSuccess) {
    LOG.info("Success to cleanup the nodes on ZooKeeper");
  } else {
    LOG.error("Fail to cleanup the nodes on ZooKeeper");
  }
  if (this.jobListener != null) {
    this.controller.removeJobInProgressListener(this.jobListener);
  }
}
/**
 * Returns the jobs currently held by the named queue.
 *
 * @param queue the name of the queue to look up
 * @return the jobs of that queue
 */
@Override
public Collection<JobInProgress> getJobs(String queue) {
  Queue<JobInProgress> target = queueManager.findQueue(queue);
  return target.getJobs();
}
}