/**
* StaffInProgress.java
*/
package com.chinamobile.bcbsp.bspstaff;
import java.io.IOException;
import java.util.Map;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobStatus;
import com.chinamobile.bcbsp.bspcontroller.BSPController;
import com.chinamobile.bcbsp.bspcontroller.JobInProgress;
import com.chinamobile.bcbsp.client.BSPJobClient.RawSplit;
import com.chinamobile.bcbsp.util.BSPJobID;
import com.chinamobile.bcbsp.util.StaffAttemptID;
import com.chinamobile.bcbsp.util.StaffID;
import com.chinamobile.bcbsp.util.StaffStatus;
import com.chinamobile.bcbsp.workermanager.WorkerManagerStatus;
/**
* StaffInProgress
*
* StaffInProgress maintains all the info needed for a Staff in the lifetime of
* its owning Job.
*
* @author
* @version
*/
public class StaffInProgress {
    private static final Log LOG = LogFactory.getLog(StaffInProgress.class);

    // Constants
    private static final int MAX_TASK_EXECS = 1;
    private int maxStaffAttempts = 4;
    private boolean failed = false;
    // Attempt numbers are offset by this amount for every recovery round of
    // the job (see getStaffToRun).
    private static final int NUM_ATTEMPTS_PER_RESTART = 1000;

    // Job Meta
    private String jobFile = null;
    private int partition;
    private StaffID id;
    private JobInProgress job;
    // Number of successful completions; guarded by the monitor of "this".
    private int completes = 0;

    // RawSplit info
    private RawSplit rawSplit;

    // WorkerManagerStatus info
    private WorkerManagerStatus wms = null;
    // Never assigned inside this class, so getStartTime() reports 0.
    private long startTime = 0;

    // The 'next' usable staff ID of this tip
    private int nextStaffId = 0;
    // The staff ID that took this TIP to SUCCESS; guarded by the monitor of
    // "this".
    private StaffAttemptID successfulStaffID;
    // The first Staff ID of this tip
    private StaffAttemptID firstStaffID;

    // Map from staff attempt id -> worker manager name; contains the attempts
    // that are currently running.
    private ConcurrentHashMap<StaffAttemptID, String> activeStaffs =
            new ConcurrentHashMap<StaffAttemptID, String>();

    /*
     * Map from staff attempt id -> StaffStatus.
     *
     * Review comment (2011-11-30, reviewer: Hongxu Zhang):
     *   this map is used in some critical sections and a TreeMap may not be
     *   thread-safe there.
     * Fix log:
     *   TreeMap is indeed not thread-safe, so the maps of this class were
     *   changed to ConcurrentHashMap.
     */
    private ConcurrentHashMap<StaffAttemptID, StaffStatus> staffStatuses =
            new ConcurrentHashMap<StaffAttemptID, StaffStatus>();

    private BSPJobID jobId;
    // The most recently created staff and its attempt id.
    private Staff s = null;
    private StaffAttemptID sid = null;
    private boolean changeWorkerState = false;

    // Attempts for which shouldCloseForClosedJob() has already answered true.
    // TreeSet is not thread-safe; every access synchronizes on the set itself.
    private TreeSet<StaffAttemptID> staffReportedClosed = new TreeSet<StaffAttemptID>();

    /**
     * Constructor for new nexus between BSPController and WorkerManager.
     *
     * Note: this constructor leaves the owning job and the raw split unset,
     * so {@link #getStaffToRun(WorkerManagerStatus)} cannot create a staff
     * for an instance built this way.
     *
     * @param jobId
     *            is identification of JobInProgress.
     * @param jobFile
     *            the path of job file
     * @param partition
     *            which partition this StaffInProgress owns.
     */
    public StaffInProgress(BSPJobID jobId, String jobFile, int partition) {
        this.jobId = jobId;
        this.jobFile = jobFile;
        this.partition = partition;
        this.id = new StaffID(jobId, partition);
    }

    /**
     * Constructor that also binds the owning job and the input split.
     *
     * @param jobId
     *            is identification of JobInProgress.
     * @param jobFile
     *            the path of job file
     * @param master
     *            the controller; not used by this constructor
     * @param conf
     *            the configuration; not used by this constructor
     * @param job
     *            the JobInProgress that owns this staff
     * @param partition
     *            which partition this StaffInProgress owns.
     * @param rawSplit
     *            the split handed to every staff created for this tip
     */
    public StaffInProgress(BSPJobID jobId, String jobFile,
            BSPController master, Configuration conf, JobInProgress job,
            int partition, RawSplit rawSplit) {
        this.jobId = jobId;
        this.jobFile = jobFile;
        this.job = job;
        this.partition = partition;
        this.rawSplit = rawSplit;
        this.id = new StaffID(jobId, partition);
    }

    /**
     * Return a Staff that can be sent to a WorkerManager for execution.
     *
     * The attempt number is
     * {@code job.getNumAttemptRecovery() * NUM_ATTEMPTS_PER_RESTART + n},
     * where n counts the attempts already handed out by this tip.
     *
     * @param status
     *            status of the WorkerManager that will run the staff
     * @return the new staff, or <code>null</code> when the attempt limit
     *         (MAX_TASK_EXECS + maxStaffAttempts) has been reached
     * @throws IOException
     *             declared for callers; not thrown by the code in this method
     * @throws IllegalStateException
     *             if this tip has no owning job or no raw split
     */
    public Staff getStaffToRun(WorkerManagerStatus status) throws IOException {
        this.wms = status;
        int attemptLimit = MAX_TASK_EXECS + maxStaffAttempts;
        if (nextStaffId >= attemptLimit) {
            LOG.warn("Exceeded limit of " + attemptLimit
                    + " attempts for the tip '" + getSIPId() + "'");
            return null;
        }
        // Fail before touching sid/nextStaffId so that a mis-constructed tip
        // is not left with an attempt id that never got a staff.
        if (job == null || rawSplit == null) {
            throw new IllegalStateException("The tip '" + getSIPId()
                    + "' has no owning job or raw split, cannot create a staff");
        }
        int attemptId = job.getNumAttemptRecovery()
                * NUM_ATTEMPTS_PER_RESTART + nextStaffId;
        sid = new StaffAttemptID(id, attemptId);
        ++nextStaffId;
        if (firstStaffID == null) {
            // Remember the first attempt so that isFirstAttempt() can answer.
            firstStaffID = sid;
        }
        this.s = new BSPStaff(jobId, jobFile, sid, partition,
                rawSplit.getClassName(), rawSplit.getBytes());
        activeStaffs.put(sid, status.getWorkerManagerName());
        return s;
    }

    /**
     * Re-register the current staff attempt as running on the given
     * WorkerManager without creating a new attempt.
     *
     * @param status
     *            status of the WorkerManager that now runs the staff
     * @param recovery
     *            only written to the log
     * @throws IOException
     *             declared for callers; not thrown by the code in this method
     */
    public void getStaffToRun(WorkerManagerStatus status, boolean recovery) throws IOException {
        this.wms = status;
        LOG.info("Recovery: getStaffToRun" + " " + recovery);
        if (sid == null) {
            // ConcurrentHashMap rejects null keys; nothing to re-register yet.
            LOG.warn("Recovery requested for the tip '" + getSIPId()
                    + "' before any staff attempt was created");
            return;
        }
        activeStaffs.put(sid, status.getWorkerManagerName());
    }

    public boolean getChangeWorkerState() {
        return changeWorkerState;
    }

    public void setChangeWorkerState(boolean changeWorkerState) {
        this.changeWorkerState = changeWorkerState;
    }

    /**
     * Return the start time
     */
    public long getStartTime() {
        return startTime;
    }

    /**
     * Return the parent job
     */
    public JobInProgress getJob() {
        return job;
    }

    /**
     * Return the id of this tip (same value as {@link #getStaffId()}).
     */
    public StaffID getSIPId() {
        return id;
    }

    /**
     * Return the id of this tip (same value as {@link #getSIPId()}).
     */
    public StaffID getStaffId() {
        return this.id;
    }

    public RawSplit getRawSplit() {
        return this.rawSplit;
    }

    /**
     * Return the WorkerManager status passed to the latest getStaffToRun call.
     */
    public WorkerManagerStatus getWorkerManagerStatus() {
        return this.wms;
    }

    /**
     * Return the live map of running attempts (attempt id -> worker manager
     * name). The returned map is the internal one, not a copy.
     */
    public Map<StaffAttemptID, String> getStaffs() {
        return activeStaffs;
    }

    /**
     * Return the most recently created staff, or <code>null</code> if none.
     */
    public Staff getS() {
        return s;
    }

    /**
     * Return the id of the most recently created attempt, or
     * <code>null</code> if none.
     */
    public StaffAttemptID getStaffID() {
        return sid;
    }

    /**
     * Is the Staff associated with staffID the first attempt of the tip?
     *
     * @param staffID
     *            attempt id to check
     * @return Returns true if the Staff is the first attempt of the tip
     */
    public boolean isFirstAttempt(StaffAttemptID staffID) {
        return firstStaffID != null && firstStaffID.equals(staffID);
    }

    /**
     * Is this tip currently running any tasks?
     *
     * @return true if any tasks are running
     */
    public boolean isRunning() {
        return !activeStaffs.isEmpty();
    }

    /**
     * Is this tip complete?
     *
     * @return <code>true</code> if the tip is complete, else <code>false</code>
     */
    public synchronized boolean isComplete() {
        return (completes > 0);
    }

    /**
     * Is the given staffID the one that took this tip to completion?
     *
     * @param staffID
     *            staffID of attempt to check for completion
     * @return <code>true</code> if staffID is complete, else <code>false</code>
     */
    public synchronized boolean isComplete(StaffAttemptID staffID) {
        return completes > 0 && staffID != null
                && staffID.equals(getSuccessfulStaffid());
    }

    /**
     * Decide whether the given attempt must be told to close because its job
     * is no longer running. Answers <code>true</code> at most once per
     * attempt.
     *
     * @param sid
     *            attempt id to check
     * @return <code>true</code> if a status is known for the attempt, the job
     *         is not in the RUNNING state and this is the first such answer
     *         for the attempt; otherwise <code>false</code>
     */
    public boolean shouldCloseForClosedJob(StaffAttemptID sid) {
        if (staffStatuses.get(sid) == null) {
            return false;
        }
        synchronized (staffReportedClosed) {
            if (staffReportedClosed.contains(sid)) {
                return false;
            }
        }
        // The job is queried outside the lock so that no foreign code runs
        // while the set's monitor is held.
        if (job.getStatus().getRunState() == JobStatus.RUNNING) {
            return false;
        }
        synchronized (staffReportedClosed) {
            // add() is false when another thread reported the attempt first.
            return staffReportedClosed.add(sid);
        }
    }

    /**
     * Record that the given attempt finished successfully: mark its status
     * SUCCEEDED (when one is known), drop it from the running attempts and
     * count the completion.
     *
     * @param staffID
     *            the attempt that completed
     */
    public void completed(StaffAttemptID staffID) {
        LOG.info("Staff '" + staffID.getStaffID().toString()
                + "' has completed.");
        StaffStatus status = staffStatuses.get(staffID);
        if (status != null) {
            status.setRunState(StaffStatus.State.SUCCEEDED);
        } else {
            // Still do the bookkeeping below instead of dying on a null
            // status and leaving the attempt registered as running.
            LOG.warn("No status recorded for completed staff attempt '"
                    + staffID + "'");
        }
        activeStaffs.remove(staffID);
        synchronized (this) {
            setSuccessfulStaffid(staffID);
            this.completes++;
        }
    }

    /**
     * Record that the given attempt failed: mark its status FAILED (when one
     * is known) and drop it from the running attempts.
     *
     * @param staffID
     *            the attempt that failed
     */
    public void terminated(StaffAttemptID staffID) {
        LOG.info("Staff '" + staffID.getStaffID().toString() + "' has failed.");
        StaffStatus status = staffStatuses.get(staffID);
        if (status != null) {
            status.setRunState(StaffStatus.State.FAILED);
        } else {
            LOG.warn("No status recorded for failed staff attempt '"
                    + staffID + "'");
        }
        activeStaffs.remove(staffID);
    }

    private void setSuccessfulStaffid(StaffAttemptID staffID) {
        this.successfulStaffID = staffID;
    }

    private StaffAttemptID getSuccessfulStaffid() {
        return successfulStaffID;
    }

    /**
     * Store the given status under the attempt id it carries, replacing any
     * earlier status of that attempt.
     *
     * @param status
     *            the latest status of an attempt
     */
    public void updateStatus(StaffStatus status) {
        staffStatuses.put(status.getStaffId(), status);
    }

    /**
     * Return the last status stored for the attempt, or <code>null</code> if
     * none was stored.
     */
    public StaffStatus getStaffStatus(StaffAttemptID staffID) {
        return this.staffStatuses.get(staffID);
    }

    /**
     * Mark this tip as failed. Only the flag is set; running attempts are not
     * touched here.
     */
    public void kill() {
        this.failed = true;
    }

    public boolean isFailed() {
        return failed;
    }
}