/**
 * CopyRight by Chinamobile
 *
 * JobInProgress.java
 *
 * JobInProgress is the center for controlling the job running. It maintains all
 * important information about the job and the staffs, including the status.
 */
package com.chinamobile.bcbsp.bspcontroller;

import java.io.DataInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;

import com.chinamobile.bcbsp.Constants;
import com.chinamobile.bcbsp.api.AggregateValue;
import com.chinamobile.bcbsp.api.Aggregator;
import com.chinamobile.bcbsp.bspstaff.Staff;
import com.chinamobile.bcbsp.bspstaff.StaffInProgress;
import com.chinamobile.bcbsp.client.BSPJobClient;
import com.chinamobile.bcbsp.client.BSPJobClient.RawSplit;
import com.chinamobile.bcbsp.fault.storage.Fault;
import com.chinamobile.bcbsp.sync.GeneralSSController;
import com.chinamobile.bcbsp.sync.GeneralSSControllerInterface;
import com.chinamobile.bcbsp.sync.SuperStepCommand;
import com.chinamobile.bcbsp.sync.SuperStepReportContainer;
import com.chinamobile.bcbsp.util.*;
import com.chinamobile.bcbsp.workermanager.WorkerManagerStatus;

/**
 * JobInProgress is the center for controlling the job running. It maintains all
 * important information about the job and the staffs, including the status.
 *
 * <p>Only {@link #getLaunchTime()}, {@link #findStaffInProgress(StaffID)},
 * {@link #areStaffsInited()} and
 * {@link #updateStaffStatus(StaffInProgress, StaffStatus)} are synchronized;
 * all other methods access the fields without locking.
 */
public class JobInProgress implements JobInProgressControlInterface {

  /**
   * Used when a kill is issued to a job which is initializing.
   */
  public static class KillInterruptedException extends InterruptedException {
    private static final long serialVersionUID = 1L;

    public KillInterruptedException(String msg) {
      super(msg);
    }
  }

  private static final Log LOG = LogFactory.getLog(JobInProgress.class);

  /**
   * Number of failed attempts a worker may accumulate for this job; one more
   * failure puts the worker on the job's black list.
   */
  private static final int FAILED_ATTEMPTS_THRESHOLD = 2;

  /** Class name marking a split that carries no data to load. */
  private static final String DISABLED_SPLIT = "no";

  private boolean staffsInited = false;
  private boolean checkPointNext = false;
  private Configuration conf;
  private JobProfile profile;
  private JobStatus status;
  private Path jobFile = null;
  private Path localJobFile = null;
  private Path localJarFile = null;
  private LocalFileSystem localFs;
  private long startTime;
  private long launchTime;
  private long finishTime;

  private BSPJobID jobId;
  private BSPJob job;
  private final BSPController controller;
  private StaffInProgress[] staffs = new StaffInProgress[0];
  private int superStepCounter;

  /** Staff attempt ids collected while staffs are placed on workers. */
  private final List<StaffAttemptID> attemptIDList =
      new ArrayList<StaffAttemptID>();

  private int numBSPStaffs = 0;

  private GeneralSSControllerInterface gssc;

  /** Worker manager name -> attempt ids of the staffs placed on it. */
  private final HashMap<String, ArrayList<StaffAttemptID>> workersToStaffs =
      new HashMap<String, ArrayList<StaffAttemptID>>();

  /** Records how many times each staff has been run on each worker. */
  private final LinkedHashMap<StaffAttemptID, Map<String, Integer>> staffToWMTimes =
      new LinkedHashMap<StaffAttemptID, Map<String, Integer>>();

  private int checkPointFrequency = 0;
  private int attemptRecoveryCounter = 0;
  private int maxAttemptRecoveryCounter = 0;
  private int maxStaffAttemptRecoveryCounter = 0;

  private int ableCheckPoint = 0;
  private int faultSSStep;

  /** Worker -> number of failed attempts of this job on that worker. */
  private final HashMap<WorkerManagerStatus, Integer> failedRecord =
      new HashMap<WorkerManagerStatus, Integer>();

  /** Workers that are skipped when a worker is chosen for this job. */
  private final ArrayList<WorkerManagerStatus> blackList =
      new ArrayList<WorkerManagerStatus>();

  /** The priority level of the job. */
  private String priority = Constants.PRIORITY.NORMAL; // default

  // For Aggregation
  /** Map for user registered aggregate values. */
  private final HashMap<String, Class<? extends AggregateValue<?>>> nameToAggregateValue =
      new HashMap<String, Class<? extends AggregateValue<?>>>();

  /** Map for user registered aggregators. */
  private final HashMap<String, Class<? extends Aggregator<?>>> nameToAggregator =
      new HashMap<String, Class<? extends Aggregator<?>>>();

  /** Aggregate name -> values reported for it and not yet aggregated. */
  @SuppressWarnings("unchecked")
  private final HashMap<String, ArrayList<AggregateValue>> aggregateValues =
      new HashMap<String, ArrayList<AggregateValue>>();

  /** Aggregate name -> result of the most recent aggregation. */
  @SuppressWarnings("unchecked")
  private final HashMap<String, AggregateValue> aggregateResults =
      new HashMap<String, AggregateValue>();

  private final List<String> wmNames = new ArrayList<String>();

  /**
   * Creates the job from its submitted job file: copies the job file (and the
   * job jar, when one is configured) to the controller's local directory,
   * builds the {@link BSPJob}, profile and status from it, loads the
   * registered aggregators and creates the superstep controller.
   *
   * @param jobId      id of the job
   * @param jobFile    path of the submitted job file
   * @param controller the owning controller
   * @param conf       configuration used for bootstrap; replaced by the job's
   *                   own configuration once the job file has been loaded
   * @throws IOException if a file system operation fails
   */
  public JobInProgress(BSPJobID jobId, Path jobFile, BSPController controller,
      Configuration conf) throws IOException {
    this.jobId = jobId;
    this.localFs = FileSystem.getLocal(conf);
    this.jobFile = jobFile;
    this.controller = controller;
    this.status = new JobStatus(jobId, null, 0L, 0L,
        JobStatus.State.PREP.value());
    this.startTime = System.currentTimeMillis();
    this.superStepCounter = 0;
    this.maxAttemptRecoveryCounter = conf.getInt(
        Constants.BC_BSP_JOB_RECOVERY_ATTEMPT_MAX, 0);
    this.maxStaffAttemptRecoveryCounter = conf.getInt(
        Constants.BC_BSP_STAFF_RECOVERY_ATTEMPT_MAX, 0);

    String localPrefix = Constants.BC_BSP_LOCAL_SUBDIR_CONTROLLER + "/" + jobId;
    this.localJobFile = controller.getLocalPath(localPrefix + ".xml");
    this.localJarFile = controller.getLocalPath(localPrefix + ".jar");

    Path jobDir = controller.getSystemDirectoryForJob(jobId);
    FileSystem fs = jobDir.getFileSystem(conf);
    fs.copyToLocalFile(jobFile, localJobFile);
    job = new BSPJob(jobId, localJobFile.toString());
    // From here on the configuration loaded from the job file is used.
    this.conf = job.getConf();
    this.numBSPStaffs = job.getNumBspStaff();
    this.profile = new JobProfile(job.getUser(), jobId, jobFile.toString(),
        job.getJobName());

    status.setUsername(job.getUser());
    status.setStartTime(startTime);

    String jarFile = job.getJar();
    if (jarFile != null) {
      fs.copyToLocalFile(new Path(jarFile), localJarFile);
    }

    this.priority = job.getPriority();
    setCheckPointFrequency();

    // For aggregation: add the user program jar to the system's classpath
    // before the aggregator classes are looked up.
    ClassLoaderUtil.addClassPath(localJarFile.toString());
    loadAggregators();

    this.gssc = new GeneralSSController(jobId);
  }

  /**
   * Creates a job from an already built {@link BSPJob} and a table of split
   * locations. This constructor does not assign {@code conf}, {@code profile},
   * {@code status}, {@code jobFile}, {@code localFs} or {@code gssc}; methods
   * that use those fields are not usable on an instance created this way.
   *
   * @param job        the job description
   * @param jobId      id of the job
   * @param controller the owning controller
   * @param staffNum   number of staffs to create
   * @param locations  staff index -> locations of that staff's split
   */
  public JobInProgress(BSPJob job, BSPJobID jobId, BSPController controller,
      int staffNum, HashMap<Integer, String[]> locations) {
    this.jobId = jobId;
    this.controller = controller;
    this.superStepCounter = 0;
    this.numBSPStaffs = staffNum;

    // NOTE(review): the array is sized by locations.size() but filled up to
    // staffNum; the two are presumably expected to be equal - confirm.
    staffs = new StaffInProgress[locations.size()];
    for (int i = 0; i < this.numBSPStaffs; i++) {
      RawSplit split = new RawSplit();
      split.setLocations(locations.get(i));
      split.setClassName("yes");
      staffs[i] = new StaffInProgress(this.jobId, null, this.controller,
          null, this, i, split);
    }

    this.job = job;
    loadAggregators();
  }

  /**
   * Registers, for every aggregate name declared by the job, its aggregator
   * class, its aggregate value class and an empty list of pending values.
   */
  @SuppressWarnings("unchecked")
  private void loadAggregators() {
    int aggregateNum = this.job.getAggregateNum();
    String[] aggregateNames = this.job.getAggregateNames();
    for (int i = 0; i < aggregateNum; i++) {
      String name = aggregateNames[i];
      this.nameToAggregator.put(name, this.job.getAggregatorClass(name));
      this.nameToAggregateValue.put(name, job.getAggregateValueClass(name));
      this.aggregateValues.put(name, new ArrayList<AggregateValue>());
    }
  }

  public BSPController getController() {
    return controller;
  }

  public BSPJob getJob() {
    return job;
  }

  public HashMap<StaffAttemptID, Map<String, Integer>> getStaffToWMTimes() {
    return staffToWMTimes;
  }

  public JobProfile getProfile() {
    return profile;
  }

  public JobStatus getStatus() {
    return status;
  }

  public synchronized long getLaunchTime() {
    return launchTime;
  }

  public long getStartTime() {
    return startTime;
  }

  public String getPriority() {
    return priority;
  }

  public int getNumBspStaff() {
    return numBSPStaffs;
  }

  public StaffInProgress[] getStaffInProgress() {
    return staffs;
  }

  public long getFinishTime() {
    return finishTime;
  }

  public int getFaultSSStep() {
    return faultSSStep;
  }

  public void setFaultSSStep(int faultSSStep) {
    this.faultSSStep = faultSSStep;
  }

  public GeneralSSControllerInterface getGssc() {
    return gssc;
  }

  /**
   * Records one more failed attempt of this job on the given worker. Once the
   * number of failures on that worker exceeds the threshold, the worker is put
   * on this job's black list (at most once).
   *
   * @param wms the worker on which a staff of this job failed
   * @return <code>true</code> if the worker is black-listed for this job after
   *         this call, <code>false</code> otherwise
   */
  public boolean addFailedWorker(WorkerManagerStatus wms) {
    Integer previous = this.failedRecord.get(wms);
    int counter = (previous == null) ? 1 : previous + 1;
    this.failedRecord.put(wms, counter);

    if (counter <= FAILED_ATTEMPTS_THRESHOLD) {
      return false;
    }
    addBlackListWorker(wms);
    LOG.warn("Warn: " + wms.getWorkerManagerName()
        + " is added into the BlackList of job " + this.jobId.toString()
        + " because the failed attempts is up to threshold:"
        + FAILED_ATTEMPTS_THRESHOLD);
    return true;
  }

  /**
   * Puts the worker on this job's black list; a worker that is already listed
   * is not added a second time.
   *
   * @param wms the worker to black-list
   */
  public void addBlackListWorker(WorkerManagerStatus wms) {
    if (!this.blackList.contains(wms)) {
      this.blackList.add(wms);
    }
  }

  /**
   * @return the number of desired tasks.
   */
  public int desiredBSPStaffs() {
    return numBSPStaffs;
  }

  /**
   * @return The JobID of this JobInProgress.
   */
  public BSPJobID getJobID() {
    return jobId;
  }

  /**
   * Looks up the staff with the given id.
   *
   * @param id the staff id to look for
   * @return the matching staff, or <code>null</code> if the staffs are not
   *         initialized yet or no staff has that id
   */
  public synchronized StaffInProgress findStaffInProgress(StaffID id) {
    if (areStaffsInited()) {
      for (StaffInProgress sip : staffs) {
        if (sip.getStaffId().equals(id)) {
          return sip;
        }
      }
    }
    return null;
  }

  public synchronized boolean areStaffsInited() {
    return this.staffsInited;
  }

  @Override
  public String toString() {
    return "jobName:" + profile.getJobName() + "\n" + "submit user:"
        + profile.getUser() + "\n" + "JobId:" + jobId + "\n"
        + "JobFile:" + jobFile + "\n";
  }

  // ///////////////////////////////////////////////////
  // Create/manage tasks
  // ///////////////////////////////////////////////////

  /**
   * Creates the staffs of this job from the split file named in the job
   * configuration and switches the job to the running state. Does nothing if
   * the staffs have already been initialized.
   *
   * @throws IOException if the split file cannot be opened or read
   */
  public void initStaffs() throws IOException {
    if (staffsInited) {
      return;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("numBSPStaffs: " + numBSPStaffs);
    }

    // Read the input split info from the file system of the system dir.
    Path sysDir = new Path(this.controller.getSystemDir());
    FileSystem fs = sysDir.getFileSystem(conf);
    DataInputStream splitFile = fs.open(new Path(conf
        .get(Constants.USER_BC_BSP_JOB_SPLIT_FILE)));
    RawSplit[] splits;
    try {
      splits = BSPJobClient.readSplitFile(splitFile);
    } finally {
      splitFile.close();
    }

    // One staff per configured staff slot; slots beyond the number of real
    // splits receive a disabled split.
    this.staffs = new StaffInProgress[numBSPStaffs];
    for (int i = 0; i < numBSPStaffs; i++) {
      RawSplit split;
      if (i < splits.length) {
        // This staff will load data from DFS.
        split = splits[i];
      } else {
        // Create a disabled split (this only happens in Hash); this staff
        // will not load data from DFS.
        split = new RawSplit();
        split.setClassName(DISABLED_SPLIT);
        split.setDataLength(0);
        split.setBytes("no".getBytes(), 0, 2);
        split.setLocations(new String[] { "no" });
      }
      staffs[i] = new StaffInProgress(getJobID(), this.jobFile.toString(),
          this.controller, this.conf, this, i, split);
    }

    // Update job status.
    this.status.setRunState(JobStatus.RUNNING);
    staffsInited = true;
    LOG.debug("Job is initialized.");
  }

  /**
   * Chooses a worker for staff {@code i} and obtains the staff to run on it.
   *
   * <p>For a staff with a real split, the split location with the most free
   * slots is preferred; when no location has a free slot the worker with the
   * most free slots overall is used. A staff with a disabled split always goes
   * to the worker with the most free slots overall.
   *
   * <p>If obtaining the staff fails with an {@link IOException}, a fault is
   * recorded, recovery and a job kill are requested from the controller and
   * <code>null</code> is returned.
   *
   * @param gss              the candidate workers
   * @param i                index of the staff
   * @param staffsLoadFactor fraction of a worker's maximum staff count that
   *                         may be used
   * @return the staff to run, or <code>null</code> if none could be obtained
   */
  public Staff obtainNewStaff(WorkerManagerStatus[] gss, int i,
      double staffsLoadFactor) {
    Staff result = null;
    try {
      if (!staffs[i].getRawSplit().getClassName().equals(DISABLED_SPLIT)) {
        // This staff needs to load data according to the split info.
        String[] locations = staffs[i].getRawSplit().getLocations();
        String bestLocation = locations[0];
        WorkerManagerStatus bestWorker = null;
        int bestFreeSlots = 0;
        for (String location : locations) {
          WorkerManagerStatus candidate = findWorkerManagerStatus(gss, location);
          if (candidate == null) {
            continue;
          }
          int freeSlots = freeSlots(candidate, staffsLoadFactor);
          if (freeSlots > bestFreeSlots) {
            bestFreeSlots = freeSlots;
            bestLocation = location;
            bestWorker = candidate;
          }
        }
        if (bestWorker == null) {
          // No split location has a free slot: fall back to any worker.
          bestWorker = findMaxFreeWorkerManagerStatus(gss, staffsLoadFactor);
        }
        result = staffs[i].getStaffToRun(bestWorker);
        // NOTE(review): in the fallback case the location recorded here is
        // still the first split location, not the worker actually chosen -
        // kept as in the original; confirm this is intended.
        updateStaffToWMTimes(i, bestLocation);
      } else {
        result = staffs[i].getStaffToRun(findMaxFreeWorkerManagerStatus(
            gss, staffsLoadFactor));
      }
    } catch (IOException ioe) {
      handleStaffLaunchFailure(ioe, "Kill Exception");
    }

    if (result == null) {
      // Previously this path fell through and failed with a
      // NullPointerException while registering the staff below.
      LOG.error("obtainNewStaff: no staff could be obtained for staff index "
          + i + " of job " + this.jobId);
      return null;
    }

    String name = staffs[i].getWorkerManagerStatus().getWorkerManagerName();
    LOG.info("obtainNewWorker--[Init]" + name);
    // NOTE(review): the attempt id is recorded in attemptIDList only when the
    // worker is seen for the first time (original behavior) - confirm.
    if (registerStaffOnWorker(name, result.getStaffAttemptId())) {
      attemptIDList.add(result.getStaffAttemptId());
    }
    return result;
  }

  /**
   * Re-places staff {@code i} on the worker with the most free slots, judged
   * with a load factor of 1.0, and records the placement.
   *
   * @param gss             the candidate workers
   * @param i               index of the staff
   * @param tasksLoadFactor not used by this method
   * @param recovery        only written to the log
   */
  public void obtainNewStaff(WorkerManagerStatus[] gss, int i,
      double tasksLoadFactor, boolean recovery) {
    staffs[i].setChangeWorkerState(true);
    LOG.info("obtainNewStaff" + "  " + recovery);
    try {
      staffs[i].getStaffToRun(findMaxFreeWorkerManagerStatus(gss, 1.0), true);
    } catch (IOException ioe) {
      handleStaffLaunchFailure(ioe, "IOException");
    }

    String name = staffs[i].getWorkerManagerStatus().getWorkerManagerName();
    LOG.info("obtainNewWorker--[recovery]" + name);
    registerStaffOnWorker(name, staffs[i].getS().getStaffAttemptId());
  }

  /**
   * Reports a failure to obtain a staff: logs it, records a fault with the
   * controller and asks the controller to recover and then kill this job.
   *
   * @param ioe            the failure
   * @param killLogMessage message logged if killing the job fails as well
   */
  private void handleStaffLaunchFailure(IOException ioe, String killLogMessage) {
    LOG.error("Exception has been catched in JobInProgress--obtainNewStaff !",
        ioe);
    Fault f = new Fault(Fault.Type.DISK, Fault.Level.WARNING,
        this.getJobID(), ioe.toString());
    this.getController().recordFault(f);
    this.getController().recovery(this.getJobID());
    try {
      this.getController().killJob(this.getJobID());
    } catch (IOException e) {
      LOG.error(killLogMessage, e);
    }
  }

  /**
   * Records that the given staff attempt is placed on the named worker.
   *
   * @return <code>true</code> if the worker had no entry before this call
   */
  private boolean registerStaffOnWorker(String workerName,
      StaffAttemptID attemptId) {
    ArrayList<StaffAttemptID> assigned = workersToStaffs.get(workerName);
    if (assigned != null) {
      assigned.add(attemptId);
      LOG.info("The workerName has already existed and add the staff directly");
      return false;
    }
    assigned = new ArrayList<StaffAttemptID>();
    assigned.add(attemptId);
    workersToStaffs.put(workerName, assigned);
    LOG.info("Add the workerName " + workerName
        + " and the size of all workers is " + this.workersToStaffs.size());
    return true;
  }

  /**
   * Increments the number of times staff {@code i} has been assigned to the
   * named worker manager.
   */
  private void updateStaffToWMTimes(int i, String workerManagerName) {
    StaffAttemptID staffId = staffs[i].getStaffID();
    Map<String, Integer> workerManagerToTimes = staffToWMTimes.get(staffId);
    if (workerManagerToTimes == null) {
      workerManagerToTimes = new LinkedHashMap<String, Integer>();
      workerManagerToTimes.put(workerManagerName, 1);
      staffToWMTimes.put(staffId, workerManagerToTimes);
    } else {
      Integer previous = workerManagerToTimes.get(workerManagerName);
      int runTimes = (previous == null) ? 1 : previous + 1;
      // Remove before put on purpose: in an insertion-ordered LinkedHashMap
      // this moves the entry to the end of the iteration order, exactly as
      // the original code did. A plain put would keep the old position.
      workerManagerToTimes.remove(workerManagerName);
      workerManagerToTimes.put(workerManagerName, runTimes);
      staffToWMTimes.remove(staffId);
      staffToWMTimes.put(staffId, workerManagerToTimes);
    }
    LOG.info("updateStaffToWMTimes---staffId: " + staffId
        + " StaffWMTimes: " + staffToWMTimes);
  }

  /**
   * Number of additional staffs the worker may take:
   * {@code min(max, ceil(loadFactor * max)) - running}.
   */
  private static int freeSlots(WorkerManagerStatus wms, double loadFactor) {
    int running = wms.getRunningStaffsCount();
    int max = wms.getMaxStaffsCount();
    int allowed = Math.min(max, (int) Math.ceil(loadFactor * max));
    return allowed - running;
  }

  /**
   * Finds the first worker that is not black-listed for this job and whose
   * worker manager name contains {@code name}.
   *
   * @param wss  the candidate workers
   * @param name the (partial) worker manager name to look for
   * @return the matching worker, or <code>null</code> if there is none
   */
  public WorkerManagerStatus findWorkerManagerStatus(
      WorkerManagerStatus[] wss, String name) {
    for (WorkerManagerStatus candidate : wss) {
      if (this.blackList.contains(candidate)) {
        continue;
      }
      if (candidate.getWorkerManagerName().contains(name)) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Finds, among the workers that are not black-listed for this job, the one
   * with the most free slots under the given load factor. On a tie the
   * earliest worker in {@code wss} wins.
   *
   * @param wss              the candidate workers
   * @param staffsLoadFactor fraction of a worker's maximum staff count that
   *                         may be used
   * @return the chosen worker, or <code>null</code> if no worker has a free
   *         slot
   */
  public WorkerManagerStatus findMaxFreeWorkerManagerStatus(
      WorkerManagerStatus[] wss, double staffsLoadFactor) {
    WorkerManagerStatus best = null;
    int bestFreeSlots = 0;
    for (WorkerManagerStatus candidate : wss) {
      if (this.blackList.contains(candidate)) {
        continue;
      }
      int freeSlots = freeSlots(candidate, staffsLoadFactor);
      if (freeSlots > bestFreeSlots) {
        bestFreeSlots = freeSlots;
        best = candidate;
      }
    }
    return best;
  }

  /**
   * Forwards the status to the staff and raises the job's superstep counter
   * if the staff reports a higher superstep count.
   */
  public synchronized void updateStaffStatus(StaffInProgress sip,
      StaffStatus staffStatus) {
    sip.updateStatus(staffStatus); // update sip
    if (superStepCounter < staffStatus.getSuperstepCount()) {
      superStepCounter = (int) staffStatus.getSuperstepCount();
    }
  }

  /**
   * Counts one more recovery attempt and remembers the current superstep as
   * the superstep at which the fault happened.
   */
  public void setAttemptRecoveryCounter() {
    this.attemptRecoveryCounter++;
    this.setFaultSSStep(superStepCounter);
  }

  public int getNumAttemptRecovery() {
    return this.attemptRecoveryCounter;
  }

  public int getMaxAttemptRecoveryCounter() {
    return maxAttemptRecoveryCounter;
  }

  public int getMaxStaffAttemptRecoveryCounter() {
    return maxStaffAttemptRecoveryCounter;
  }

  public void setPriority(String priority) {
    this.priority = priority;
  }

  /**
   * Reads the checkpoint frequency from the configuration. When the default
   * frequency is 0 the frequency is 0 regardless of the user setting;
   * otherwise the user setting is used, falling back to the default.
   */
  public void setCheckPointFrequency() {
    int defaultF = conf.getInt(
        Constants.DEFAULT_BC_BSP_JOB_CHECKPOINT_FREQUENCY, 0);
    if (defaultF == 0) {
      this.checkPointFrequency = defaultF;
    } else {
      this.checkPointFrequency = conf.getInt(
          Constants.USER_BC_BSP_JOB_CHECKPOINT_FREQUENCY, defaultF);
    }
  }

  public void setCheckPointFrequency(int cpf) {
    this.checkPointFrequency = cpf;
    LOG.info("The current [CheckPointFrequency] is:" + this.checkPointFrequency);
  }

  /** Requests a checkpoint independently of the checkpoint frequency. */
  public void setCheckPointNext() {
    this.checkPointNext = true;
    LOG.info("The next superstep [" + (this.superStepCounter) + " or "
        + (this.superStepCounter + 1) + "] will execute checkpoint operation");
  }

  public int getCheckPointFrequency() {
    return this.checkPointFrequency;
  }

  /**
   * @return <code>true</code> if a checkpoint is due: never when the frequency
   *         or the superstep counter is 0; otherwise when a checkpoint has
   *         been requested explicitly or the superstep counter is a multiple
   *         of the frequency
   */
  public boolean isCheckPoint() {
    if (this.checkPointFrequency == 0 || this.superStepCounter == 0) {
      return false;
    }
    return this.checkPointNext
        || (this.superStepCounter % this.checkPointFrequency) == 0;
  }

  public boolean isRecovery() {
    return this.status.getRunState() == JobStatus.RECOVERY;
  }

  @Override
  public void setAbleCheckPoint(int ableCheckPoint) {
    this.ableCheckPoint = ableCheckPoint;
    LOG.info("The ableCheckPoint is " + this.ableCheckPoint);
  }

  // Note: Client get the progress by this.status
  @Override
  public void setSuperStepCounter(int superStepCounter) {
    this.superStepCounter = superStepCounter;
    this.status.setprogress(this.superStepCounter + 1);
  }

  @Override
  public int getSuperStepCounter() {
    return this.superStepCounter;
  }

  /**
   * Aggregates the values reported by the staffs.
   *
   * <p>Each reported record has the form
   * <code>AggregateName + KV_SPLIT_FLAG + AggregateValue</code>. Records that
   * lack the separator or name an aggregate that is not registered are logged
   * and skipped.
   *
   * @param ssrcs the superstep reports of the staffs
   * @return one record per aggregation result, in the same form as the input
   *         records
   */
  public String[] generalAggregate(SuperStepReportContainer[] ssrcs) {
    collectAggregateValues(ssrcs);
    computeAggregateResults();
    return encodeAggregateResults();
  }

  /** Parses the reported records into the per-name lists of pending values. */
  @SuppressWarnings("unchecked")
  private void collectAggregateValues(SuperStepReportContainer[] ssrcs) {
    for (SuperStepReportContainer ssrc : ssrcs) {
      for (String record : ssrc.getAggValues()) {
        String[] aggValueRecord = record.split(Constants.KV_SPLIT_FLAG);
        if (aggValueRecord.length < 2) {
          LOG.warn("Skip malformed aggregate record: " + record);
          continue;
        }
        String aggName = aggValueRecord[0];
        Class<? extends AggregateValue<?>> valueClass =
            this.nameToAggregateValue.get(aggName);
        ArrayList<AggregateValue> pending = this.aggregateValues.get(aggName);
        if (valueClass == null || pending == null) {
          LOG.warn("Skip value of unregistered aggregate: " + aggName);
          continue;
        }

        AggregateValue aggValue = null;
        try {
          aggValue = valueClass.newInstance();
          // Init the aggValue from its string form.
          aggValue.initValue(aggValueRecord[1]);
        } catch (InstantiationException e1) {
          LOG.error("InstantiationException", e1);
        } catch (IllegalAccessException e2) {
          LOG.error("IllegalAccessException", e2);
        }
        if (aggValue != null) {
          pending.add(aggValue);
        }
      }
    }
  }

  /**
   * Runs every registered aggregator over its pending values, stores the
   * results and clears the pending values that were aggregated.
   */
  @SuppressWarnings("unchecked")
  private void computeAggregateResults() {
    // Clear the results' container before a new calculation.
    this.aggregateResults.clear();
    for (Entry<String, Class<? extends Aggregator<?>>> entry
        : this.nameToAggregator.entrySet()) {
      Aggregator<AggregateValue> aggregator = null;
      try {
        aggregator = (Aggregator<AggregateValue>) entry.getValue().newInstance();
      } catch (InstantiationException e1) {
        LOG.error("InstantiationException", e1);
      } catch (IllegalAccessException e2) {
        LOG.error("IllegalAccessException", e2);
      }
      if (aggregator != null) {
        ArrayList<AggregateValue> aggVals =
            this.aggregateValues.get(entry.getKey());
        AggregateValue resultValue = aggregator.aggregate(aggVals);
        this.aggregateResults.put(entry.getKey(), resultValue);
        // Clear the initial aggregate values after aggregation completes.
        aggVals.clear();
      }
    }
  }

  /**
   * Encodes the aggregation results, one record per result, in the form
   * [ AggregateName KV_SPLIT_FLAG AggregateValue.toString() ].
   */
  @SuppressWarnings("unchecked")
  private String[] encodeAggregateResults() {
    String[] results = new String[this.aggregateResults.size()];
    int next = 0;
    for (Entry<String, AggregateValue> entry : this.aggregateResults.entrySet()) {
      results[next++] = entry.getKey() + Constants.KV_SPLIT_FLAG
          + entry.getValue().toString();
    }
    return results;
  }

  /**
   * Builds the command for the next superstep.
   *
   * <p>In the recovery state a START_AND_RECOVERY command pointing at the last
   * usable checkpoint is returned and the job is switched back to running.
   * Otherwise the reported values are aggregated and, depending on the sum of
   * the positive judge flags, a START, START_AND_CHECKPOINT or STOP command is
   * returned.
   *
   * @param ssrcs the superstep reports of the staffs
   * @return the command for the next superstep
   */
  @Override
  public SuperStepCommand generateCommand(SuperStepReportContainer[] ssrcs) {
    SuperStepCommand ssc = new SuperStepCommand();

    // Note: we must firstly judge whether the fault has happened.
    LOG.info("[generateCommand]---this.status.getRunState()"
        + this.status.getRunState());
    LOG.info("[generateCommand]---this.status.isRecovery()"
        + this.status.isRecovery());
    if (isRecovery()) {
      HashMap<Integer, String> partitionToWorkerManagerNameAndPort = convert();
      LOG.info("if (isRecovery())--partitionToWorkerManagerName :"
          + partitionToWorkerManagerNameAndPort);
      ssc.setCommandType(Constants.COMMAND_TYPE.START_AND_RECOVERY);
      ssc.setInitReadPath(checkpointPath(this.ableCheckPoint));
      LOG.info("ableCheckPoint: " + ableCheckPoint);
      ssc.setAbleCheckPoint(this.ableCheckPoint);
      ssc.setNextSuperStepNum(this.ableCheckPoint + 1);
      // If the COMMAND_TYPE is START_AND_RECOVERY, then the
      // ssc.setPartitionToWorkerManagerName must be invoked.
      ssc.setPartitionToWorkerManagerNameAndPort(
          partitionToWorkerManagerNameAndPort);
      LOG.info("end--ssc.setPartitionToWorkerManagerName(partitionToWorkerManagerName);");
      this.status.setRunState(JobStatus.RUNNING);
      return ssc;
    }

    // Aggregate from the ssrcs and put the results into the ssc.
    String[] aggValues = generalAggregate(ssrcs);
    ssc.setAggValues(aggValues);

    long counter = 0;
    for (int i = 0; i < ssrcs.length; i++) {
      if (ssrcs[i].getJudgeFlag() > 0) {
        counter += ssrcs[i].getJudgeFlag();
      }
    }

    if (counter <= 0) {
      ssc.setCommandType(Constants.COMMAND_TYPE.STOP);
      ssc.setNextSuperStepNum(this.superStepCounter);
      return ssc;
    }

    StringBuilder sb = new StringBuilder("[Active]" + counter);
    for (int i = 0; i < aggValues.length; i++) {
      sb.append(" || [AGG" + (i + 1) + "]" + aggValues[i]);
    }
    LOG.info("STATISTICS DATA : " + sb.toString());

    if (isCheckPoint()) {
      this.checkPointNext = false;
      ssc.setOldCheckPoint(this.ableCheckPoint);
      LOG.info("jip--ableCheckPoint: " + this.ableCheckPoint);
      ssc.setCommandType(Constants.COMMAND_TYPE.START_AND_CHECKPOINT);
      ssc.setInitWritePath(checkpointPath(this.superStepCounter));
      ssc.setAbleCheckPoint(this.superStepCounter);
      ssc.setNextSuperStepNum(this.superStepCounter + 1);
    } else {
      ssc.setCommandType(Constants.COMMAND_TYPE.START);
      ssc.setNextSuperStepNum(this.superStepCounter + 1);
    }
    return ssc;
  }

  /** Checkpoint directory of this job for the given superstep. */
  private String checkpointPath(int superStep) {
    return conf.get(Constants.BC_BSP_CHECKPOINT_WRITEPATH) + "/"
        + this.jobId.toString() + "/" + superStep;
  }

  /**
   * Convert for SScommand: maps the partition of every placed staff to the
   * name of the worker manager recorded for it in {@code workersToStaffs}.
   */
  private HashMap<Integer, String> convert() {
    HashMap<Integer, String> partitionToWorkerManagerNameAndPort =
        new HashMap<Integer, String>();
    for (Entry<String, ArrayList<StaffAttemptID>> placement
        : this.workersToStaffs.entrySet()) {
      String workerManagerName = placement.getKey();
      for (StaffAttemptID staffAttemptID : placement.getValue()) {
        for (StaffInProgress sip : this.staffs) {
          if (staffAttemptID.equals(sip.getStaffID())) {
            partitionToWorkerManagerNameAndPort.put(
                sip.getS().getPartition(), workerManagerName);
          }
        }
      }
    }
    return partitionToWorkerManagerNameAndPort;
  }

  public HashMap<String, ArrayList<StaffAttemptID>> getWorkersToStaffs() {
    return this.workersToStaffs;
  }

  /**
   * Removes the staff attempt from the named worker's record; the worker's
   * entry is dropped when it has no staff left.
   *
   * @param workerName name of the worker manager
   * @param staffId    the staff attempt to remove
   * @return <code>true</code> if the staff was recorded on that worker and has
   *         been removed, <code>false</code> otherwise
   */
  public boolean removeStaffFromWorker(String workerName,
      StaffAttemptID staffId) {
    ArrayList<StaffAttemptID> assigned = this.workersToStaffs.get(workerName);
    if (assigned == null || !assigned.remove(staffId)) {
      return false;
    }
    if (assigned.isEmpty()) {
      this.workersToStaffs.remove(workerName);
      LOG.info("removeStaffFromWorker " + workerName);
    }
    return true;
  }

  /**
   * The job is dead. We're now GC'ing it: clean up the superstep controller,
   * the local copies of the job file and jar, and the parent directory of the
   * submitted job file. Every step is best effort: a failure is logged and
   * the remaining steps still run.
   */
  private void garbageCollect() {
    // Cleanup the ZooKeeper.
    try {
      if (gssc != null) {
        gssc.cleanup();
      }
    } catch (Exception e) {
      LOG.error("[garbageCollect> Error cleaning up]" + e.getMessage(), e);
    }

    // Cleanup the local files.
    try {
      if (localJobFile != null) {
        localFs.delete(localJobFile, true);
        localJobFile = null;
      }
      if (localJarFile != null) {
        localFs.delete(localJarFile, true);
        localJarFile = null;
      }
    } catch (Exception e) {
      LOG.error("[garbageCollect> Error cleaning up]" + e.getMessage(), e);
    }

    // Cleanup the directory that holds the submitted job file.
    try {
      if (profile != null) {
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(profile.getJobFile()).getParent(), true);
      }
    } catch (Exception e) {
      LOG.error("[garbageCollect> Error cleaning up]" + e.getMessage(), e);
    }
  }

  /** Stamps the finish time on this job and on its status. */
  private void recordFinishTime() {
    this.finishTime = System.currentTimeMillis();
    this.status.setFinishTime(this.finishTime);
  }

  @Override
  public void completedJob() {
    this.status.setRunState(JobStatus.SUCCEEDED);
    this.status.setprogress(this.superStepCounter + 1);
    recordFinishTime();
    this.controller.removeFromJobListener(this.jobId);
    cleanCheckpoint();
    garbageCollect();
    LOG.info("Job successfully done.");
  }

  @Override
  public void failedJob() {
    this.status.setRunState(JobStatus.FAILED);
    this.status.setprogress(this.superStepCounter + 1);
    recordFinishTime();
    this.controller.removeFromJobListener(jobId);
    gssc.stop();
    cleanCheckpoint();
    garbageCollect();
    LOG.warn("Job failed.");
  }

  /**
   * Marks the job as killed, kills every staff, stops the superstep
   * controller and removes the job's checkpoints and files.
   */
  public void killJob() {
    this.status.setRunState(JobStatus.KILLED);
    this.status.setprogress(this.superStepCounter + 1);
    recordFinishTime();
    for (StaffInProgress sip : staffs) {
      sip.kill();
    }
    this.controller.removeFromJobListener(jobId);
    gssc.stop();
    cleanCheckpoint();
    garbageCollect();
  }

  /**
   * Marks the job as killed and removes its checkpoints without killing the
   * staffs and without garbage-collecting the job files.
   */
  public void killJobRapid() {
    this.status.setRunState(JobStatus.KILLED);
    this.status.setprogress(this.superStepCounter);
    recordFinishTime();
    this.controller.removeFromJobListener(jobId);
    gssc.stop();
    cleanCheckpoint();
  }

  /**
   * Deletes the checkpoint directory of this job if it exists.
   *
   * @return <code>true</code> on success, <code>false</code> if an
   *         {@link IOException} occurred
   */
  private boolean cleanCheckpoint() {
    try {
      String uri = conf.get(Constants.BC_BSP_CHECKPOINT_WRITEPATH) + "/"
          + job.getJobID().toString() + "/";
      FileSystem fs = FileSystem.get(URI.create(uri), conf);
      Path checkpointDir = new Path(uri);
      if (fs.exists(checkpointDir)) {
        fs.delete(checkpointDir, true);
      }
      return true;
    } catch (IOException e) {
      LOG.error("Exception has happened and been catched!", e);
      return false;
    }
  }

  /** @return the number of workers that currently hold staffs of this job */
  @Override
  public int getCheckNum() {
    return this.workersToStaffs.size();
  }

  @Override
  public void reportLOG(String log) {
    LOG.info("GeneralSSController: " + log);
  }

  public StaffAttemptID[] getAttemptIDList() {
    return attemptIDList.toArray(new StaffAttemptID[attemptIDList.size()]);
  }

  public void getRecoveryBarrier(List<String> WMNames) {
    gssc.recoveryBarrier(WMNames);
  }

  /**
   * Only for fault-tolerance.
   * If the command has been write on the ZooKeeper, return true, else return false.
   *
   * @return the value reported by the superstep controller
   */
  public boolean isCommandBarrier() {
    return this.gssc.isCommandBarrier();
  }

  public List<String> getWMNames() {
    return wmNames;
  }

  public void addWMNames(String name) {
    wmNames.add(name);
  }

  public void cleanWMNames() {
    this.wmNames.clear();
  }
}