/** * CopyRight by Chinamobile * * BSPStaff.java */ package com.chinamobile.bcbsp.bspstaff; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.IOException; import java.net.InetSocketAddress; import java.net.URI; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.serializer.Deserializer; import org.apache.hadoop.io.serializer.SerializationFactory; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RPC.Server; import org.apache.hadoop.ipc.VersionedProtocol; import org.apache.hadoop.util.ReflectionUtils; import com.chinamobile.bcbsp.ActiveMQBroker; import com.chinamobile.bcbsp.BSPConfiguration; import com.chinamobile.bcbsp.Constants; import com.chinamobile.bcbsp.api.AggregateValue; import com.chinamobile.bcbsp.api.Aggregator; import com.chinamobile.bcbsp.api.BSP; import com.chinamobile.bcbsp.api.Edge; import com.chinamobile.bcbsp.api.Partitioner; import com.chinamobile.bcbsp.api.RecordParse; import com.chinamobile.bcbsp.api.Vertex; import com.chinamobile.bcbsp.comm.BSPMessage; import com.chinamobile.bcbsp.comm.Communicator; import com.chinamobile.bcbsp.comm.CommunicatorInterface; import com.chinamobile.bcbsp.fault.storage.Checkpoint; import com.chinamobile.bcbsp.fault.storage.Fault; import com.chinamobile.bcbsp.graph.GraphDataFactory; import com.chinamobile.bcbsp.graph.GraphDataInterface; import 
com.chinamobile.bcbsp.io.InputFormat; import com.chinamobile.bcbsp.io.OutputFormat; import com.chinamobile.bcbsp.io.RecordReader; import com.chinamobile.bcbsp.io.RecordWriter; import com.chinamobile.bcbsp.partition.HashPartitioner; import com.chinamobile.bcbsp.partition.HashWithBalancerWritePartition; import com.chinamobile.bcbsp.partition.HashWritePartition; import com.chinamobile.bcbsp.partition.NotDivideWritePartition; import com.chinamobile.bcbsp.partition.RecordParseDefault; import com.chinamobile.bcbsp.partition.WritePartition; import com.chinamobile.bcbsp.sync.StaffSSController; import com.chinamobile.bcbsp.sync.StaffSSControllerInterface; import com.chinamobile.bcbsp.sync.SuperStepCommand; import com.chinamobile.bcbsp.sync.SuperStepReportContainer; import com.chinamobile.bcbsp.util.BSPJob; import com.chinamobile.bcbsp.util.BSPJobID; import com.chinamobile.bcbsp.util.StaffAttemptID; import com.chinamobile.bcbsp.workermanager.WorkerAgentProtocol; import com.chinamobile.bcbsp.workermanager.WorkerManager; /** * BSPStaff * * A BSPStaff is an entity that executes the local computation of a BSPJob. A * BSPJob usually consists of many BSPStaffs which are distributed among the * workers. 
*
 * @author (not recorded in source)
 * @version (not recorded in source)
 */
public class BSPStaff extends Staff {

  /** RPC handle through which the WorkerManager drives this staff. */
  private WorkerAgentForStaffInterface staffAgent;
  /** The job this staff belongs to. */
  private BSPJob bspJob;
  /** Embedded ActiveMQ broker used for inter-staff messaging. */
  private ActiveMQBroker activeMQBroker;
  private int activeMQPort;
  /** Communicator that sends/receives BSP messages each super step. */
  private CommunicatorInterface communicator;

  // split information (serialized InputSplit handed to this staff)
  private BytesWritable rawSplit = new BytesWritable();
  // class name of the split; the literal "no" means this staff gets no split
  private String rawSplitClass;

  /** The local partition of the graph data this staff computes on. */
  private GraphDataInterface graphData;

  // <partitionID--hostName:port1-port2>
  private HashMap<Integer, String> partitionToWorkerManagerHostWithPorts =
      new HashMap<Integer, String>();
  private HashMap<Integer, Integer> hashBucketToPartition = null;

  // variable for barrier
  private StaffSSControllerInterface sssc;
  private int staffNum = 0;
  // NOTE(review): "workerMangerNum" looks like a typo of "workerManagerNum",
  // kept as-is because other code in this file references this exact name.
  private int workerMangerNum = 0;
  private int localBarrierNum = 0;

  // variable for local computation
  private int maxSuperStepNum = 0;
  private int currentSuperStepCounter = 0;
  private long activeCounter = 0;
  // loop flag for the super-step loop in run(); cleared on a STOP command
  private boolean flag = true;
  private SuperStepCommand ssc;

  // For Partition
  private Partitioner<Text> partitioner;
  // replication factor used by the balancer write partition; recomputed in
  // loadData() as 1 / balance-factor
  private int numCopy = 100;
  // counter of vertices received from other staffs that failed to parse
  private int lost = 0;

  // For Aggregation
  /** Map for user registered aggregate values. */
  private HashMap<String, Class<? extends AggregateValue<?>>> nameToAggregateValue =
      new HashMap<String, Class<? extends AggregateValue<?>>>();
  /** Map for user registered aggregators. */
  private HashMap<String, Class<? extends Aggregator<?>>> nameToAggregator =
      new HashMap<String, Class<? extends Aggregator<?>>>();
  // Map to cache of the aggregate values aggregated for each vertex.
  @SuppressWarnings("unchecked")
  private HashMap<String, AggregateValue> aggregateValues =
      new HashMap<String, AggregateValue>();
  // Map to instance of the aggregate values for the current vertex.
  @SuppressWarnings("unchecked")
  private HashMap<String, AggregateValue> aggregateValuesCurrent =
      new HashMap<String, AggregateValue>();
  // Map to cache of the aggregate values calculated last super step.
  @SuppressWarnings("unchecked")
  private HashMap<String, AggregateValue> aggregateResults =
      new HashMap<String, AggregateValue>();

  private RecordParse recordParse = null;

  private static final Log LOG = LogFactory.getLog(BSPStaff.class);
  // number of recovery attempts performed so far by saveResult()/recovery()
  private int recoveryTimes = 0;

  /** Default constructor, required for reflective instantiation. */
  public BSPStaff() {
  }

  /**
   * Constructs a staff bound to a concrete job, partition and input split.
   *
   * @param jobId      identifier of the owning BSP job
   * @param jobFile    path of the job description file
   * @param staffId    attempt id of this staff
   * @param partition  partition index this staff is responsible for
   * @param splitClass class name of the serialized input split ("no" if none)
   * @param split      the serialized input split bytes
   */
  public BSPStaff(BSPJobID jobId, String jobFile, StaffAttemptID staffId,
      int partition, String splitClass, BytesWritable split) {
    this.jobId = jobId;
    this.jobFile = jobFile;
    this.sid = staffId;
    this.partition = partition;
    this.rawSplitClass = splitClass;
    this.rawSplit = split;
  }

  public int getStaffNum() {
    return staffNum;
  }

  public int getNumCopy() {
    return numCopy;
  }

  public void setNumCopy(int numCopy) {
    this.numCopy = numCopy;
  }

  public HashMap<Integer, Integer> getHashBucketToPartition() {
    return this.hashBucketToPartition;
  }

  public void setHashBucketToPartition(
      HashMap<Integer, Integer> hashBucketToPartition) {
    this.hashBucketToPartition = hashBucketToPartition;
  }

  public GraphDataInterface getGraphData() {
    return graphData;
  }

  public void setGraphData(GraphDataInterface graph) {
    this.graphData = graph;
  }

  @Override
  public BSPStaffRunner createRunner(WorkerManager workerManager) {
    return new BSPStaffRunner(this, workerManager, this.bspJob);
  }

  /**
   * loadData: load data for the staff.
   *
   * Rebuilds the input split and InputFormat from the job configuration,
   * partitions the input records across staffs, and synchronizes with the
   * other staffs through load-data barriers.
   *
   * @param job         the job being executed
   * @param workerAgent protocol used to report to the worker manager
   * @param aStaffAgent RPC agent of this staff, handed to the write partition
   * @return true when loading completed
   * @throws ClassNotFoundException if the split class cannot be resolved
   * @throws IOException on split deserialization or read failure
   * @throws InterruptedException if a barrier wait is interrupted
   */
  @SuppressWarnings("unchecked")
  public boolean loadData(BSPJob job, WorkerAgentProtocol workerAgent,
      WorkerAgentForStaffInterface aStaffAgent) throws ClassNotFoundException,
      IOException, InterruptedException {
    // rebuild the input split
    RecordReader input = null;
    org.apache.hadoop.mapreduce.InputSplit split = null;
    if (rawSplitClass.equals("no")) {
      // "no" marks a staff without an input split of its own
      input = null;
    } else {
      DataInputBuffer splitBuffer = new DataInputBuffer();
      splitBuffer.reset(rawSplit.getBytes(), 0, rawSplit.getLength());
      SerializationFactory factory = new SerializationFactory(
          job.getConf());
Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> deserializer = ( Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> ) factory .getDeserializer(job.getConf() .getClassByName(rawSplitClass)); deserializer.open(splitBuffer); split = deserializer.deserialize(null); // rebuild the InputFormat class according to the user configuration InputFormat inputformat = ( InputFormat ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_INPUT_FORMAT_CLASS, InputFormat.class), job.getConf()); inputformat.initialize(job.getConf()); input = inputformat.createRecordReader(split, job); input.initialize(split, job.getConf()); } SuperStepReportContainer ssrc = new SuperStepReportContainer(); ssrc.setPartitionId(this.partition); this.numCopy = ( int ) (1 / (job.getConf().getFloat( Constants.USER_BC_BSP_JOB_BALANCE_FACTOR, ( float ) Constants.USER_BC_BSP_JOB_BALANCE_FACTOR_DEFAULT))); ssrc.setNumCopy(numCopy); ssrc.setCheckNum(this.staffNum); StaffSSControllerInterface sssc = new StaffSSController(this.jobId, this.sid, workerAgent); long start = System.currentTimeMillis(); LOG.info("in BCBSP with PartitionType is: Hash" + " start time:" + start); if (this.staffNum == 1 || job.getConf().getBoolean(Constants.USER_BC_BSP_JOB_ISDIVIDE, false)) { this.partitioner = ( Partitioner<Text> ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job .getConf()); this.partitioner.setNumPartition(this.staffNum); this.partitioner.intialize(job, split); WritePartition writePartition = new NotDivideWritePartition(); RecordParse recordParse = ( RecordParse ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, RecordParseDefault.class), job .getConf()); recordParse.init(job); writePartition.setRecordParse(recordParse); writePartition.setStaff(this); writePartition.write(input); ssrc.setDirFlag(new String[] { "1" }); 
ssrc.setCheckNum(this.staffNum); sssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); LOG.info("The number of verteices from other staff that cound not be parsed:" + this.lost); LOG.info("in BCBSP with PartitionType is:HASH" + " the number of HeadNode in this partition is:" + graphData.sizeForAll()); graphData.finishAdd(); ssrc.setCheckNum(this.staffNum * 2); ssrc.setDirFlag(new String[] { "2" }); sssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); } else { this.partitioner = ( Partitioner<Text> ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job .getConf()); WritePartition writePartition = ( WritePartition ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_WRITEPARTITION_CLASS, HashWritePartition.class), job .getConf()); int multiple = 1; if (writePartition instanceof HashWithBalancerWritePartition) { this.partitioner.setNumPartition(this.staffNum * numCopy); multiple = 2; } else { this.partitioner.setNumPartition(this.staffNum); multiple = 1; } this.partitioner.intialize(job, split); RecordParse recordParse = ( RecordParse ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, RecordParseDefault.class), job .getConf()); recordParse.init(job); this.recordParse = ( RecordParse ) ReflectionUtils.newInstance( job.getConf().getClass( Constants.USER_BC_BSP_JOB_RECORDPARSE_CLASS, RecordParseDefault.class), job.getConf()); this.recordParse.init(job); writePartition.setPartitioner(partitioner); writePartition.setRecordParse(recordParse); writePartition.setStaff(this); writePartition.setWorkerAgent(aStaffAgent); writePartition.setSsrc(ssrc); writePartition.setSssc(sssc); writePartition.setTotalCatchSize(job.getConf().getInt( Constants.USER_BC_BSP_JOB_TOTALCACHE_SIZE, Constants.USER_BC_BSP_JOB_TOTALCACHE_SIZE_DEFAULT)); int threadNum = job.getConf().getInt( Constants.USER_BC_BSP_JOB_SENDTHREADNUMBER, 
Constants.USER_BC_BSP_JOB_SENDTHREADNUMBER_DEFAULT); if (threadNum > this.staffNum) threadNum = this.staffNum - 1; writePartition.setSendThreadNum(threadNum); writePartition.write(input); ssrc.setDirFlag(new String[] { "1" }); ssrc.setCheckNum(this.staffNum * multiple); sssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); LOG.info("The number of verteices from other staff that cound not be parsed:" + this.lost); LOG.info("in BCBSP with PartitionType is:HASH" + " the number of HeadNode in this partition is:" + graphData.sizeForAll()); graphData.finishAdd(); ssrc.setCheckNum(this.staffNum * (multiple + 1)); ssrc.setDirFlag(new String[] { "2" }); sssc.loadDataBarrier(ssrc, Constants.PARTITION_TYPE.HASH); } long end = System.currentTimeMillis(); LOG.info("in BCBSP with PartitionType is:HASH" + " end time:" + end); LOG.info("in BCBSP with PartitionType is:HASH" + " using time:" + ( float ) (end - start) / 1000 + " seconds"); return true; } /** * saveResult: save the local computation result on the HDFS(SequenceFile) * * @param job * @param staff * @return boolean * @throws IOException */ @SuppressWarnings("unchecked") public boolean saveResult(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent) { try { OutputFormat outputformat = ( OutputFormat ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_OUTPUT_FORMAT_CLASS, OutputFormat.class), job.getConf()); outputformat.initialize(job.getConf()); RecordWriter output = outputformat .getRecordWriter(job, this.sid); for (int i = 0; i < graphData.sizeForAll(); i++) { Vertex<?, ?, Edge> vertex = graphData.getForAll(i); StringBuffer outEdges = new StringBuffer(); for (Edge edge : vertex.getAllEdges()) { outEdges.append(edge.getVertexID() + Constants.SPLIT_FLAG + edge.getEdgeValue() + Constants.SPACE_SPLIT_FLAG); } if (outEdges.length() > 0) { int j = outEdges.length(); outEdges.delete(j - 1, j - 1); } output.write(new Text(vertex.getVertexID() + Constants.SPLIT_FLAG + 
vertex.getVertexValue()), new Text(outEdges.toString())); } output.close(job); graphData.clean(); } catch (Exception e) { LOG.error("Exception has been catched in BSPStaff--saveResult !", e); BSPConfiguration conf = new BSPConfiguration(); if(this.recoveryTimes < conf.getInt(Constants.BC_BSP_JOB_RECOVERY_ATTEMPT_MAX, 0)) { recovery(job, staff, workerAgent); } else { workerAgent.setStaffStatus( this.sid, Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.DISK, Fault.Level.INDETERMINATE, workerAgent .getWorkerManagerName(job.getJobID(), this.sid), e.toString(), job .toString(), this.sid.toString()), 2); LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*"); LOG.error("Other Exception has happened and been catched, " + "the exception will be reported to WorkerManager", e); } } return true; } public boolean recovery(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent) { this.recoveryTimes ++; boolean success = saveResult(job, staff, workerAgent); if(success == true) { return true; } else { return false; } } // Just for testing public void displayFirstRoute() { for (Entry<Integer, String> e : this.partitionToWorkerManagerNameAndPort .entrySet()) { LOG.info("partitionToWorkerManagerName : " + e.getKey() + " " + e.getValue()); } } // Just for testing public void displaySecondRoute() { for (Entry<Integer, Integer> e : this.hashBucketToPartition.entrySet()) { LOG.info("partitionToRange : " + e.getKey() + " " + e.getValue()); } } public int getLocalBarrierNumber(String hostName) { int localBarrierNumber = 0; for (Entry<Integer, String> entry : this.partitionToWorkerManagerNameAndPort .entrySet()) { String workerManagerName = entry.getValue().split(":")[0]; if (workerManagerName.equals(hostName)) { localBarrierNumber++; } } return localBarrierNumber; } private boolean deleteOldCheckpoint(int oldCheckpoint, BSPJob job) { LOG.info("deleteOldCheckpoint--oldCheckpoint: " + oldCheckpoint); try { Configuration conf = new Configuration(); 
BSPConfiguration bspConf = new BSPConfiguration(); String uri = bspConf.get(Constants.BC_BSP_HDFS_NAME) + job.getConf().get(Constants.BC_BSP_CHECKPOINT_WRITEPATH) + "/" + job.getJobID().toString() + "/" + oldCheckpoint + "/"; FileSystem fs = FileSystem.get(URI.create(uri), conf); if (fs.exists(new Path(uri))) { fs.delete(new Path(uri), true); } } catch (IOException e) { LOG.error("Exception has happened and been catched!", e); return false; } return true; } @SuppressWarnings("unchecked") @Override /** * run the local computation. * @param job * @param staff * @param workerAgent * @param recovery * @param changeWorkerState * @param failCounter * @param hostName * @return * Review comment: * (1) The codes inside this method are too messy. * Review time: 2011-11-30 * Reviewer: Hongxu Zhang. * * Fix log: * (1) To make the codes neat and well-organized, I use more empty lines and annotations * to organize the codes. * Fix time: 2011-12-1 * Programmer: Hu Zheng. */ public void run(BSPJob job, Staff staff, WorkerAgentProtocol workerAgent, boolean recovery, boolean changeWorkerState, int failCounter, String hostName) { // record the number of failures of this staff LOG.info("BSPStaff---run()--changeWorkerState: " + changeWorkerState); staff.setFailCounter(failCounter); LOG.info("[HostName] " + hostName); // initialize the relative variables this.bspJob = job; long start = 0, end = 0; int superStepCounter = 0; this.maxSuperStepNum = job.getNumSuperStep(); this.staffNum = job.getNumBspStaff(); SuperStepReportContainer ssrc = new SuperStepReportContainer(); ssrc.setPartitionId(this.partition); sssc = new StaffSSController(this.jobId, this.sid, workerAgent); Checkpoint cp = new Checkpoint(job); if(graphDataFactory == null) graphDataFactory = new GraphDataFactory(job.getConf()); try { if (recovery == false) { //if it is a recovery staff // schedule Staff Barrier ssrc.setCheckNum(this.staffNum); int partitionRPCPort = workerAgent.getFreePort(); ssrc.setPort1(partitionRPCPort); 
this.activeMQPort = workerAgent.getFreePort(); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff] Get the port for partitioning RPC is : " + partitionRPCPort + "!"); LOG.info("[BSPStaff] Get the port for ActiveMQ Broker is : " + this.activeMQPort + "!"); this.partitionToWorkerManagerHostWithPorts = sssc.scheduleBarrier(ssrc); //record the map from partitions to workermanagers for (Integer e : this.partitionToWorkerManagerHostWithPorts.keySet()) { String[] nameAndPorts = this.partitionToWorkerManagerHostWithPorts.get(e).split(":"); String[] ports = nameAndPorts[1].split("-"); this.partitionToWorkerManagerNameAndPort.put(e, nameAndPorts[0] + ":" + ports[1]); } // For partition and for WorkerManager to invoke rpc method of Staff. this.staffAgent = new WorkerAgentForStaff(job.getConf()); workerAgent.setStaffAgentAddress(this.sid, this.staffAgent.address()); //initialize the number of local staffs and the number of workers of the same job this.localBarrierNum = getLocalBarrierNumber(hostName); this.workerMangerNum = workerAgent.getNumberWorkers(this.jobId, this.sid); displayFirstRoute(); // load Data for the staff /** * Review comment: * there are too many if else structure * which may lead to a obstacle against extension and reusing * Review time: 2011-11-30 * Reviewer: HongXu Zhang * * Fix log: * we use the factory pattern to implement the creation of a graph data object * Fix time: 2011-12-2 * Programmer: Hu Zheng */ int version = job.getGraphDataVersion(); this.graphData = this.graphDataFactory.createGraphData(version, this); /** Clock */ start = System.currentTimeMillis(); loadData(job, workerAgent, this.staffAgent); end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <load Data> used " + (end - start) / 1000f + " seconds"); } else { LOG.info("The recoveried staff begins to read checkpoint"); LOG.info("The fault SuperStepCounter is : " + job.getInt("staff.fault.superstep", 0)); //schedule a barrier this.ssc = 
sssc.secondStageSuperStepBarrierForRecovery(job.getInt("staff.fault.superstep", 0)); this.setPartitionToWorkerManagerNameAndPort(ssc.getPartitionToWorkerManagerNameAndPort()); this.localBarrierNum = getLocalBarrierNumber(hostName); ArrayList<String> tmp = new ArrayList<String>(); for (String str : this.partitionToWorkerManagerNameAndPort.values()) { if (!tmp.contains(str)) { tmp.add(str); } } workerAgent.setNumberWorkers(this.jobId, this.sid, tmp.size()); tmp.clear(); this.workerMangerNum = workerAgent.getNumberWorkers(this.jobId, this.sid); this.currentSuperStepCounter = ssc.getAbleCheckPoint(); // clean first int version = job.getGraphDataVersion(); this.graphData = this.graphDataFactory.createGraphData(version, this); this.graphData.clean(); this.graphData = cp.readCheckPoint(new Path(ssc.getInitReadPath()), job, staff); ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.READ_CHECKPOINT_STAGE); ssrc.setDirFlag(new String[] { "read" }); ssrc.setCheckNum(this.workerMangerNum * 1); // Get the new port of ActiveMQ. 
this.activeMQPort = workerAgent.getFreePort(); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff] ReGet the port for ActiveMQ Broker is : " + this.activeMQPort + "!"); this.partitionToWorkerManagerNameAndPort = sssc.checkPointStageSuperStepBarrier( this.currentSuperStepCounter, ssrc); displayFirstRoute(); this.currentSuperStepCounter = ssc.getNextSuperStepNum(); this.partitioner = ( Partitioner<Text> ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_PARTITIONER_CLASS, HashPartitioner.class), job .getConf()); WritePartition writePartition = ( WritePartition ) ReflectionUtils .newInstance( job.getConf() .getClass( Constants.USER_BC_BSP_JOB_WRITEPARTITION_CLASS, HashWritePartition.class), job .getConf()); if (writePartition instanceof HashWithBalancerWritePartition) { this.partitioner.setNumPartition(this.staffNum * numCopy); } else { this.partitioner.setNumPartition(this.staffNum); } org.apache.hadoop.mapreduce.InputSplit split = null; if (rawSplitClass.equals("no")) { } else { DataInputBuffer splitBuffer = new DataInputBuffer(); splitBuffer.reset(rawSplit.getBytes(), 0, rawSplit.getLength()); SerializationFactory factory = new SerializationFactory( job.getConf()); Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> deserializer = ( Deserializer<? 
extends org.apache.hadoop.mapreduce.InputSplit> ) factory .getDeserializer(job.getConf().getClassByName( rawSplitClass)); deserializer.open(splitBuffer); split = deserializer.deserialize(null); } this.partitioner.intialize(job, split); displayFirstRoute(); } } catch (ClassNotFoundException cnfE) { LOG.error("Exception has been catched in BSPStaff--run--before local computing !", cnfE); workerAgent.setStaffStatus( staff.getStaffAttemptId(), Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.CRITICAL, workerAgent.getWorkerManagerName(job.getJobID(), staff.getStaffAttemptId()), cnfE.toString(), job.toString(), staff .getStaffAttemptId().toString()), 0); return; } catch (IOException ioE) { LOG.error("Exception has been catched in BSPStaff--run--before local computing !", ioE); workerAgent.setStaffStatus( staff.getStaffAttemptId(), Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.DISK, Fault.Level.INDETERMINATE, workerAgent.getWorkerManagerName(job.getJobID(), staff.getStaffAttemptId()), ioE.toString(), job.toString(), staff.getStaffAttemptId() .toString()), 0); return; } catch (InterruptedException iE) { LOG.error("Exception has been catched in BSPStaff--run--before local computing !", iE); workerAgent.setStaffStatus( staff.getStaffAttemptId(), Constants.SATAFF_STATUS.FAULT, new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.CRITICAL, workerAgent.getWorkerManagerName(job.getJobID(), staff.getStaffAttemptId()), iE.toString(), job.toString(), staff.getStaffAttemptId() .toString()), 0); return; } BSP bsp = ( BSP ) ReflectionUtils.newInstance( job.getConf().getClass(Constants.USER_BC_BSP_JOB_WORK_CLASS, BSP.class), job.getConf()); /** Clock */ start = System.currentTimeMillis(); // load aggregators and aggregate values. 
loadAggregators(job); end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <loadAggregators> used " + (end - start) / 1000f + " seconds"); try { // configuration before local computation bsp.setup(staff); /** Clock */ start = System.currentTimeMillis(); // Start an ActiveMQ Broker, create a communicator, initialize it, and start it startActiveMQBroker(hostName); this.communicator = new Communicator(this.jobId, job, this.getPartition(), partitioner); this.communicator.initialize(this.getHashBucketToPartition(), this.getPartitionToWorkerManagerNameAndPort(), this.graphData); this.communicator.start(); end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <Initialize Communicator> used " + (end - start) / 1000f + " seconds"); // begin local computation while (this.flag) { this.activeCounter = 0; if (recovery == false) { superStepCounter = this.currentSuperStepCounter; } else { superStepCounter = 0; recovery = false; } // Begin the communicator.From this moment, // the parallel sending and receiving threads have begun. this.communicator.begin(superStepCounter); // Initialize before each super step. SuperStepContext ssContext = new SuperStepContext(job, superStepCounter); publishAggregateValues(ssContext); bsp.initBeforeSuperStep(ssContext); initBeforeSuperStepForAggregateValues(ssContext); /** Clock */ start = System.currentTimeMillis(); LOG.info("BSPStaff--run: superStepCounter: " + superStepCounter); long loadGraphTime = 0; long aggregateTime = 0; long computeTime = 0; long collectMsgsTime = 0; int tmpCounter = this.graphData.sizeForAll(); for (int i = 0; i < tmpCounter; i++) { /** Clock */ long tmpStart = System.currentTimeMillis(); Vertex vertex = graphData.getForAll(i); if (vertex == null) { LOG.error("Fail to get the HeadNode of index[" + i + "] " + "and the system will skip the record"); continue; } loadGraphTime = loadGraphTime + (System.currentTimeMillis() - tmpStart); // Get the incomed message queue for this vertex. 
ConcurrentLinkedQueue<BSPMessage> messages = this.communicator .getMessageQueue(String.valueOf(vertex .getVertexID())); // Aggregate the new values for each vertex. Clock the time cost. tmpStart = System.currentTimeMillis(); aggregate(messages, job, vertex, this.currentSuperStepCounter); aggregateTime = aggregateTime + (System.currentTimeMillis() - tmpStart); // If the vertex is inactive and the size of messages for it is 0, skip the compute. boolean activeFlag = graphData.getActiveFlagForAll(i); if (!activeFlag && (messages.size() == 0)) continue; Iterator<BSPMessage> messagesIter = messages.iterator(); // Call the compute function for local computation. BSPStaffContext context = new BSPStaffContext(job, vertex, superStepCounter); /*Publish the total result aggregate values into the bsp's cache * for the user's function's accession in the next super step.*/ publishAggregateValues(context); /** Clock */ tmpStart = System.currentTimeMillis(); bsp.compute(messagesIter, context); computeTime = computeTime + (System.currentTimeMillis() - tmpStart); /** Clock */ messages.clear(); // Write the new vertex value to the graph. this.graphData.set(i, context.getVertex(), context.getActiveFLag()); /** Clock */ tmpStart = System.currentTimeMillis(); // Collect the messages sent by this node. 
collectMessages(context); collectMsgsTime = collectMsgsTime + (System.currentTimeMillis() - tmpStart); /** Clock */ }// end-for LOG.info("[BSPStaff] Vertex computing is over for the super step <" + this.currentSuperStepCounter + ">"); end = System.currentTimeMillis(); /** Clocks */ LOG.info("[==>Clock<==] <Vertex computing> used " + (end - start) / 1000f + " seconds"); LOG.info("[==>Clock<==] ...(Load Graph Data Time) used " + loadGraphTime / 1000f + " seconds"); LOG.info("[==>Clock<==] ...(Aggregate Time) used " + aggregateTime / 1000f + " seconds"); LOG.info("[==>Clock<==] ...(Compute Time) used " + computeTime / 1000f + " seconds"); LOG.info("[==>Clock<==] ...(Collect Messages Time) used " + collectMsgsTime / 1000f + " seconds"); /** Clocks */ /** Clock */ start = System.currentTimeMillis(); // Notify the communicator that there will be no more messages // for sending. this.communicator.noMoreMessagesForSending(); // Wait for all of the messages have been sent over. while (true) { if (this.communicator.isSendingOver()) break; } end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <Wait for sending over> used " + (end - start)/1000f + " seconds"); /**Clock*/ LOG.info("===========Sending Over============"); /** Clock */ start = end; // Barrier for sending messages over. ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.FIRST_STAGE);// ssrc.setDirFlag(new String[] { "1" }); ssrc.setCheckNum(this.workerMangerNum); sssc.firstStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); end = System.currentTimeMillis(); /** Clock */ LOG.info("[==>Clock<==] <Sending over sync> used " + (end - start) / 1000f + " seconds"); /** Clock */ start = end; // Notify the communicator that there will be no more messages // for receiving. Wait for the receiving thread died. 
this.communicator.noMoreMessagesForReceiving(); while (true) { if (this.communicator.isReceivingOver()) break; } end = System.currentTimeMillis(); /**Clock*/ LOG.info("[==>Clock<==] <Wait for receiving over> used " + (end - start)/1000f + " seconds"); LOG.info("===========Receiving Over==========="); /**Clock*/ start = end; // Barrier for receiving messages over. ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.FIRST_STAGE); ssrc.setDirFlag(new String[] { "2" }); ssrc.setCheckNum(this.workerMangerNum * 2); sssc.firstStageSuperStepBarrier(this.currentSuperStepCounter, ssrc); end = System.currentTimeMillis(); /** Clock */ LOG.info("[==>Clock<==] <Receiving over sync> used " + (end - start) / 1000f + " seconds"); this.graphData.showMemoryInfo(); // Exchange the incoming and incomed queues. this.communicator.exchangeIncomeQueues(); LOG.info("[BSPStaff] Communicator has received " + this.communicator.getIncomedQueuesSize() + " messages totally for the super step <" + this.currentSuperStepCounter + ">"); // decide whether to continue the next super-step or not if ((this.currentSuperStepCounter + 1) >= this.maxSuperStepNum) { this.communicator.clearOutgoingQueues(); this.communicator.clearIncomedQueues(); this.activeCounter = 0; } else { this.activeCounter = this.graphData.getActiveCounter(); } LOG.info("[Active Vertex]" + this.activeCounter); /** Clock */ start = System.currentTimeMillis(); // Encapsulate the aggregate values into String[]. String[] aggValues = encapsulateAggregateValues(); end = System.currentTimeMillis(); LOG.info("[==>Clock<==] <Encapsulate aggregate values> used " + (end - start) / 1000f + " seconds"); /** Clock */ /** Clock */ start = end; // Set the aggregate values into the super step report container. 
ssrc.setAggValues(aggValues); ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.SECOND_STAGE); LOG.info("[WorkerManagerNum]" + this.workerMangerNum); ssrc.setCheckNum(this.workerMangerNum + 1); ssrc.setJudgeFlag(this.activeCounter + this.communicator.getIncomedQueuesSize()); this.ssc = sssc.secondStageSuperStepBarrier( this.currentSuperStepCounter, ssrc); LOG.info("[==>Clock<==] <StaffSSController's rebuild session> used " + StaffSSController.rebuildTime / 1000f + " seconds"); StaffSSController.rebuildTime = 0; if (ssc.getCommandType() == Constants.COMMAND_TYPE.START_AND_RECOVERY) { LOG.info("[Command]--[routeTableSize]" + ssc.getPartitionToWorkerManagerNameAndPort() .size()); this.setPartitionToWorkerManagerNameAndPort(ssc .getPartitionToWorkerManagerNameAndPort()); ArrayList<String> tmp = new ArrayList<String>(); for (String str : this.partitionToWorkerManagerNameAndPort .values()) { if (!tmp.contains(str)) { tmp.add(str); } } this.localBarrierNum = getLocalBarrierNumber(hostName); workerAgent.setNumberWorkers(this.jobId, this.sid, tmp.size()); tmp.clear(); this.workerMangerNum = workerAgent.getNumberWorkers( this.jobId, this.sid); displayFirstRoute(); } end = System.currentTimeMillis(); /** Clock */ LOG.info("[==>Clock<==] <SuperStep sync> used " + (end - start) / 1000f + " seconds"); /** Clock */ start = end; // Get the aggregate values from the super step command. // Decapsulate the aggregate values from String[]. 
aggValues = this.ssc.getAggValues(); if (aggValues != null) { decapsulateAggregateValues(aggValues); } end = System.currentTimeMillis(); /** Clock */ LOG.info("[==>Clock<==] <Decapsulate aggregate values> used " + (end - start) / 1000f + " seconds"); /** Clock */ start = end; switch (ssc.getCommandType()) { case Constants.COMMAND_TYPE.START: LOG.info("Get the CommandTye is : START"); this.currentSuperStepCounter = ssc .getNextSuperStepNum(); this.flag = true; break; case Constants.COMMAND_TYPE.START_AND_CHECKPOINT: LOG.info("Get the CommandTye is : START_AND_CHECKPOINT"); boolean success = cp.writeCheckPoint(this.graphData, new Path(ssc.getInitWritePath()), job, staff); if (success == true) { deleteOldCheckpoint(ssc.getOldCheckPoint(), job); } ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.WRITE_CHECKPOINT_SATGE); ssrc.setDirFlag(new String[] { "write" }); ssrc.setCheckNum(this.workerMangerNum * 3); sssc.checkPointStageSuperStepBarrier( this.currentSuperStepCounter, ssrc); this.currentSuperStepCounter = ssc .getNextSuperStepNum(); this.flag = true; break; case Constants.COMMAND_TYPE.START_AND_RECOVERY: LOG.info("Get the CommandTye is : START_AND_RECOVERY"); this.currentSuperStepCounter = ssc.getAbleCheckPoint(); // clean first int version = job.getGraphDataVersion(); this.graphData = this.graphDataFactory.createGraphData(version, this); this.graphData.clean(); this.graphData = cp.readCheckPoint( new Path(ssc.getInitReadPath()), job, staff); ssrc.setPartitionId(this.partition); ssrc.setLocalBarrierNum(this.localBarrierNum); ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.READ_CHECKPOINT_STAGE); ssrc.setDirFlag(new String[] { "read" }); ssrc.setCheckNum(this.workerMangerNum * 1); ssrc.setPort2(this.activeMQPort); LOG.info("[BSPStaff] Get the port for ActiveMQ Broker is : " + this.activeMQPort + "!"); this.partitionToWorkerManagerNameAndPort = sssc.checkPointStageSuperStepBarrier( this.currentSuperStepCounter, ssrc); 
  // Push the refreshed route table into the communicator and clear any
  // stale message queues left over from before the failure.
  displayFirstRoute();
  this.communicator.setPartitionToWorkerManagerNamePort(
      this.partitionToWorkerManagerNameAndPort);
  this.currentSuperStepCounter = ssc.getNextSuperStepNum();
  this.communicator.clearOutgoingQueues();
  this.communicator.clearIncomedQueues();
  recovery = true;
  this.flag = true;
  break;
case Constants.COMMAND_TYPE.STOP:
  // Job finished: leave the superstep loop and save the result.
  LOG.info("Get the CommandTye is : STOP");
  LOG.info("Staff will save the computation result and then quit!");
  this.currentSuperStepCounter = ssc.getNextSuperStepNum();
  this.flag = false;
  break;
default:
  // Unknown command: behave like STOP but log an error.
  LOG.error("ERROR! " + ssc.getCommandType()
      + " is not a valid CommandType, so the staff will save the "
      + "computation result and quit!");
  flag = false;
}
// Report the status at every superstep.
workerAgent.setStaffStatus(this.sid,
    Constants.SATAFF_STATUS.RUNNING, null, 1);
}
this.communicator.complete();
} catch (IOException ioe) {
  // Disk/IO failure during local computation: report a CRITICAL DISK fault
  // to the WorkerManager and abort this staff.
  LOG.error("Exception has been catched in BSPStaff--run--during local computing !", ioe);
  workerAgent.setStaffStatus(
      this.sid,
      Constants.SATAFF_STATUS.FAULT,
      new Fault(Fault.Type.DISK, Fault.Level.CRITICAL,
          workerAgent.getWorkerManagerName(job.getJobID(), this.sid),
          ioe.toString(), job.toString(), this.sid.toString()), 1);
  LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*");
  LOG.error("IO Exception has happened and been catched, "
      + "the exception will be reported to WorkerManager", ioe);
  LOG.error("Staff will quit abnormally");
  return;
} catch (Exception e) {
  // Any other failure: report an INDETERMINATE system-service fault.
  LOG.error("Exception has been catched in BSPStaff--run--during local computing !", e);
  workerAgent.setStaffStatus(
      this.sid,
      Constants.SATAFF_STATUS.FAULT,
      new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.INDETERMINATE,
          workerAgent
              .getWorkerManagerName(job.getJobID(), this.sid), e.toString(), job
              .toString(), this.sid.toString()), 1);
  LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*");
  LOG.error("Other Exception has happened and been catched, "
      + "the exception will be reported to WorkerManager",
      e);
  LOG.error("Staff will quit abnormally");
  return;
}
// save the computation result
try {
  // Persist the final graph state, then pass the save-result barrier so the
  // controller knows every staff has written its output.
  saveResult(job, staff, workerAgent);
  ssrc.setLocalBarrierNum(this.localBarrierNum);
  ssrc.setStageFlag(Constants.SUPERSTEP_STAGE.SAVE_RESULT_STAGE);
  ssrc.setDirFlag(new String[] { "1", "2", "write", "read" });
  sssc.saveResultStageSuperStepBarrier(this.currentSuperStepCounter, ssrc);
  // cleanup after local computation
  bsp.cleanup(staff);
  stopActiveMQBroker();
  done(workerAgent);
  workerAgent.setStaffStatus(this.sid,
      Constants.SATAFF_STATUS.SUCCEED, null, 1);
  LOG.info("The max SuperStep num is " + this.maxSuperStepNum);
  LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*");
  LOG.info("Staff is completed successfully");
} catch (Exception e) {
  // Failure while saving results: report a fault with stage code 2.
  LOG.error("Exception has been catched in BSPStaff--run--after local computing !", e);
  workerAgent.setStaffStatus(
      this.sid,
      Constants.SATAFF_STATUS.FAULT,
      new Fault(Fault.Type.SYSTEMSERVICE, Fault.Level.INDETERMINATE,
          workerAgent
              .getWorkerManagerName(job.getJobID(), this.sid), e.toString(), job
              .toString(), this.sid.toString()), 2);
  LOG.info("=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*");
  LOG.error("Other Exception has happened and been catched, "
      + "the exception will be reported to WorkerManager", e);
}
}

/** Returns the job configuration bound to this staff. */
public BSPJob getConf() {
  return this.bspJob;
}

/** Binds the job configuration for this staff. */
public void setConf(BSPJob bspJob) {
  this.bspJob = bspJob;
}

/** Write and read split info to WorkerManager */
@Override
public void write(DataOutput out) throws IOException {
  // Serialize the base staff fields plus the raw input-split class and bytes.
  super.write(out);
  Text.writeString(out, rawSplitClass);
  rawSplit.write(out);
}

@Override
public void readFields(DataInput in) throws IOException {
  // Symmetric with write(): restore base fields, then the raw split.
  super.readFields(in);
  rawSplitClass = Text.readString(in);
  rawSplit.readFields(in);
}

/**
 * Drains the messages cached in the context and hands each one to the
 * communicator for delivery; send failures are logged and the loop continues.
 */
private void collectMessages(BSPStaffContext context) {
  Iterator<BSPMessage> it = context.getMessages();
  while (it.hasNext()) {
    try {
      this.communicator.send(it.next());
    } catch (Exception e) {
      LOG.error("<collectMessages>", e);
    }
  }
  // Clear the context's message cache now that everything has been handed off.
  context.cleanMessagesCache();
}

/** Returns the partition-to-worker route table (partitionID -> host:ports). */
public HashMap<Integer, String> getPartitionToWorkerManagerNameAndPort() {
  return this.partitionToWorkerManagerNameAndPort;
}

/**
 * Registers the job's aggregators: records each aggregator/value class by
 * name and pre-instantiates one AggregateValue per aggregator into
 * aggregateValuesCurrent.
 */
@SuppressWarnings("unchecked")
private void loadAggregators(BSPJob job) {
  int aggregateNum = job.getAggregateNum();
  String[] aggregateNames = job.getAggregateNames();
  for (int i = 0; i < aggregateNum; i++) {
    String name = aggregateNames[i];
    this.nameToAggregator.put(name, job.getAggregatorClass(name));
    this.nameToAggregateValue.put(name, job.getAggregateValueClass(name));
  }
  try {
    // Instanciate each aggregate values.
    for (Entry<String, Class<? extends AggregateValue<?>>> entry : this.nameToAggregateValue
        .entrySet()) {
      String aggName = entry.getKey();
      AggregateValue aggValue;
      aggValue = entry.getValue().newInstance();
      this.aggregateValuesCurrent.put(aggName, aggValue);
    }
  } catch (InstantiationException e) {
    LOG.error("[BSPStaff:loadAggregators]", e);
  } catch (IllegalAccessException e) {
    LOG.error("[BSPStaff:loadAggregators]", e);
  }
}

/** Lets every current aggregate value reset itself before a new superstep. */
@SuppressWarnings("unchecked")
private void initBeforeSuperStepForAggregateValues(SuperStepContext ssContext) {
  for (Entry<String, AggregateValue> entry : this.aggregateValuesCurrent
      .entrySet()) {
    AggregateValue aggValue = entry.getValue();
    aggValue.initBeforeSuperStep(ssContext);
  }
}

/**
 * Folds one vertex's incoming messages into the running aggregate values
 * (this.aggregateValues) for the current superstep.
 */
@SuppressWarnings("unchecked")
private void aggregate(ConcurrentLinkedQueue<BSPMessage> messages, BSPJob job,
    Vertex vertex, int superStepCount) {
  try {
    for (Entry<String, Class<? extends AggregateValue<?>>> entry : this.nameToAggregateValue
        .entrySet()) {
      String aggName = entry.getKey();
      // Init the aggregate value for this head node.
      AggregateValue aggValue1 = this.aggregateValuesCurrent.get(aggName);
      AggregationContext aggContext = new AggregationContext(job, vertex,
          superStepCount);
      publishAggregateValues(aggContext);
      aggValue1.initValue(messages.iterator(), aggContext);
      // Get the current aggregate value.
      AggregateValue aggValue0;
      aggValue0 = this.aggregateValues.get(aggName);
      // Get the aggregator for this kind of aggregate value.
      Aggregator<AggregateValue> aggregator;
      aggregator = ( Aggregator<AggregateValue> ) this.nameToAggregator
          .get(aggName).newInstance();
      // Aggregate
      if (aggValue0 == null) { // the first time aggregate.
        // First contribution under this name: clone it as the running value.
        aggValue0 = ( AggregateValue ) aggValue1.clone();
        this.aggregateValues.put(aggName, aggValue0);
      } else {
        // Combine the running value with this vertex's contribution.
        ArrayList<AggregateValue> tmpValues = new ArrayList<AggregateValue>();
        tmpValues.add(aggValue0);
        tmpValues.add(aggValue1);
        AggregateValue aggValue = aggregator
            .aggregate(tmpValues);
        this.aggregateValues.put(aggName, aggValue);
      }
    }
  } catch (InstantiationException e) {
    LOG.error("[BSPStaff:aggregate]", e);
  } catch (IllegalAccessException e) {
    LOG.error("[BSPStaff:aggregate]", e);
  }
}

/**
 * To encapsulate the aggregation values to the String[].
 *
 * The aggValues should be in form as follows: [ AggregateName \t
 * AggregateValue.toString() ]
 *
 * @return String[]
 */
@SuppressWarnings("unchecked")
private String[] encapsulateAggregateValues() {
  int aggSize = this.aggregateValues.size();
  String[] aggValues = new String[aggSize];
  int i_a = 0;
  for (Entry<String, AggregateValue> entry : this.aggregateValues
      .entrySet()) {
    aggValues[i_a] = entry.getKey() + Constants.KV_SPLIT_FLAG
        + entry.getValue().toString();
    i_a++;
  }
  // The cache for this super step should be cleared for next super step.
  this.aggregateValues.clear();
  return aggValues;
}

/**
 * To decapsulate the aggregation values from the String[].
 *
 * The aggValues should be in form as follows: [ AggregateName \t
 * AggregateValue.toString() ]
 *
 * @param aggValues
 *        String[]
 */
@SuppressWarnings("unchecked")
private void decapsulateAggregateValues(String[] aggValues) {
  // Each record is "name<KV_SPLIT_FLAG>value"; rebuild the typed value from
  // its string form and store it into aggregateResults.
  for (int i = 0; i < aggValues.length; i++) {
    String[] aggValueRecord = aggValues[i]
        .split(Constants.KV_SPLIT_FLAG);
    String aggName = aggValueRecord[0];
    String aggValueString = aggValueRecord[1];
    AggregateValue aggValue = null;
    try {
      aggValue = this.nameToAggregateValue.get(aggName).newInstance();
      aggValue.initValue(aggValueString); // init the aggValue from
                                          // its string form.
    } catch (InstantiationException e1) {
      LOG.error("ERROR", e1);
    } catch (IllegalAccessException e1) {
      LOG.error("ERROR", e1);
    }// end-try
    if (aggValue != null) {
      this.aggregateResults.put(aggName, aggValue);
    }// end-if
  }// end-for
}

/**
 * To publish the aggregate values into the bsp's cache for user's accession
 * for the next super step.
 *
 * @param BSPStaffContext
 *        context
 */
@SuppressWarnings("unchecked")
private void publishAggregateValues(BSPStaffContext context) {
  for (Entry<String, AggregateValue> entry : this.aggregateResults
      .entrySet()) {
    context.addAggregateValues(entry.getKey(), entry.getValue());
  }
}

/**
 * To publish the aggregate values into the super step context for
 * the bsp.initBeforeSuperStep for the next super step.
 *
 * @param context
 *        SuperStepContext
 */
@SuppressWarnings("unchecked")
private void publishAggregateValues(SuperStepContext context) {
  for (Entry<String, AggregateValue> entry : this.aggregateResults
      .entrySet()) {
    context.addAggregateValues(entry.getKey(), entry.getValue());
  }
}

/**
 * To publish the aggregate values into the aggregation context for the
 * aggregation value's init of each vertex.
 *
 * @param context
 *        AggregationContext
 */
@SuppressWarnings("unchecked")
private void publishAggregateValues(AggregationContext context) {
  for (Entry<String, AggregateValue> entry : this.aggregateResults
      .entrySet()) {
    context.addAggregateValues(entry.getKey(), entry.getValue());
  }
}

/**
 * WorkerAgentForStaffInterface.java
 *
 * RPC protocol exposed by a staff so that peer staffs can locate each other
 * and push vertices (head nodes) to the partition that owns them.
 */
public interface WorkerAgentForStaffInterface extends VersionedProtocol {
  public static final long versionID = 0L;

  /**
   * This method is used to get the worker whose partition id equals
   * belongPartition.
   *
   * @param jobId the job this staff belongs to
   * @param staffId the requesting staff attempt
   * @param belongPartition target partition id
   * @return RPC proxy of the staff owning belongPartition
   */
  public WorkerAgentForStaffInterface getWorker(BSPJobID jobId,
      StaffAttemptID staffId, int belongPartition);

  /**
   * This method is used to put the HeadNode to WorkerAgentForJob's map.
   *
   * @param jobId
   * @param staffId
   * @param belongPartition
   *        the partitionID which the HeadNode belongs to
   * @param hnlist
   *        HeadNode list
   */
  public void putHeadNode(BSPJobID jobId, StaffAttemptID staffId,
      int belongPartition, BytesWritable data);

  /**
   * Get the address of this WorkerAgentForStaff.
   *
   * @return address
   */
  public String address();

  /**
   * This method will be invoked before the staff be killed, to notice the
   * staff to do some cleaning operations.
*/ public void onKillStaff(); } /** * WorkerAgentForStaff.java * * @author root * */ public class WorkerAgentForStaff implements WorkerAgentForStaffInterface { // <partitionID, hostName:port1-port2> private HashMap<Integer, String> partitionToWorkerManagerHostWithPorts = new HashMap<Integer, String>(); private final Map<InetSocketAddress, WorkerAgentForStaffInterface> workers = new ConcurrentHashMap<InetSocketAddress, WorkerAgentForStaffInterface>(); private InetSocketAddress workAddress; private Server server = null; private Configuration conf; public WorkerAgentForStaff(Configuration conf) { this.partitionToWorkerManagerHostWithPorts = BSPStaff.this.partitionToWorkerManagerHostWithPorts; this.conf = conf; String[] hostandports = this.partitionToWorkerManagerHostWithPorts.get(BSPStaff.this.partition).split(":"); LOG.info(this.partitionToWorkerManagerHostWithPorts.get(BSPStaff.this.partition)); String[] ports = hostandports[1].split("-"); workAddress = new InetSocketAddress(hostandports[0], Integer.parseInt(ports[0])); reinitialize(); } private void reinitialize() { try { LOG.info("reinitialize() the WorkerAgentForStaff: " + jobId.toString()); server = RPC.getServer(this, workAddress.getHostName(), workAddress.getPort(), conf); server.start(); LOG.info("WorkerAgentForStaff address:" + workAddress.getHostName() + " port:" + workAddress.getPort()); } catch (IOException e) { LOG.error("[reinitialize]", e); } } protected WorkerAgentForStaffInterface getWorkerAgentConnection( InetSocketAddress addr) { WorkerAgentForStaffInterface worker; synchronized (this.workers) { worker = workers.get(addr); if (worker == null) { try { worker = ( WorkerAgentForStaffInterface ) RPC.getProxy( WorkerAgentForStaffInterface.class, WorkerAgentForStaffInterface.versionID, addr, this.conf); } catch (IOException e) { LOG.error("[getWorkerAgentConnection]", e); } this.workers.put(addr, worker); } } return worker; } private InetSocketAddress getAddress(String peerName) { String[] 
    workerAddrParts = peerName.split(":");
    return new InetSocketAddress(workerAddrParts[0],
        Integer.parseInt(workerAddrParts[1]));
  }

  /**
   * This method is used to get the worker owning belongPartition: resolves
   * its "hostName:port1-port2" route entry and returns the cached (or newly
   * created) RPC proxy for that staff.
   *
   * @param jobId the job this staff belongs to
   * @param staffId the requesting staff attempt
   * @param belongPartition target partition id
   * @return RPC proxy for the staff owning belongPartition
   */
  public WorkerAgentForStaffInterface getWorker(BSPJobID jobId,
      StaffAttemptID staffId, int belongPartition) {
    String dstworkerName = null;
    dstworkerName = this.partitionToWorkerManagerHostWithPorts
        .get(belongPartition);// hostName:port1-port2
    String[] hostAndPorts = dstworkerName.split(":");
    String[] ports = hostAndPorts[1].split("-");
    // The staff-to-staff RPC server listens on the first of the two ports.
    dstworkerName = hostAndPorts[0] + ":" + ports[0];
    WorkerAgentForStaffInterface work = workers
        .get(getAddress(dstworkerName));
    if (work == null) {
      work = getWorkerAgentConnection(getAddress(dstworkerName));
    }
    return work;
  }

  /**
   * This method is used to put the HeadNode to WorkerAgentForJob's map.
   *
   * Deserializes (key, value) Text pairs from the byte payload, parses each
   * pair into a Vertex and adds it to the local graph data; unparsable
   * records only increment the lost counter. The loop stops at the first
   * empty key/value pair; hitting the raw end of the stream instead
   * presumably surfaces as the IOException caught below - TODO confirm.
   *
   * @param jobId
   * @param staffId
   * @param belongPartition
   *        the partitionID which the HeadNode belongs to
   * @param hnlist
   *        HeadNode list
   */
  @SuppressWarnings("unchecked")
  public void putHeadNode(BSPJobID jobId, StaffAttemptID staffId,
      int belongPartition, BytesWritable data) {
    DataInputStream in = new DataInputStream(new BufferedInputStream(
        new ByteArrayInputStream(data.getBytes())));
    try {
      while (true) {
        Text key = new Text();
        key.readFields(in);
        Text value = new Text();
        value.readFields(in);
        if (key.getLength() > 0 && value.getLength() > 0) {
          // NOTE(review): empty guard - if recordParse were null the next
          // statement would still NPE; confirm recordParse is always
          // initialized before RPC calls arrive.
          if (BSPStaff.this.recordParse == null) {
            //LOG.error("Test Null: BSPStaff.this.recordParse is NULL");
          }
          Vertex vertex = BSPStaff.this.recordParse.recordParse(
              key.toString(), value.toString());
          if (vertex == null) {
            BSPStaff.this.lost++;
            continue;
          }
          BSPStaff.this.graphData.addForAll(vertex);
        } else {
          break;
        }
      }
    } catch (IOException e) {
      LOG.error("ERROR", e);
    }
  }

  @Override
  public long getProtocolVersion(String arg0, long arg1) throws IOException {
    return WorkerAgentForStaffInterface.versionID;
  }

  /** Returns this agent's RPC address formatted as "host:port". */
  @Override
  public String address() {
    String hostName =
this.workAddress.getHostName(); int port = this.workAddress.getPort(); return new String(hostName + ":" + port); } @Override public void onKillStaff() { BSPStaff.this.stopActiveMQBroker(); } } private void startActiveMQBroker(String hostName) { // brokerName = "hostName-partitionID" this.activeMQBroker = new ActiveMQBroker(hostName + "-" + this.partition); try { this.activeMQBroker.startBroker(this.activeMQPort); LOG.info("[BSPStaff] starts ActiveMQ Broker successfully!"); } catch (Exception e) { LOG.error("[BSPStaff] caught: ", e); } } private void stopActiveMQBroker() { if (this.activeMQBroker != null) { try { this.activeMQBroker.stopBroker(); LOG.info("[BSPStaff] stops ActiveMQ Broker successfully!"); } catch (Exception e) { LOG.error("[BSPStaff] caught: ", e); } } } }