/**
 * CopyRight by Chinamobile
 *
 * GeneralSSController.java
 */
package com.chinamobile.bcbsp.sync;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.LogFactory;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.data.Stat;
import org.mortbay.log.Log;

import com.chinamobile.bcbsp.BSPConfiguration;
import com.chinamobile.bcbsp.Constants;
import com.chinamobile.bcbsp.bspcontroller.JobInProgressControlInterface;
import com.chinamobile.bcbsp.util.BSPJobID;

/**
 * GeneralSSController
 *
 * GeneralSSController for completing the general SuperStep synchronization
 * control. This class is connected to JobInProgress.
 *
 * <p>
 * All coordination state lives in four ZooKeeper directories below the BSP
 * root, each named after the job id with a suffix: {@code -s} (scheduler),
 * {@code -d} (load data), {@code -ss} (SuperStep barrier nodes) and
 * {@code -sc} (SuperStep reports and commands). The control loop runs in the
 * inner {@link ZooKeeperRun} thread; ZooKeeper watch events are delivered to
 * {@link #process(WatchedEvent)}, which wakes any thread blocked in one of the
 * barrier methods.
 *
 * @author
 * @version
 */
public class GeneralSSController implements Watcher,
    GeneralSSControllerInterface {

  private static final org.apache.commons.logging.Log LOG = LogFactory
      .getLog(GeneralSSController.class);

  private BSPConfiguration conf;
  private JobInProgressControlInterface jip;
  private BSPJobID jobId;
  private int superStepCounter = 0;
  private int faultSuperStepCounter = 0;
  private int checkNumBase;

  private ZooKeeper zk = null;
  private final String zookeeperAddr;
  private final String bspZKRoot;

  /**
   * Monitor used by the barrier methods and the watcher callback. It is a
   * dedicated object per controller: locking on a boxed {@code Integer}
   * constant would share one monitor between every controller in the JVM,
   * letting a watch event of one job wake (or fail to wake) another job.
   */
  private final Object mutex = new Object();

  private int stageFlag = 1;
  private ZooKeeperRun zkRun = new ZooKeeperRun();

  /**
   * The control thread: waits for the staffs at the barrier of the current
   * SuperStep, collects their reports, and publishes the resulting command.
   */
  public class ZooKeeperRun extends Thread {

    /**
     * Prepares the barrier and command directories of the next SuperStep and
     * then publishes the command under the current SuperStep's command
     * directory.
     *
     * <p>
     * If a directory for the next SuperStep already exists, its children are
     * removed so the directory can be reused.
     *
     * @param ssc
     *          the command to publish; also supplies the next SuperStep number
     * @throws Exception
     *           if any ZooKeeper operation fails
     */
    public void startNextSuperStep(SuperStepCommand ssc) throws Exception {
      int nextSuperStep = ssc.getNextSuperStepNum();
      jip.reportLOG(jobId.toString() + "the next superstepnum is : "
          + nextSuperStep);

      String nextBarrierPath = superStepPath(nextSuperStep);
      if (zk.exists(nextBarrierPath, false) == null) {
        zk.create(nextBarrierPath, new byte[0], Ids.OPEN_ACL_UNSAFE,
            CreateMode.PERSISTENT);
      } else {
        jip.reportLOG("The node hash exists" + nextBarrierPath);
        deleteChildren(nextBarrierPath);
      }

      String nextCommandPath = commandPath(nextSuperStep);
      if (zk.exists(nextCommandPath, false) == null) {
        zk.create(nextCommandPath, new byte[0], Ids.OPEN_ACL_UNSAFE,
            CreateMode.PERSISTENT);
      } else {
        deleteChildren(nextCommandPath);
      }

      // The command itself is written under the CURRENT SuperStep.
      String commandNode = commandPath(superStepCounter) + "/"
          + Constants.COMMAND_NAME;
      // NOTE(review): getBytes() uses the platform default charset.
      zk.create(commandNode, ssc.toString().getBytes(), Ids.OPEN_ACL_UNSAFE,
          CreateMode.PERSISTENT);

      jip.reportLOG(jobId.toString() + " command of next is "
          + ssc.toString());
      jip.reportLOG(jobId.toString() + " [Write Command Path] " + commandNode);
      jip.reportLOG(jobId.toString() + " leave the barrier of "
          + superStepCounter);
    }

    /**
     * Publishes the given command under the current SuperStep's command
     * directory without preparing a following SuperStep.
     *
     * @param command
     *          textual form of the command to publish
     * @throws Exception
     *           if the ZooKeeper node cannot be created
     */
    public void stopNextSuperStep(String command) throws Exception {
      zk.create(commandPath(superStepCounter) + "/" + Constants.COMMAND_NAME,
          command.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      jip.reportLOG(jobId.toString() + " command of next is " + command);
      jip.reportLOG(jobId.toString() + " prepare to quit");
    }

    /**
     * Removes every child of the barrier directory of the given SuperStep.
     * Failures are reported through the job log and otherwise ignored.
     *
     * @param ableCheckPoint
     *          number of the SuperStep whose barrier directory is cleaned
     */
    public void cleanReadHistory(int ableCheckPoint) {
      String checkPointPath = superStepPath(ableCheckPoint);
      try {
        for (String child : zk.getChildren(checkPointPath, false)) {
          String childPath = checkPointPath + "/" + child;
          if (deleteNode(childPath)) {
            jip.reportLOG("The node hash exists" + childPath);
          }
        }
      } catch (Exception exc) {
        jip.reportLOG(jobId.toString() + " [cleanReadHistory]"
            + exc.getMessage());
      }
    }

    /**
     * This is a thread and execute the logic control: creates the directories
     * of the initial SuperStep, then repeatedly waits at the barrier, reads
     * the command derived from the staff reports and acts on it until a STOP
     * command has been processed.
     */
    @Override
    public void run() {
      boolean jobRunning = true;

      // create the directory for the 0th SuperStep
      try {
        createIfAbsent(superStepPath(superStepCounter));
        createIfAbsent(commandPath(superStepCounter));
      } catch (Exception e) {
        jip.reportLOG(jobId.toString() + " [run]" + e.getMessage());
      }

      while (jobRunning) {
        try {
          setStageFlag(Constants.SUPERSTEP_STAGE.FIRST_STAGE);
          generalSuperStepBarrier(checkNumBase * 2);

          setStageFlag(Constants.SUPERSTEP_STAGE.SECOND_STAGE);
          SuperStepCommand ssc = getSuperStepCommand(checkNumBase);
          if (ssc == null) {
            // getSuperStepCommand has already reported the cause; retry
            // instead of failing on a null dereference below.
            jip.reportLOG(jobId.toString()
                + "error: no SuperStepCommand available");
            continue;
          }

          switch (ssc.getCommandType()) {
          case Constants.COMMAND_TYPE.START:
            startNextSuperStep(ssc);
            superStepCounter = ssc.getNextSuperStepNum();
            jip.setSuperStepCounter(superStepCounter);
            break;
          case Constants.COMMAND_TYPE.START_AND_CHECKPOINT:
            startNextSuperStep(ssc);
            generalSuperStepBarrier(checkNumBase * 3);
            jip.setAbleCheckPoint(superStepCounter);
            LOG.info("ableCheckPoint: " + superStepCounter);
            superStepCounter = ssc.getNextSuperStepNum();
            jip.setSuperStepCounter(superStepCounter);
            break;
          case Constants.COMMAND_TYPE.START_AND_RECOVERY:
            cleanReadHistory(ssc.getAbleCheckPoint());
            startNextSuperStep(ssc);
            setCheckNumBase();
            // Wait at the barrier of the checkpointed SuperStep before
            // moving on to the next one.
            superStepCounter = ssc.getAbleCheckPoint();
            generalSuperStepBarrier(checkNumBase * 1);
            superStepCounter = ssc.getNextSuperStepNum();
            jip.setSuperStepCounter(superStepCounter);
            break;
          case Constants.COMMAND_TYPE.STOP:
            stopNextSuperStep(ssc.toString());
            jobRunning = quitBarrier();
            break;
          default:
            jip.reportLOG(jobId.toString() + " Unkonwn command of "
                + ssc.getCommandType());
          }
        } catch (Exception e) {
          jip.reportLOG(jobId.toString() + "error: " + e.toString());
        }
      } // while (jobRunning)
    } // run
  }

  /**
   * Generate the GeneralSSController to control the synchronization between
   * SuperSteps
   *
   * @param jobId
   *          id of the job this controller synchronizes
   */
  @SuppressWarnings("unused")
  public GeneralSSController(BSPJobID jobId) {
    this.jobId = jobId;
    this.conf = new BSPConfiguration();
    this.zookeeperAddr = conf.get(Constants.ZOOKEEPER_QUORUM)
        + ":"
        + conf.getInt(Constants.ZOOKEPER_CLIENT_PORT,
            Constants.DEFAULT_ZOOKEPER_CLIENT_PORT);
    this.bspZKRoot = Constants.BSPJOB_ZOOKEEPER_DIR_ROOT;

    // adjust the location. This function must be located there.
    // If it is located in run(), may be crashed on ZooKeeper cluster.
    setup();
  }

  /**
   * Builds the path of one of the job's ZooKeeper directories.
   *
   * @param suffix
   *          directory suffix, e.g. {@code "-ss"}
   * @return root path + "/" + job id string without its first 17 characters
   *         + suffix
   */
  private String jobPath(String suffix) {
    // NOTE(review): substring(17) presumably strips a fixed job-id prefix;
    // confirm against BSPJobID.toString().
    return bspZKRoot + "/" + jobId.toString().substring(17) + suffix;
  }

  /** Returns the barrier ({@code -ss}) directory of the given SuperStep. */
  private String superStepPath(int superStep) {
    return jobPath("-ss") + "/" + superStep;
  }

  /** Returns the command ({@code -sc}) directory of the given SuperStep. */
  private String commandPath(int superStep) {
    return jobPath("-sc") + "/" + superStep;
  }

  /** Creates an empty persistent node at {@code path} unless it exists. */
  private void createIfAbsent(String path) throws KeeperException,
      InterruptedException {
    if (zk.exists(path, false) == null) {
      zk.create(path, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    }
  }

  /**
   * Deletes the node at {@code path} if it exists, using its current data
   * version as the expected version.
   *
   * @return {@code true} if a node was deleted, {@code false} if none existed
   */
  private boolean deleteNode(String path) throws KeeperException,
      InterruptedException {
    Stat stat = zk.exists(path, false);
    if (stat == null) {
      return false;
    }
    // delete() checks the data version, so getVersion() (not the ACL
    // version returned by getAversion()) is the value to pass.
    zk.delete(path, stat.getVersion());
    return true;
  }

  /** Deletes every direct child of {@code parent}. */
  private void deleteChildren(String parent) throws KeeperException,
      InterruptedException {
    for (String child : zk.getChildren(parent, false)) {
      deleteNode(parent + "/" + child);
    }
  }

  /**
   * Checks whether the command directory of the faulty SuperStep already
   * holds at least {@code checkNumBase + 1} children.
   *
   * @return {@code true} if enough children are present; {@code false}
   *         otherwise or if ZooKeeper cannot be queried
   */
  @Override
  public boolean isCommandBarrier() {
    String path = commandPath(faultSuperStepCounter);
    try {
      List<String> children = zk.getChildren(path, false);
      jip.reportLOG("[isCommandBarrier] path: " + path);
      if (children.size() < checkNumBase + 1) {
        jip.reportLOG("[isCommandBarrier] " + children.size()
            + " instead of " + (checkNumBase + 1));
        jip.reportLOG("[isCommandBarrier] " + children.toString());
        return false;
      }
      jip.reportLOG("[isCommandBarrier] " + children.size());
      return true;
    } catch (Exception e) {
      jip.reportLOG("[isCommandBarrier] " + e.getMessage());
      return false;
    }
  }

  /**
   * Attaches the job-side control interface and adopts its SuperStep counter.
   *
   * @param jip
   *          the control interface of the job in progress
   */
  @Override
  public void setJobInProgressControlInterface(
      JobInProgressControlInterface jip) {
    this.jip = jip;
    this.superStepCounter = jip.getSuperStepCounter();
  }

  /** Refreshes {@code checkNumBase} from the job in progress. */
  @Override
  public void setCheckNumBase() {
    this.checkNumBase = jip.getCheckNum();
  }

  /** @return the stage the control loop is currently in */
  public int getStageFlag() {
    return stageFlag;
  }

  /**
   * @param stageFlag
   *          the stage the control loop is entering
   */
  public void setStageFlag(int stageFlag) {
    this.stageFlag = stageFlag;
  }

  /**
   * Connect to ZooKeeper cluster and create the root directory for the job
   *
   * <p>
   * This method is invoked from the constructor, i.e. before a
   * {@link JobInProgressControlInterface} has been attached; failures at that
   * point are written to the class logger instead of the job log.
   */
  @Override
  public void setup() {
    try {
      this.zk = new ZooKeeper(this.zookeeperAddr, 3000, this);

      // create the directory for scheduler
      createIfAbsent(jobPath("-s"));
      // create the directory for load data
      createIfAbsent(jobPath("-d"));
      // create the directory for SuperStep
      createIfAbsent(jobPath("-ss"));
      // create the directory for SuperStep Command
      createIfAbsent(jobPath("-sc"));
    } catch (Exception e) {
      String message = jobId.toString() + " [setup]" + e.getMessage();
      if (jip != null) {
        jip.reportLOG(message);
      } else {
        LOG.error(message, e);
      }
    }
  }

  /**
   * Connect to ZooKeeper cluster and delete the directory for the job
   *
   * <p>
   * The four job directories are removed in the order {@code -s}, {@code -d},
   * {@code -ss}, {@code -sc}. The first ZooKeeper failure that cannot be
   * tolerated aborts the remaining steps and is reported to the job log.
   */
  @Override
  public void cleanup() {
    try {
      // cleanup the directory of scheduler
      cleanupFlatDirectory("-s");
      jip.reportLOG(jobId.toString() + "delete the -s");

      // cleanup the directory of load data
      cleanupFlatDirectory("-d");
      jip.reportLOG(jobId.toString() + "delete the -d");

      // cleanup the directory of SuperStep control
      cleanupNestedDirectory("-ss");
      jip.reportLOG(jobId.toString() + "delete the -ss");

      // cleanup the directory of SuperStep command
      cleanupNestedDirectory("-sc");
      jip.reportLOG(jobId.toString() + "delete the -sc");
    } catch (KeeperException e) {
      jip.reportLOG(jobId.toString() + "delet error: " + e.toString());
    } catch (InterruptedException e) {
      jip.reportLOG(jobId.toString() + "delet error: " + e.toString());
    }
  }

  /**
   * Removes a job directory whose children are leaf nodes. Failing to remove
   * the children is tolerated (and logged); the directory itself is removed
   * afterwards in any case.
   */
  private void cleanupFlatDirectory(String suffix) throws KeeperException,
      InterruptedException {
    String dir = jobPath(suffix);
    try {
      deleteChildren(dir);
    } catch (Exception e) {
      // Best effort: still try to remove the directory itself below.
      LOG.warn(jobId.toString() + " could not remove children of " + dir, e);
    }
    deleteNode(dir);
  }

  /**
   * Removes a job directory that holds one sub-directory per SuperStep, each
   * with leaf children. Failing to remove the leaves of a sub-directory is
   * tolerated (and logged); the sub-directory is removed afterwards in any
   * case, and finally the directory itself.
   */
  private void cleanupNestedDirectory(String suffix) throws KeeperException,
      InterruptedException {
    String dir = jobPath(suffix);
    for (String step : zk.getChildren(dir, false)) {
      String stepPath = dir + "/" + step;
      try {
        deleteChildren(stepPath);
      } catch (Exception exc) {
        // Best effort: still try to remove the sub-directory below.
        LOG.warn(jobId.toString() + " could not remove children of "
            + stepPath, exc);
      }
      deleteNode(stepPath);
    }
    deleteNode(dir);
  }

  /** Starts the control thread. */
  @Override
  public void start() {
    this.zkRun.start();
  }

  /**
   * Stops the control thread.
   *
   * NOTE(review): relies on the deprecated {@link Thread#stop()}; a
   * cooperative shutdown would need the control loop to observe a flag.
   */
  @Override
  @SuppressWarnings("deprecation")
  public void stop() {
    this.zkRun.stop();
  }

  /**
   * Blocks until the barrier directory of the current SuperStep holds at
   * least {@code checkNum} children.
   *
   * @param checkNum
   *          number of children that must be present
   * @return {@code true} once the barrier is reached; {@code false} if a
   *         ZooKeeper error or an interrupt ended the wait
   */
  @Override
  public boolean generalSuperStepBarrier(int checkNum) {
    try {
      // make sure that all staffs complete the computation and
      // receiving-messages
      jip.reportLOG(jobId.toString() + " enter the barrier of "
          + superStepCounter);
      synchronized (mutex) {
        // getChildren registers a watch; process() notifies on its event,
        // after which the child count is read again.
        while (zk.getChildren(superStepPath(superStepCounter), true).size()
            < checkNum) {
          mutex.wait();
        }
      }
      return true;
    } catch (KeeperException e) {
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
      return false;
    } catch (InterruptedException e) {
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
      return false;
    }
  }

  /**
   * Blocks until the command directory of the current SuperStep holds at
   * least {@code checkNum} children, then reads every child as a report and
   * asks the job in progress to derive the next command from them.
   *
   * @param checkNum
   *          number of reports that must be present
   * @return the generated command, or {@code null} if a ZooKeeper error or an
   *         interrupt occurred
   */
  @Override
  public SuperStepCommand getSuperStepCommand(int checkNum) {
    try {
      String reportDir = commandPath(superStepCounter);
      List<String> reports;

      // make sure that all staffs have reported the info
      synchronized (mutex) {
        while (true) {
          reports = zk.getChildren(reportDir, true);
          if (reports.size() >= checkNum) {
            jip.reportLOG("[getSuperStepCommand]: " + reports.size());
            break;
          }
          jip.reportLOG("[getSuperStepCommand]: " + reports.size()
              + " instead of " + checkNum);
          mutex.wait();
        }
      }

      // give the command to all staffs according to the report info
      // NOTE(review): the array is sized by checkNumBase, so more than
      // checkNumBase children would overflow it - confirm that cannot occur.
      SuperStepReportContainer[] ssrcs =
          new SuperStepReportContainer[checkNumBase];
      int counter = 0;
      for (String report : reports) {
        byte[] data = zk.getData(reportDir + "/" + report, false, null);
        // NOTE(review): decoded with the platform default charset.
        ssrcs[counter++] = new SuperStepReportContainer(new String(data));
      }
      return jip.generateCommand(ssrcs);
    } catch (KeeperException e) {
      LOG.error(jobId.toString() + " [getSuperStepCommand]", e);
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
      return null;
    } catch (InterruptedException e) {
      LOG.error(jobId.toString() + " [getSuperStepCommand]", e);
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
      return null;
    }
  }

  /**
   * Blocks until the barrier directory of the current SuperStep has no
   * children left, then marks the job as completed.
   *
   * <p>
   * The job is marked completed even if waiting fails.
   *
   * @return always {@code false}, which the control loop uses as its
   *         "keep running" flag
   */
  @Override
  public boolean quitBarrier() {
    try {
      synchronized (mutex) {
        while (zk.getChildren(superStepPath(superStepCounter), true).size()
            > 0) {
          mutex.wait();
        }
      }
    } catch (KeeperException e) {
      LOG.error(jobId.toString() + " [quitBarrier]", e);
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
    } catch (InterruptedException e) {
      LOG.error(jobId.toString() + " [quitBarrier]", e);
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
    } catch (RuntimeException e) {
      // Reported rather than rethrown so that the control loop still
      // receives 'false' and terminates.
      LOG.error(jobId.toString() + " [quitBarrier]", e);
      jip.reportLOG(jobId.toString() + "error: " + e.toString());
    } finally {
      jip.completedJob();
    }
    return false;
  }

  /**
   * ZooKeeper watcher callback: wakes every thread waiting in a barrier so it
   * re-reads the watched directory.
   *
   * @param event
   *          the event delivered by ZooKeeper (its content is not inspected)
   */
  @Override
  public void process(WatchedEvent event) {
    synchronized (mutex) {
      mutex.notifyAll();
    }
  }

  /**
   * Creates placeholder nodes for the given workers in the directories of the
   * current SuperStep, depending on the stage the control loop is in, and
   * records the current SuperStep as the faulty one.
   *
   * <p>
   * In the first stage, {@code 2 * WMNames.size()} nodes are created in the
   * barrier directory followed by one "RECOVERY" report per worker in the
   * command directory; in the second stage only the reports are created.
   *
   * @param WMNames
   *          names of the workers to create recovery nodes for
   */
  @Override
  public void recoveryBarrier(List<String> WMNames) {
    LOG.info("recoveryBarrier: this.superStepCounter " + superStepCounter);
    faultSuperStepCounter = superStepCounter;
    int base = WMNames.size();

    switch (this.stageFlag) {
    case Constants.SUPERSTEP_STAGE.FIRST_STAGE:
      try {
        jip.reportLOG("recoveried: " + this.jobId.toString()
            + " enter the firstStageSuperStepBarrier of "
            + Integer.toString(superStepCounter));
        for (int i = 0; i < base * 2; i++) {
          // Every barrier node is named after the first worker,
          // e.g. slave2-recovery0
          zk.create(superStepPath(superStepCounter) + "/" + WMNames.get(0)
              + "-recovery" + i, new byte[0], Ids.OPEN_ACL_UNSAFE,
              CreateMode.PERSISTENT);
          LOG.info("first--recoveryBarrier: " + "recovery" + i);
        }

        jip.reportLOG("recoveried: " + this.jobId.toString()
            + " enter the secondStageSuperStepBarrier(first) of "
            + Integer.toString(superStepCounter));
        createRecoveryReports(WMNames, "second-(first)--recoveryBarrier: ");
      } catch (KeeperException e) {
        LOG.error(jobId.toString() + " [recoveryBarrier]", e);
      } catch (InterruptedException e) {
        LOG.error(jobId.toString() + " [recoveryBarrier]", e);
      }
      break;
    case Constants.SUPERSTEP_STAGE.SECOND_STAGE:
      try {
        jip.reportLOG("recoveried " + this.jobId.toString()
            + " enter the secondStageSuperStepBarrier(second) of"
            + " superStepCounter: " + Integer.toString(superStepCounter));
        createRecoveryReports(WMNames, "second--recoveryBarrier: ");
      } catch (KeeperException e) {
        LOG.error(jobId.toString() + " [recoveryBarrier]", e);
      } catch (InterruptedException e) {
        LOG.error(jobId.toString() + " [recoveryBarrier]", e);
      }
      break;
    default:
      jip.reportLOG(jobId.toString() + " Unkonwn command of ");
    } // switch
  }

  /**
   * Creates one "RECOVERY" report node per worker in the command directory of
   * the current SuperStep.
   *
   * @param workerNames
   *          names of the workers to create report nodes for
   * @param logPrefix
   *          prefix of the info message logged for each created node
   */
  private void createRecoveryReports(List<String> workerNames,
      String logPrefix) throws KeeperException, InterruptedException {
    for (int i = 0; i < workerNames.size(); i++) {
      zk.create(commandPath(superStepCounter) + "/" + workerNames.get(i)
          + "-recovery" + i, "RECOVERY".getBytes(), Ids.OPEN_ACL_UNSAFE,
          CreateMode.PERSISTENT);
      LOG.info(logPrefix + "recovery" + i);
    }
  }
}