/**
 *
 */
package system;

import exceptions.DataNotFoundException;
import exceptions.IllegalClassException;
import exceptions.IllegalInputException;
import exceptions.NoResourcesException;
import graphs.GraphPartitioner;

import java.io.IOException;
import java.rmi.RemoteException;
import java.rmi.registry.LocateRegistry;
import java.rmi.registry.Registry;
import java.rmi.server.UnicastRemoteObject;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.logging.Logger;

import utility.JPregelLogger;
import utility.Pair;
import api.Vertex;

/**
 * Master node of the system. It collects worker manager registrations, and
 * once the expected number of machines has registered it partitions the
 * graph, drives the worker managers through supersteps on a dedicated
 * thread, and runs a {@link FaultDetector} thread that rolls every surviving
 * worker manager back to the last checkpoint when one of them stops
 * responding.
 *
 * <p>
 * Locking: the superstep counters ({@code allDone},
 * {@code participatingMgrs}, {@code returnedManagers}) are guarded by the
 * monitor of this object; {@code idManagerMap} is used as a second monitor
 * that the superstep thread holds for the duration of a superstep. The only
 * nesting order used is {@code idManagerMap} first, then {@code this}.
 *
 * @author Manasa Chandrasekhar
 * @author Kowshik Prakasam
 *
 */
public class MasterImpl extends UnicastRemoteObject implements ManagerToMaster,
		Runnable {

	private static final long serialVersionUID = -7962409918852099855L;

	private static final String LOG_FILE_PREFIX = JPregelConstants.LOG_DIR
			+ "master";
	private static final String LOG_FILE_SUFFIX = ".log";
	private static final int PORT_NUMBER = 3672;

	/**
	 * Delay, in milliseconds, between two polls of a condition another thread
	 * is expected to change. Same value as the pause the superstep loop has
	 * always taken before starting a superstep.
	 */
	private static final long POLL_INTERVAL_MS = 10L;

	/** Superstep that is currently executing (or about to be executed). */
	private int superStep;

	private GraphPartitioner gp;
	private Thread superstepExecutorThread;
	private Thread faultDetectorThread;

	/** Registered worker managers, keyed by the id they registered with. */
	public Map<String, WorkerManager> idManagerMap;

	private Logger logger;
	private String id;
	private String vertexClassName;

	/** Ids of the managers that already reported the end of this superstep. */
	private List<String> returnedManagers;

	/** True when no manager is expected to report for the current superstep. */
	private boolean allDone;

	/** Number of managers that still have to report for this superstep. */
	private int participatingMgrs;

	private int numMachines;
	private FaultDetector aFaultDetector;
	private boolean isActive;
	private boolean isWkrMgrsInitialized;
	private int lastCheckPoint;

	/**
	 * Repeatedly asks every registered worker manager whether it is alive.
	 * When a liveness call fails with a {@link RemoteException}, the detector
	 * deactivates the master, drops the failed manager, stops the running
	 * superstep on the remaining managers, restores them from the last
	 * checkpoint and reactivates the master at that checkpoint.
	 */
	public class FaultDetector implements Runnable {

		private Logger logger;
		private static final String LOG_FILE_PREFIX = JPregelConstants.LOG_DIR
				+ "faultdetector";
		private static final String LOG_FILE_SUFFIX = ".log";

		private void initLogger() throws IOException {
			this.logger = JPregelLogger.getLogger(this.getID(), LOG_FILE_PREFIX
					+ LOG_FILE_SUFFIX);
		}

		public FaultDetector() throws IOException {
			initLogger();
		}

		@Override
		public void run() {
			while (true) {
				String wkrMgrID = null;
				try {
					for (Map.Entry<String, WorkerManager> e : idManagerMap
							.entrySet()) {
						// Remember whom we are probing so that the handler
						// below knows which manager failed.
						wkrMgrID = e.getKey();
						e.getValue().isAlive();
					}
					// Everybody answered; wait a little before the next
					// sweep instead of issuing remote calls back to back.
					pause();
				} catch (RemoteException e) {
					handleFailure(wkrMgrID, e);
				}
			}
		}

		/**
		 * Recovers from the loss of one worker manager.
		 *
		 * @param wkrMgrID
		 *            id of the manager whose liveness call failed
		 * @param cause
		 *            the failure reported by the liveness call
		 */
		private void handleFailure(String wkrMgrID, RemoteException cause) {
			this.logger.severe("Worker manager : " + wkrMgrID + " died");
			System.err.println("Worker manager : " + wkrMgrID + " died");
			cause.printStackTrace();
			System.err.println("Deactivating master");

			// deactivate master; this also makes the superstep thread leave
			// its wait loop and release the idManagerMap monitor.
			deactivate();
			synchronized (idManagerMap) {
				idManagerMap.remove(wkrMgrID);
			}

			// check if this fellow has returned already. decrement counter
			// otherwise. Done under the master's monitor so that it cannot
			// interleave with a concurrent endSuperStep() call.
			synchronized (MasterImpl.this) {
				if (!returnedManagers.contains(wkrMgrID)) {
					int remaining = getParticipatingMgrs() - 1;
					setParticipatingMgrs(remaining);
					this.logger
							.info(wkrMgrID
									+ " didn't report completion earlier .. so decrementing participatingMgrs to "
									+ remaining);
					System.err
							.println(wkrMgrID
									+ " didn't report completion earlier .. so decrementing participatingMgrs to "
									+ remaining);
					if (remaining == 0) {
						setAllDone(true);
					}
				}
			}

			// for all worker managers, other than the failed manager,
			// stop the superstep immediately.
			stopSuperStep();

			// for all worker managers, other than the failed manager,
			// restore state. A failed restore is reported but does not stop
			// the detector; the master is reactivated regardless.
			try {
				restoreState();
			} catch (IOException e1) {
				e1.printStackTrace();
			} catch (IllegalInputException e1) {
				e1.printStackTrace();
			} catch (DataNotFoundException e1) {
				e1.printStackTrace();
			} catch (InstantiationException e1) {
				e1.printStackTrace();
			} catch (IllegalAccessException e1) {
				e1.printStackTrace();
			} catch (ClassNotFoundException e1) {
				e1.printStackTrace();
			} catch (NoResourcesException e1) {
				e1.printStackTrace();
			}

			// activating master again, resuming from the last checkpoint
			setSuperStep(getLastCheckPoint());
			setWkrMgrsInitialized(true);
			activate();
		}

		public String getID() {
			return "FaultDetector";
		}
	}

	/**
	 * Creates a master that waits for {@code numMachines} worker managers.
	 * The superstep executor and fault detector threads are created here but
	 * only started by {@link #executeTask()}.
	 *
	 * @param vertexClassName
	 *            name of the client vertex class
	 * @param numMachines
	 *            number of worker managers that must register before the
	 *            computation starts
	 * @throws IOException
	 *             if a logger cannot be set up or the remote object cannot
	 *             be exported
	 */
	public MasterImpl(String vertexClassName, int numMachines)
			throws IOException {
		this.setId("Master");
		initLogger();
		this.lastCheckPoint = JPregelConstants.FIRST_SUPERSTEP;
		this.setNumMachines(numMachines);
		this.setSuperStep(JPregelConstants.FIRST_SUPERSTEP);
		this.setVertexClassName(vertexClassName);
		this.returnedManagers = new Vector<String>();
		this.idManagerMap = new HashMap<String, WorkerManager>();
		this.superstepExecutorThread = new Thread(this, getId());
		this.aFaultDetector = new FaultDetector();
		this.faultDetectorThread = new Thread(this.aFaultDetector,
				this.aFaultDetector.getID());
	}

	/**
	 * Validates the vertex class, then creates an RMI registry on
	 * {@code PORT_NUMBER} and binds a new master to it.
	 *
	 * @param args
	 *            {@code args[0]} is the vertex class name, {@code args[1]}
	 *            the number of machines
	 * @throws IllegalClassException
	 *             if the direct superclass of the vertex class is not
	 *             {@link Vertex}
	 */
	public static void main(String args[]) throws IllegalClassException {
		if (args == null || args.length < 2) {
			System.err
					.println("Usage : MasterImpl <vertex class name> <number of machines>");
			return;
		}
		String vertexClassName = args[0];
		int numMachines;
		try {
			numMachines = Integer.parseInt(args[1]);
		} catch (NumberFormatException e) {
			System.err.println("Number of machines is not an integer : "
					+ args[1]);
			return;
		}

		try {
			Class<?> c = Class.forName(vertexClassName);
			// getSuperclass() is null for interfaces and java.lang.Object,
			// so compare from the non-null side.
			if (!Vertex.class.equals(c.getSuperclass())) {
				throw new IllegalClassException(vertexClassName);
			}
		} catch (ClassNotFoundException e) {
			System.err.println("Client vertex class not found !");
			e.printStackTrace();
			return;
		}

		if (System.getSecurityManager() == null) {
			System.setSecurityManager(new SecurityManager());
		}
		try {
			MasterImpl master = new MasterImpl(vertexClassName, numMachines);
			Registry registry = LocateRegistry.createRegistry(PORT_NUMBER);
			registry.rebind(ManagerToMaster.SERVICE_NAME, master);
			System.err.println("Master instance bound");
		} catch (Exception e) {
			System.err.println("Can't bind Master instance");
			e.printStackTrace();
		}
	}

	public int getSuperStep() {
		return superStep;
	}

	public void setSuperStep(int superStep) {
		this.superStep = superStep;
	}

	public synchronized int getParticipatingMgrs() {
		return participatingMgrs;
	}

	public synchronized void setParticipatingMgrs(int participatingMgrs) {
		this.participatingMgrs = participatingMgrs;
	}

	public String getVertexClassName() {
		return vertexClassName;
	}

	public void setVertexClassName(String vertexClassName) {
		this.vertexClassName = vertexClassName;
		logger.info("set vertexClassName : " + vertexClassName);
	}

	/**
	 * @return the superstep at which the last checkpoint was recorded
	 */
	private int getLastCheckPoint() {
		return this.lastCheckPoint;
	}

	/**
	 * @param checkPoint
	 *            superstep to remember as the last checkpoint
	 */
	private void setLastCheckPoint(int checkPoint) {
		this.lastCheckPoint = checkPoint;
	}

	/**
	 * @param numMachines
	 *            number of worker managers expected to register
	 */
	private void setNumMachines(int numMachines) {
		this.numMachines = numMachines;
	}

	public void setId(String id) {
		this.id = id;
	}

	public String getId() {
		return id;
	}

	private void initLogger() throws IOException {
		this.logger = JPregelLogger.getLogger(getId(), LOG_FILE_PREFIX
				+ LOG_FILE_SUFFIX);
	}

	/**
	 * Sleeps for {@code POLL_INTERVAL_MS}. If the thread is interrupted the
	 * interrupt status is restored and the method returns at once.
	 */
	private static void pause() {
		try {
			Thread.sleep(POLL_INTERVAL_MS);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
		}
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see system.ManagerToMaster#register(system.WorkerManager,
	 * java.lang.String)
	 */
	@Override
	public synchronized void register(WorkerManager aWorkerManager, String id)
			throws RemoteException {
		this.idManagerMap.put(id, aWorkerManager);
		logger.info("registered worker manager : " + id);
		logger.info("size of map : " + idManagerMap.size());
		// The computation starts as soon as the last expected machine
		// has registered.
		if (idManagerMap.size() == this.numMachines) {
			try {
				executeTask();
			} catch (IOException e) {
				logger.severe(e.toString());
				// keep the original failure attached as the cause
				throw new RemoteException(e.toString(), e);
			}
		}
	}

	public int getWorkerMgrsCount() {
		logger.info("returning : " + idManagerMap.size());
		return this.idManagerMap.size();
	}

	public int getWorkerMgrThreads() {
		return JPregelConstants.WORKER_MGR_THREADS;
	}

	public synchronized boolean isActive() {
		return this.isActive;
	}

	public synchronized void deactivate() {
		this.isActive = false;
	}

	public synchronized void activate() {
		this.isActive = true;
	}

	/**
	 * Creates the graph partitioner, activates the master and starts the
	 * superstep executor and fault detector threads.
	 *
	 * @throws RemoteException
	 *             wrapping the {@link IOException} raised while creating the
	 *             graph partitioner
	 */
	public void executeTask() throws RemoteException {
		try {
			this.gp = new GraphPartitioner(JPregelConstants.GRAPH_FILE, this,
					this.getVertexClassName());
		} catch (IOException e) {
			logger.severe(e.toString());
			throw new RemoteException(e.getMessage(), e);
		}
		// Nothing is outstanding yet, so the executor may start right away.
		this.setAllDone(true);
		this.activate();
		logger.info("Starting superstep executor thread");
		superstepExecutorThread.start();
		logger.info("Starting fault detector thread");
		faultDetectorThread.start();
	}

	/**
	 * Superstep executor loop. While the master is active it waits for the
	 * previous superstep to finish, initializes the worker managers on first
	 * use, and either starts the next superstep on every manager or, when
	 * {@link #findActiveManagers(int)} reports none, asks the managers to
	 * write their solutions and terminates. A {@link RemoteException}
	 * deactivates the master; any other checked exception ends the loop.
	 */
	@Override
	public void run() {
		while (true) {
			if (!this.isActive()) {
				// Deactivated (a failure is being handled); poll until the
				// fault detector reactivates the master.
				pause();
				continue;
			}
			try {
				while (!allDone()) {
					pause();
				}
				synchronized (idManagerMap) {
					if (!isWkrMgrsInitialized()) {
						initializeWorkerManagers();
						setWkrMgrsInitialized(true);
					}
					if (findActiveManagers(this.getSuperStep()) > 0) {
						pause();
						// Reset the per-superstep bookkeeping in one step so
						// a stray endSuperStep() cannot see it half updated.
						synchronized (this) {
							this.setAllDone(false);
							this.setParticipatingMgrs(this.idManagerMap.size());
							this.returnedManagers.clear();
						}
						for (Map.Entry<String, WorkerManager> e : this.idManagerMap
								.entrySet()) {
							String aWkrMgrId = e.getKey();
							WorkerManager aWkrMgr = e.getValue();
							logger.info("Commencing superstep : "
									+ this.getSuperStep()
									+ " in worker manager : " + aWkrMgrId);
							aWkrMgr.beginSuperStep(this.getSuperStep(),
									this.isCheckPoint());
						}
						logger.info("Waiting for worker managers to complete execution");
						while (isActive() && !allDone()) {
							pause();
						}
						// Only advance when the superstep really completed;
						// after a failure the fault detector rewinds the
						// counter to the last checkpoint instead.
						if (isActive()) {
							logger.info("Superstep over : "
									+ this.getSuperStep());
							this.setSuperStep(this.getSuperStep() + 1);
						}
					} else {
						logger.info("-----------------------------------------------------");
						logger.info("Writing Solutions");
						// Writing solutions
						writeSolutions();
						System.err
								.println("Computations completed. Solutions written to solutions/. Logs in logs/ !");
						break;
					}
				}
			} catch (RemoteException e) {
				logger.severe(e.toString());
				e.printStackTrace();
				this.deactivate();
			} catch (IOException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			} catch (IllegalInputException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			} catch (DataNotFoundException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			} catch (InstantiationException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			} catch (IllegalAccessException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			} catch (ClassNotFoundException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			} catch (NoResourcesException e) {
				logger.severe(e.getMessage());
				e.printStackTrace();
				break;
			}
		}
	}

	/**
	 * @param newState
	 *            whether the worker managers have been initialized
	 */
	private synchronized void setWkrMgrsInitialized(boolean newState) {
		this.isWkrMgrsInitialized = newState;
	}

	/**
	 * @return whether the worker managers have been initialized
	 */
	private synchronized boolean isWkrMgrsInitialized() {
		return this.isWkrMgrsInitialized;
	}

	/**
	 * @return true if the current superstep number is a multiple of
	 *         {@code JPregelConstants.CHECKPOINT_INTERVAL}
	 */
	private boolean isCheckPoint() {
		return this.getSuperStep() % JPregelConstants.CHECKPOINT_INTERVAL == 0;
	}

	/**
	 * Asks every registered worker manager to write its solutions.
	 *
	 * @throws RemoteException
	 *             if a remote call fails
	 */
	private void writeSolutions() throws RemoteException {
		for (WorkerManager aWkrMgr : this.idManagerMap.values()) {
			aWkrMgr.writeSolutions();
		}
	}

	/**
	 * Counts the managers that still have work for the given superstep. In
	 * the first superstep every registered manager counts; afterwards a
	 * manager counts when its message queue is not empty.
	 *
	 * @param superStep
	 *            superstep about to be executed
	 * @return number of managers that should take part in the superstep
	 * @throws RemoteException
	 *             if a remote call fails
	 */
	private int findActiveManagers(int superStep) throws RemoteException {
		if (superStep == JPregelConstants.FIRST_SUPERSTEP) {
			return this.idManagerMap.size();
		}
		int activeManagers = 0;
		for (Map.Entry<String, WorkerManager> e : this.idManagerMap.entrySet()) {
			MessageSpooler aSpooler = (MessageSpooler) e.getValue();
			if (!aSpooler.isQueueEmpty()) {
				activeManagers++;
			}
		}
		return activeManagers;
	}

	/**
	 * @return true when no manager is expected to report for the current
	 *         superstep
	 */
	private synchronized boolean allDone() {
		return allDone;
	}

	/**
	 * @param allDone
	 *            new value of the completion flag
	 */
	private synchronized void setAllDone(boolean allDone) {
		this.allDone = allDone;
	}

	/**
	 * Reassigns the partitions over the currently registered managers, has
	 * each of them restore the last checkpoint, and writes the new partition
	 * to worker manager map.
	 *
	 * @throws NoResourcesException
	 *             if no worker manager is registered
	 */
	private void restoreState() throws IOException, IllegalInputException,
			DataNotFoundException, InstantiationException,
			IllegalAccessException, ClassNotFoundException,
			NoResourcesException {
		if (idManagerMap.size() == 0) {
			throw new NoResourcesException(
					"No worker managers available to the Master for queueing jobs");
		}
		List<List<Integer>> assignedPartitions = this.assignPartitions();
		Map<Integer, Pair<String, String>> partitionWkrMgrMap = this
				.getPartitionMap(assignedPartitions);
		this.restoreState(assignedPartitions);
		// Write partition - worker manager map to file
		DataLocator dl = DataLocator.getDataLocator(gp.getPartitionSize());
		dl.writePartitionMap(partitionWkrMgrMap);
		logger.info("Restored state");
	}

	/**
	 * Asks each manager, in map iteration order, to restore the last
	 * checkpoint for the partition list at the matching index. Remote
	 * failures are reported and skipped.
	 *
	 * @param assignedPartitions
	 *            one partition list per registered manager
	 */
	private void restoreState(List<List<Integer>> assignedPartitions) {
		int index = 0;
		for (Map.Entry<String, WorkerManager> anEntry : idManagerMap.entrySet()) {
			String wkrMgrToBeRestored = anEntry.getKey();
			try {
				System.err.println(wkrMgrToBeRestored + " restoring state");
				this.logger.info(wkrMgrToBeRestored + " restoring state");
				anEntry.getValue().restoreState(getLastCheckPoint(),
						assignedPartitions.get(index));
				System.err.println(wkrMgrToBeRestored
						+ " state restoration - SUCCESSFUL !");
			} catch (RemoteException e) {
				// can't really catch this now, ignore
				e.printStackTrace();
			}
			index++;
		}
	}

	/**
	 * Asks every registered manager to stop the running superstep. Remote
	 * failures are reported and skipped.
	 */
	private void stopSuperStep() {
		for (Map.Entry<String, WorkerManager> anEntry : idManagerMap.entrySet()) {
			String wkrMgrToBeStopped = anEntry.getKey();
			try {
				System.err.println(wkrMgrToBeStopped + " stopping superstep");
				this.logger.info(wkrMgrToBeStopped + " stopping superstep");
				anEntry.getValue().stopSuperStep();
			} catch (RemoteException e1) {
				// can't really catch this now, ignore
				e1.printStackTrace();
			}
		}
	}

	/**
	 * Partitions the graph, distributes the partitions over the registered
	 * managers, initializes each manager, writes the partition to worker
	 * manager map and clears the solutions.
	 *
	 * @throws IOException
	 * @throws DataNotFoundException
	 * @throws IllegalInputException
	 * @throws ClassNotFoundException
	 * @throws IllegalAccessException
	 * @throws InstantiationException
	 * @throws NoResourcesException
	 *             if no worker manager is registered
	 */
	private void initializeWorkerManagers() throws IOException,
			IllegalInputException, DataNotFoundException,
			InstantiationException, IllegalAccessException,
			ClassNotFoundException, NoResourcesException {
		if (idManagerMap.size() == 0) {
			throw new NoResourcesException(
					"No worker managers available to the Master for queueing jobs");
		}
		int numPartitions = this.gp.partitionGraphs();
		logger.info("Num partitions : " + numPartitions);
		List<List<Integer>> assignedPartitions = this.assignPartitions();
		Map<Integer, Pair<String, String>> partitionWkrMgrMap = this
				.getPartitionMap(assignedPartitions);
		this.initializeWorkerManagers(assignedPartitions);
		// Write partition - worker manager map to file
		DataLocator dl = DataLocator.getDataLocator(gp.getPartitionSize());
		dl.writePartitionMap(partitionWkrMgrMap);
		logger.info("Initialized worker managers : ");
		dl.clearSolutions();
		logger.info("Cleared solutions folder");
	}

	/**
	 * Builds the map from partition number to the (manager id, host info)
	 * pair of the manager that owns it. Managers are matched to partition
	 * lists by map iteration order.
	 *
	 * @param assignedPartitions
	 *            one partition list per registered manager
	 * @return the partition to worker manager map
	 * @throws RemoteException
	 *             if a manager's host info cannot be fetched
	 */
	private Map<Integer, Pair<String, String>> getPartitionMap(
			List<List<Integer>> assignedPartitions) throws RemoteException {
		Map<Integer, Pair<String, String>> partitionWkrMgrMap = new HashMap<Integer, Pair<String, String>>();
		int index = 0;
		for (Map.Entry<String, WorkerManager> e : this.idManagerMap.entrySet()) {
			String thisWkrMgrId = e.getKey();
			WorkerManager thisWkrMgr = e.getValue();
			List<Integer> thisWkrMgrPartitions = assignedPartitions.get(index);
			for (int partition : thisWkrMgrPartitions) {
				partitionWkrMgrMap.put(partition, new Pair<String, String>(
						thisWkrMgrId, thisWkrMgr.getHostInfo()));
			}
			index++;
		}
		return partitionWkrMgrMap;
	}

	/**
	 * Initializes each manager, in map iteration order, with the partition
	 * list at the matching index.
	 *
	 * @param assignedPartitions
	 *            one partition list per registered manager
	 * @throws RemoteException
	 *             if a remote call fails
	 */
	private void initializeWorkerManagers(List<List<Integer>> assignedPartitions)
			throws RemoteException {
		int index = 0;
		for (Map.Entry<String, WorkerManager> e : this.idManagerMap.entrySet()) {
			WorkerManager thisWkrMgr = e.getValue();
			List<Integer> thisWkrMgrPartitions = assignedPartitions.get(index);
			thisWkrMgr.initialize(thisWkrMgrPartitions,
					this.getWorkerMgrThreads(), this.gp.getPartitionSize(),
					this.gp.getNumVertices());
			index++;
		}
	}

	/**
	 * Splits the partition numbers {@code 0 .. numberOfPartitions - 1} into
	 * one consecutive list per registered manager. Every manager receives
	 * {@code numberOfPartitions / managerCount} partitions; the remainder of
	 * that division goes to the last manager.
	 *
	 * @return one partition list per manager, in map iteration order
	 * @throws NoResourcesException
	 *             if no worker manager is registered
	 */
	private List<List<Integer>> assignPartitions() throws NoResourcesException,
			IOException, IllegalInputException, DataNotFoundException,
			InstantiationException, IllegalAccessException,
			ClassNotFoundException {
		if (idManagerMap.size() == 0) {
			throw new NoResourcesException(
					"No worker managers available to the Master for queueing jobs");
		}
		logger.info(idManagerMap.toString());
		List<List<Integer>> assignedPartitions = new Vector<List<Integer>>();
		int numMgrPartitions = this.gp.getNumberOfPartitions()
				/ this.getWorkerMgrsCount();
		List<Integer> wkrMgrPartitions = new Vector<Integer>();
		int partitionCount = 0;
		WorkerManager thisWkrMgr = null;
		for (Map.Entry<String, WorkerManager> e : this.idManagerMap.entrySet()) {
			// The list of the previous manager is published one iteration
			// late, so that the last manager's list can still receive the
			// leftover partitions after the loop.
			if (thisWkrMgr != null) {
				logger.info("Generating partitions for worker manager : "
						+ thisWkrMgr.getId() + " -> " + wkrMgrPartitions);
				assignedPartitions.add(wkrMgrPartitions);
			}
			wkrMgrPartitions = new Vector<Integer>();
			thisWkrMgr = e.getValue();
			for (int i = 0; i < numMgrPartitions; i++, partitionCount++) {
				wkrMgrPartitions.add(partitionCount);
				logger.info("added : " + partitionCount);
			}
			logger.info("End of loop : " + wkrMgrPartitions);
		}
		while (partitionCount < this.gp.getNumberOfPartitions()) {
			wkrMgrPartitions.add(partitionCount);
			partitionCount++;
		}
		assignedPartitions.add(wkrMgrPartitions);
		logger.info("Assigned partitions : " + assignedPartitions);
		return assignedPartitions;
	}

	/*
	 * (non-Javadoc)
	 * 
	 * @see system.ManagerToMaster#endSuperStep(java.lang.String)
	 */
	@Override
	public synchronized void endSuperStep(String wkrMgrId)
			throws RemoteException {
		// Synchronized because several worker managers may report at the
		// same time; the check and the decrement below must not interleave.

		// this fellow shouldn't have returned before .. checking just in
		// case there is a double endSuperStep() done by a worker manager
		// during a stopSuperStep() call
		if (this.returnedManagers.contains(wkrMgrId)) {
			return;
		}
		// Checking if any managers are yet to report completion
		if (this.getParticipatingMgrs() <= 0) {
			return;
		}

		logger.info("Worker manager : " + wkrMgrId
				+ " has reported completion of superstep : "
				+ this.getSuperStep());
		this.setParticipatingMgrs(this.getParticipatingMgrs() - 1);
		this.returnedManagers.add(wkrMgrId);

		if (this.getParticipatingMgrs() == 0) {
			logger.info("All worker managers reported completion of superstep : "
					+ this.getSuperStep());
			// A checkpoint only counts when the superstep was not cut short
			// by a failure (the master is deactivated in that case).
			if (this.isCheckPoint() && this.isActive()) {
				logger.info("#############################");
				logger.info("Checkpointed data at superstep : "
						+ this.getSuperStep());
				logger.info("#############################");
				this.setLastCheckPoint(this.getSuperStep());
			}
			setAllDone(true);
		}
	}
}