/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package se.kth.karamel.backend;

import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import org.apache.log4j.Logger;
import se.kth.karamel.backend.converter.ChefJsonGenerator;
import se.kth.karamel.backend.converter.UserClusterDataExtractor;
import se.kth.karamel.backend.dag.Dag;
import se.kth.karamel.backend.kandy.KandyRestClient;
import se.kth.karamel.backend.launcher.Launcher;
import se.kth.karamel.backend.launcher.amazon.Ec2Launcher;
import se.kth.karamel.backend.launcher.baremetal.BaremetalLauncher;
import se.kth.karamel.backend.launcher.google.GceLauncher;
import se.kth.karamel.backend.launcher.nova.NovaLauncher;
import se.kth.karamel.backend.launcher.occi.OcciLauncher;
import se.kth.karamel.backend.machines.MachinesMonitor;
import se.kth.karamel.backend.running.model.ClusterRuntime;
import se.kth.karamel.backend.running.model.Failure;
import se.kth.karamel.backend.running.model.GroupRuntime;
import se.kth.karamel.backend.running.model.MachineRuntime;
import se.kth.karamel.backend.running.model.tasks.DagBuilder;
import se.kth.karamel.backend.stats.ClusterStatistics;
import se.kth.karamel.common.clusterdef.Baremetal;
import se.kth.karamel.common.clusterdef.Ec2;
import se.kth.karamel.common.clusterdef.Gce;
import se.kth.karamel.common.clusterdef.Nova;
import se.kth.karamel.common.clusterdef.Occi;
import se.kth.karamel.common.clusterdef.Provider;
import se.kth.karamel.common.clusterdef.json.JsonCluster;
import se.kth.karamel.common.clusterdef.json.JsonGroup;
import se.kth.karamel.common.exception.KaramelException;
import se.kth.karamel.common.stats.ClusterStats;
import se.kth.karamel.common.stats.PhaseStat;
import se.kth.karamel.common.util.Settings;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

/**
 *
 * @author kamal
 */
public class ClusterManager implements Runnable {

  public static enum Command {

    LAUNCH_CLUSTER, INTERRUPT_CLUSTER, TERMINATE_CLUSTER,
    SUBMIT_INSTALL_DAG, SUBMIT_PURGE_DAG, INTERRUPT_DAG, PAUSE_DAG, RESUME_DAG;
  }

  public static boolean EXIT_ON_COMPLETION = false;
  private static final Logger logger = Logger.getLogger(ClusterManager.class);
  private final JsonCluster definition;
  private final ClusterRuntime runtime;
  private final MachinesMonitor machinesMonitor;
  private final ClusterStatusMonitor clusterStatusMonitor;
  private Dag currentDag;
  private final BlockingQueue<Command> cmdQueue = new ArrayBlockingQueue<>(2);
  ExecutorService tpool;
  private final ClusterContext clusterContext;
  private final Map<Class<? extends Provider>, Launcher> launchers = new HashMap<>();
  private Future<?> clusterManagerFuture = null;
  private Future<?> machinesMonitorFuture = null;
  private Future<?> clusterStatusFuture = null;
  private boolean stopping = false;
  private final ClusterStats stats = new ClusterStats();

  public ClusterManager(JsonCluster definition, ClusterContext clusterContext) throws KaramelException {
    this.clusterContext = clusterContext;
    this.definition = definition;
    this.runtime = new ClusterRuntime(definition);
    int totalMachines = UserClusterDataExtractor.totalMachines(definition);
    machinesMonitor = new MachinesMonitor(definition.getName(), totalMachines, clusterContext.getSshKeyPair());
    String yaml = ClusterDefinitionService.jsonToYaml(definition);
    this.stats.setDefinition(yaml);
    this.stats.setUserId(Settings.USER_NAME);
    this.stats.setStartTime(System.currentTimeMillis());
    clusterStatusMonitor = new ClusterStatusMonitor(machinesMonitor, definition, runtime, stats);
    initLaunchers();
  }

  public ClusterStats getStats() {
    return stats;
  }

  public Dag getCurrentDag() {
    return currentDag;
  }

  public MachinesMonitor getMachinesMonitor() {
    return machinesMonitor;
  }

  public JsonCluster getDefinition() {
    return definition;
  }

  public ClusterRuntime getRuntime() {
    return runtime;
  }

  /**
   * Non-blocking way of controlling the cluster: quick commands are served immediately, while time-consuming
   * commands are queued and served one by one. Commands have different levels of priority, and higher-priority
   * commands invalidate lower-priority ones.
   *
   * Cluster-scope immediate:
   *   - INTERRUPT_CLUSTER
   * Cluster-scope long-running:
   *   - LAUNCH_CLUSTER
   *   - TERMINATE_CLUSTER
   * DAG-scope immediate:
   *   - INTERRUPT_DAG
   *   - PAUSE_DAG
   *   - RESUME_DAG
   * DAG-scope long-running:
   *   - SUBMIT_INSTALL_DAG
   *   - SUBMIT_PURGE_DAG
   *
   * @param command
   * @throws KaramelException
   */
  public void enqueue(Command command) throws KaramelException {
    ArrayList<Command> clusterScopeQueuingCommands = Lists.newArrayList(
        Command.LAUNCH_CLUSTER, Command.TERMINATE_CLUSTER);
    ArrayList<Command> dagScopeQueuingCommands = Lists.newArrayList(
        Command.SUBMIT_INSTALL_DAG, Command.SUBMIT_PURGE_DAG);
    // A queued cluster-scope command makes any pending DAG-scope command obsolete.
    if (clusterScopeQueuingCommands.contains(command)) {
      cmdQueue.removeAll(dagScopeQueuingCommands);
    }
    switch (command) {
      case LAUNCH_CLUSTER:
        runtime.resolveFailures();
        cmdQueue.offer(command);
        break;
      case INTERRUPT_CLUSTER:
        interupt();
        break;
      case TERMINATE_CLUSTER:
        runtime.resolveFailures();
        cmdQueue.offer(command);
        break;
      case INTERRUPT_DAG:
        cmdQueue.remove(Command.SUBMIT_INSTALL_DAG);
        cmdQueue.remove(Command.SUBMIT_PURGE_DAG);
        cmdQueue.remove(Command.PAUSE_DAG);
        cmdQueue.remove(Command.RESUME_DAG);
        if (runtime.getPhase() == ClusterRuntime.ClusterPhases.RUNNING_DAG) {
          interupt();
        }
        break;
      case SUBMIT_INSTALL_DAG:
        cmdQueue.offer(command);
        break;
      case SUBMIT_PURGE_DAG:
        cmdQueue.offer(command);
        break;
      case PAUSE_DAG:
        pause();
        break;
      case RESUME_DAG:
        resume();
        break;
    }
  }

  public void interupt() {
    if (clusterManagerFuture != null && !clusterManagerFuture.isCancelled()) {
      logger.info(String.format("Forcibly interrupting ClusterManager of '%s'", definition.getName()));
      clusterManagerFuture.cancel(true);
    }
  }

  public void start() {
    tpool = Executors.newFixedThreadPool(3);
    clusterManagerFuture = tpool.submit(this);
    machinesMonitorFuture = tpool.submit(machinesMonitor);
    clusterStatusFuture = tpool.submit(clusterStatusMonitor);
  }

  public void stop() throws InterruptedException {
    machinesMonitor.setStopping(true);
    if (machinesMonitorFuture != null && !machinesMonitorFuture.isCancelled()) {
      logger.info(String.format("Terminating machines monitor of '%s'", definition.getName()));
      machinesMonitorFuture.cancel(true);
    }
    clusterStatusMonitor.setStopping(true);
    if (clusterStatusFuture != null && !clusterStatusFuture.isCancelled()) {
      logger.info(String.format("Terminating cluster status monitor of '%s'", definition.getName()));
      clusterStatusFuture.cancel(true);
    }
    if (clusterManagerFuture != null && !clusterManagerFuture.isCancelled()) {
      logger.info(String.format("Terminating cluster manager of '%s'", definition.getName()));
      clusterManagerFuture.cancel(true);
    }
  }

  private void initLaunchers() throws KaramelException {
    for (JsonGroup group : definition.getGroups()) {
      Provider provider = UserClusterDataExtractor.getGroupProvider(definition, group.getName());
      Launcher launcher = launchers.get(provider.getClass());
      if (launcher == null) {
        if (provider instanceof Ec2) {
          launcher = new Ec2Launcher(clusterContext.getEc2Context(), clusterContext.getSshKeyPair());
        } else if (provider instanceof Baremetal) {
          launcher = new BaremetalLauncher(clusterContext.getSshKeyPair());
        } else if (provider instanceof Gce) {
          launcher = new GceLauncher(clusterContext.getGceContext(), clusterContext.getSshKeyPair());
        } else if (provider instanceof Nova) {
          launcher = new NovaLauncher(clusterContext.getNovaContext(), clusterContext.getSshKeyPair());
        } else if (provider instanceof Occi) {
          launcher = new OcciLauncher(clusterContext.getOcciContext(), clusterContext.getSshKeyPair());
        }
        launchers.put(provider.getClass(), launcher);
      }
    }
  }

  private void clean(boolean terminating) {
    if (!terminating) {
      LogService.cleanup(definition.getName());
      logger.info(String.format("Prelaunch Cleaning '%s' ...", definition.getName()));
      runtime.setPhase(ClusterRuntime.ClusterPhases.PRECLEANING);
    }
    runtime.resolveFailures();
    List<GroupRuntime> groups = runtime.getGroups();
    List<GroupRuntime> affectedGroups = new ArrayList<>();
    for (GroupRuntime group : groups) {
      if (terminating) {
        group.setPhase(GroupRuntime.GroupPhase.TERMINATING);
      } else {
        group.setPhase(GroupRuntime.GroupPhase.PRECLEANING);
      }
      group.getCluster().resolveFailures();
      affectedGroups.add(group);
    }
    try {
      // Each launcher cleans up the resources it owns for this cluster.
      for (Launcher launcher : launchers.values()) {
        launcher.cleanup(definition, runtime);
      }
      for (GroupRuntime group : affectedGroups) {
        if (terminating) {
          group.setMachines(Collections.<MachineRuntime>emptyList());
          group.setPhase(GroupRuntime.GroupPhase.NONE);
        } else {
          group.setPhase(GroupRuntime.GroupPhase.PRECLEANED);
        }
      }
    } catch (Exception ex) {
      if (!(ex.getCause() instanceof InterruptedException && stopping)) {
        logger.error("", ex);
        runtime.issueFailure(new Failure(Failure.Type.CLEANUP_FAILE, ex.getMessage()));
      }
    }
    if (!terminating && !runtime.isFailed()) {
      runtime.setPhase(ClusterRuntime.ClusterPhases.PRECLEANED);
      logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' PRECLEANED \\o/\\o/\\o/\\o/\\o/", definition.getName()));
    }
  }

  private void forkGroups() throws InterruptedException {
    logger.info(String.format("Forking groups '%s' ...", definition.getName()));
    runtime.setPhase(ClusterRuntime.ClusterPhases.FORKING_GROUPS);
    runtime.resolveFailures();
    List<GroupRuntime> groups = runtime.getGroups();
    for (GroupRuntime group : groups) {
      if (group.getPhase() == GroupRuntime.GroupPhase.PRECLEANED
          || (group.getPhase() == GroupRuntime.GroupPhase.FORKING_GROUPS)) {
        runtime.resolveFailure(Failure.hash(Failure.Type.CREATING_SEC_GROUPS_FAILE, group.getName()));
        group.setPhase(GroupRuntime.GroupPhase.FORKING_GROUPS);
        Provider provider = UserClusterDataExtractor.getGroupProvider(definition, group.getName());
        Launcher launcher = launchers.get(provider.getClass());
        try {
          String groupId = launcher.forkGroup(definition, runtime, group.getName());
          group.setId(groupId);
          group.setPhase(GroupRuntime.GroupPhase.GROUPS_FORKED);
        } catch (Exception ex) {
          if (ex instanceof InterruptedException) {
            throw (InterruptedException) ex;
          } else {
            logger.error("", ex);
          }
          runtime.issueFailure(new Failure(Failure.Type.CREATING_SEC_GROUPS_FAILE, group.getName(),
              ex.getMessage()));
        }
      }
    }
    if (!runtime.isFailed()) {
      runtime.setPhase(ClusterRuntime.ClusterPhases.GROUPS_FORKED);
      logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' GROUPS_FORKED \\o/\\o/\\o/\\o/\\o/", definition.getName()));
    }
  }

  private void runDag(boolean installDag) throws Exception {
    logger.info(String.format("Running the DAG for '%s' ...", definition.getName()));
    if (currentDag != null) {
      logger.info(String.format("Terminating the previous DAG before running the new one for '%s' ...",
          definition.getName()));
      currentDag.termiante();
    }
    runtime.setPhase(ClusterRuntime.ClusterPhases.RUNNING_DAG);
    runtime.resolveFailure(Failure.hash(Failure.Type.DAG_FAILURE, null));
    List<GroupRuntime> groups = runtime.getGroups();
    for (GroupRuntime group : groups) {
      group.setPhase(GroupRuntime.GroupPhase.RUNNING_DAG);
    }
    try {
      if (installDag) {
        Map<String, JsonObject> chefJsons = ChefJsonGenerator.
            generateClusterChefJsonsForInstallation(definition, runtime);
        currentDag = DagBuilder.getInstallationDag(definition, runtime, stats, machinesMonitor, chefJsons);
      } else {
        Map<String, JsonObject> chefJsons = ChefJsonGenerator.
            generateClusterChefJsonsForPurge(definition, runtime);
        currentDag = DagBuilder.getPurgingDag(definition, runtime, stats, machinesMonitor, chefJsons);
      }
      currentDag.start();
    } catch (Exception ex) {
      runtime.issueFailure(new Failure(Failure.Type.DAG_FAILURE, ex.getMessage()));
      throw ex;
    }
    // Poll until the DAG finishes or the cluster leaves the RUNNING_DAG phase.
    while (runtime.getPhase() == ClusterRuntime.ClusterPhases.RUNNING_DAG && !currentDag.isDone()) {
      Thread.sleep(Settings.CLUSTER_STATUS_CHECKING_INTERVAL);
    }
    if (!runtime.isFailed()) {
      runtime.setPhase(ClusterRuntime.ClusterPhases.DAG_DONE);
      for (GroupRuntime group : groups) {
        group.setPhase(GroupRuntime.GroupPhase.DAG_DONE);
      }
      logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' DAG IS DONE \\o/\\o/\\o/\\o/\\o/", definition.getName()));
      if (ClusterManager.EXIT_ON_COMPLETION) {
        System.exit(0);
      }
    }
  }

  private void pause() {
    logger.info(String.format("Pausing '%s'", definition.getName()));
    machinesMonitor.pause();
    runtime.setPaused(true);
    logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' PAUSED \\o/\\o/\\o/\\o/\\o/", definition.getName()));
  }

  private void resume() {
    logger.info(String.format("Resuming '%s'", definition.getName()));
    machinesMonitor.resume();
    runtime.setPaused(false);
    logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' RESUMED \\o/\\o/\\o/\\o/\\o/", definition.getName()));
  }

  private void terminate() throws InterruptedException, KaramelException {
    logger.info(String.format("Terminating '%s' ...", definition.getName()));
    runtime.setPhase(ClusterRuntime.ClusterPhases.TERMINATING);
    stopping = true;
    clean(true);
    stop();
    runtime.setPhase(ClusterRuntime.ClusterPhases.NOT_STARTED);
    KandyRestClient.pushClusterStats(definition.getName(), stats);
    logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' TERMINATED \\o/\\o/\\o/\\o/\\o/", definition.getName()));
  }

  private void forkMachines() throws Exception {
    logger.info(String.format("Launching '%s' ...", definition.getName()));
    runtime.setPhase(ClusterRuntime.ClusterPhases.FORKING_MACHINES);
    runtime.resolveFailure(Failure.hash(Failure.Type.FORK_MACHINE_FAILURE, null));
    List<GroupRuntime> groups = runtime.getGroups();
    for (GroupRuntime group : groups) {
      if (group.getPhase() == GroupRuntime.GroupPhase.GROUPS_FORKED
          || (group.getPhase() == GroupRuntime.GroupPhase.FORKING_MACHINES)) {
        group.setPhase(GroupRuntime.GroupPhase.FORKING_MACHINES);
        runtime.resolveFailure(Failure.hash(Failure.Type.FORK_MACHINE_FAILURE, group.getName()));
        Provider provider = UserClusterDataExtractor.getGroupProvider(definition, group.getName());
        Launcher launcher = launchers.get(provider.getClass());
        try {
          List<MachineRuntime> mcs = launcher.forkMachines(definition, runtime, group.getName());
          group.setMachines(mcs);
          machinesMonitor.addMachines(mcs);
          group.setPhase(GroupRuntime.GroupPhase.MACHINES_FORKED);
        } catch (Exception ex) {
          runtime.issueFailure(new Failure(Failure.Type.FORK_MACHINE_FAILURE, group.getName(), ex.getMessage()));
          throw ex;
        }
      }
    }
    if (!runtime.isFailed()) {
      runtime.setPhase(ClusterRuntime.ClusterPhases.MACHINES_FORKED);
      logger.info(String.format("\\o/\\o/\\o/\\o/\\o/'%s' MACHINES_FORKED \\o/\\o/\\o/\\o/\\o/", definition.getName()));
    }
  }

  @Override
  public void run() {
    logger.info(String.format("Cluster-Manager started for '%s' d'-'", definition.getName()));
    while (true) {
      try {
        Command cmd = cmdQueue.take();
        logger.info(String.format("Going to serve '%s'", cmd.toString()));
        switch (cmd) {
          case LAUNCH_CLUSTER:
            if (runtime.getPhase() == ClusterRuntime.ClusterPhases.NOT_STARTED
                || (runtime.getPhase() == ClusterRuntime.ClusterPhases.PRECLEANING && runtime.isFailed())) {
              ClusterStatistics.startTimer();
              clean(false);
              long duration = ClusterStatistics.stopTimer();
              String status = runtime.isFailed() ? "FAILED" : "SUCCEED";
              PhaseStat phaseStat = new PhaseStat(ClusterRuntime.ClusterPhases.PRECLEANING.name(), status, duration);
              stats.addPhase(phaseStat);
            }
            if (runtime.getPhase() == ClusterRuntime.ClusterPhases.PRECLEANED
                || (runtime.getPhase() == ClusterRuntime.ClusterPhases.FORKING_GROUPS && runtime.isFailed())) {
              ClusterStatistics.startTimer();
              forkGroups();
              long duration = ClusterStatistics.stopTimer();
              String status = runtime.isFailed() ? "FAILED" : "SUCCEED";
              PhaseStat phaseStat = new PhaseStat(ClusterRuntime.ClusterPhases.FORKING_GROUPS.name(), status, duration);
              stats.addPhase(phaseStat);
            }
            if (runtime.getPhase() == ClusterRuntime.ClusterPhases.GROUPS_FORKED
                || (runtime.getPhase() == ClusterRuntime.ClusterPhases.FORKING_MACHINES && runtime.isFailed())) {
              ClusterStatistics.startTimer();
              forkMachines();
              long duration = ClusterStatistics.stopTimer();
              String status = runtime.isFailed() ? "FAILED" : "SUCCEED";
              PhaseStat phaseStat = new PhaseStat(ClusterRuntime.ClusterPhases.FORKING_MACHINES.name(), status,
                  duration);
              stats.addPhase(phaseStat);
            }
            break;
          case SUBMIT_INSTALL_DAG:
            if (runtime.getPhase().ordinal() >= ClusterRuntime.ClusterPhases.MACHINES_FORKED.ordinal()
                && (runtime.getPhase().ordinal() <= ClusterRuntime.ClusterPhases.DAG_DONE.ordinal())) {
              ClusterStatistics.startTimer();
              runDag(true);
              long duration = ClusterStatistics.stopTimer();
              String status = runtime.isFailed() ? "FAILED" : "SUCCEED";
              PhaseStat phaseStat = new PhaseStat(ClusterRuntime.ClusterPhases.RUNNING_DAG.name(), status, duration);
              stats.addPhase(phaseStat);
            }
            break;
          case SUBMIT_PURGE_DAG:
            if (runtime.getPhase().ordinal() >= ClusterRuntime.ClusterPhases.MACHINES_FORKED.ordinal()
                && (runtime.getPhase().ordinal() <= ClusterRuntime.ClusterPhases.DAG_DONE.ordinal())) {
              ClusterStatistics.startTimer();
              runDag(false);
              long duration = ClusterStatistics.stopTimer();
              String status = runtime.isFailed() ?
"FAILED" : "SUCCEED"; PhaseStat phaseStat = new PhaseStat(ClusterRuntime.ClusterPhases.RUNNING_DAG.name(), status, duration); stats.addPhase(phaseStat); } break; case TERMINATE_CLUSTER: ClusterStatistics.startTimer(); terminate(); long duration = ClusterStatistics.stopTimer(); String status = runtime.isFailed() ? "FAILED" : "SUCCEED"; PhaseStat phaseStat = new PhaseStat(ClusterRuntime.ClusterPhases.TERMINATING.name(), status, duration); stats.addPhase(phaseStat); break; } } catch (java.lang.InterruptedException ex) { if (stopping) { tpool.shutdownNow(); try { tpool.awaitTermination(1, TimeUnit.MINUTES); } catch (InterruptedException ex1) { } logger.info(String.format("Cluster-Manager stoped for '%s' d'-'", definition.getName())); return; } else { logger.warn("Got interrupted, perhaps a higher priority command is comming on.."); } } catch (Exception ex) { logger.error("", ex); } } } }