package uk.ac.imperial.lsds.seepmaster.scheduler; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import uk.ac.imperial.lsds.seep.api.DataReference; import uk.ac.imperial.lsds.seep.api.DataStore; import uk.ac.imperial.lsds.seep.api.RuntimeEvent; import uk.ac.imperial.lsds.seep.api.operator.LogicalOperator; import uk.ac.imperial.lsds.seep.api.operator.UpstreamConnection; import uk.ac.imperial.lsds.seep.comm.Comm; import uk.ac.imperial.lsds.seep.comm.Connection; import uk.ac.imperial.lsds.seep.comm.protocol.Command; import uk.ac.imperial.lsds.seep.comm.protocol.StageStatusCommand; import uk.ac.imperial.lsds.seep.core.DatasetMetadataPackage; import uk.ac.imperial.lsds.seep.scheduler.ScheduleDescription; import uk.ac.imperial.lsds.seep.scheduler.Stage; import uk.ac.imperial.lsds.seep.scheduler.StageStatus; import uk.ac.imperial.lsds.seep.scheduler.StageType; import uk.ac.imperial.lsds.seepmaster.infrastructure.master.InfrastructureManager; import uk.ac.imperial.lsds.seepmaster.scheduler.loadbalancing.LoadBalancingStrategy; import uk.ac.imperial.lsds.seepmaster.scheduler.memorymanagement.MDFMemoryManagementPolicy; import uk.ac.imperial.lsds.seepmaster.scheduler.memorymanagement.MemoryManagementPolicy; import uk.ac.imperial.lsds.seepmaster.scheduler.memorymanagement.MemoryManagementPolicyType; import uk.ac.imperial.lsds.seepmaster.scheduler.memorymanagement.SizeObliviousLRUMemoryManagementPolicy; import uk.ac.imperial.lsds.seepmaster.scheduler.schedulingstrategy.SchedulingStrategy; import com.esotericsoftware.kryo.Kryo; public class SchedulerEngineWorker implements Runnable { final private Logger LOG = LoggerFactory.getLogger(SchedulerEngineWorker.class); private ScheduleDescription scheduleDescription; private SchedulingStrategy schedulingStrategy; private LoadBalancingStrategy loadBalancingStrategy; private ScheduleTracker tracker; private InfrastructureManager inf; private Set<Connection> connections; private Comm comm; private Kryo k; private boolean work = true; // Metrics private long __time_assignWork; private long __time_postCompletion; public SchedulerEngineWorker(ScheduleDescription sdesc, SchedulingStrategy schedulingStrategy, LoadBalancingStrategy loadBalancingStrategy, int mmpType, double dmRatio, InfrastructureManager inf, Comm comm, Kryo k) { this.scheduleDescription = sdesc; this.schedulingStrategy = schedulingStrategy; this.loadBalancingStrategy = loadBalancingStrategy; MemoryManagementPolicy mmp = buildMMP(mmpType, sdesc, dmRatio); this.tracker = new ScheduleTracker(scheduleDescription, mmp); this.inf = inf; this.comm = comm; this.k = k; } private MemoryManagementPolicy buildMMP(int type, ScheduleDescription sd, double dmRatio) { MemoryManagementPolicy mmp = null; if(MemoryManagementPolicyType.LRU.ofType() == type) { mmp = new SizeObliviousLRUMemoryManagementPolicy(); } else if(MemoryManagementPolicyType.MDF.ofType() == type) { mmp = new MDFMemoryManagementPolicy(sd, dmRatio); } return mmp; } public void stop() { this.work = false; } @Override public void run() { LOG.info("[START JOB]"); LOG.info("Scheduling mode: " + this.schedulingStrategy.toString()); long scheduleStart = System.nanoTime(); while(work) { if(tracker.isScheduledFinished()) { long scheduleFinish = System.nanoTime(); long totalScheduleTime = scheduleFinish - scheduleStart; LOG.info("[END JOB] !!! {}", totalScheduleTime); int totalDatasets = tracker.getClusterDatasetRegistry().totalDatasetsGeneratedDuringSchedule(); int totalSpilledDatasets = tracker.getClusterDatasetRegistry().totalDatasetsSpilledToDiskDuringSchedule(); double ratio = (double)totalSpilledDatasets/(double)totalDatasets; double ratioMemory = (1 - ratio); int ratioMemVSDiskAccessedData = tracker.getClusterDatasetRegistry().percentageOfTotalDataAccessedFromMem(); String memUtilization = tracker.getClusterDatasetRegistry().getHistoricMemUtilization(); long totalUpdateTime = tracker.getClusterDatasetRegistry().getMMP().__totalUpdateTime(); LOG.info("Total time spend updating dataset metadata: {}", totalUpdateTime); long totalRankTime = tracker.getClusterDatasetRegistry().getMMP().__totalRankTime(); LOG.info("Total time spend ranking datasets: {}", totalRankTime); LOG.info("Total datasets generated in schedule: {}", totalDatasets); LOG.info("Total datasets spilled during schedule: {}", totalSpilledDatasets); LOG.info("Total time assigning work: {}", this.__time_assignWork); LOG.info("Total time post completion work: {}", this.__time_assignWork); long freeingTime = tracker.getClusterDatasetRegistry().totalTimeFreeingDatasets(); LOG.info("Total time freeing datasets: {}", freeingTime); LOG.info("Ratio hit/miss: {}", ratioMemory); LOG.info("Ratio memAccessedData/diskAccessedData: {}", ratioMemVSDiskAccessedData); LOG.info("Historic mem utilization: {}", memUtilization); work = false; continue; } // At the end of one iteration the worker will have populated the commands that need to be sent to the cluster // Some of these commands are schedule stage commands. Other are about evicting datasets, etc. List<CommandToNode> commands = new ArrayList<>(); Map<Integer, List<RuntimeEvent>> rEvents = null; // Check whether the last executed stage generated runtime events that need to be handled here if(tracker.didLastStageGenerateRuntimeEvents()) { rEvents = tracker.getRuntimeEventsOfLastStageExecution(); // CURRENT EVENTS: // OutOfMemory a dataset was spilled to disk, update any info that exists here about that // A loop was finished, bear that in mind to choose the next stage to schedule } // Get next stage // TODO: make next return a List of next stages Stage nextStage = schedulingStrategy.next(tracker, rEvents); // TODO: (parallel sched) make this receive a list of stages long start = System.nanoTime(); List<CommandToNode> schedCommands = loadBalancingStrategy.assignWorkToWorkers(nextStage, inf, tracker); long end = System.nanoTime(); this.__time_assignWork =__time_assignWork + (end - start); commands.addAll(schedCommands); // append scheduling commands to the commands necessary to send to the cluster // FIXME: avoid extracting conns here. They need to be extracted again immediately after // we should have a tracker entity that receives progressively what to track, and then we // just pass info (the connections) to that guy long stageStart = System.nanoTime(); Set<Connection> euInvolved = new HashSet<>(); for(CommandToNode ctn : commands) { euInvolved.add(ctn.c); } // TODO: (parallel sched) adapt tracking structures to track multiple stages simultaneously trackStageCompletionAsync(nextStage, euInvolved); LOG.info("[START] SCHEDULING Stage {}", nextStage.getStageId()); for(CommandToNode ctn : commands) { boolean success = comm.send_object_sync(ctn.command, ctn.c, k); } // TODO: make this compatible with waiting for multiple parallel schedule stages tracker.waitForFinishedStageAndCompleteBookeeping(nextStage); // Call the post processing event start = System.nanoTime(); List<Command> postCommands = schedulingStrategy.postCompletion(nextStage, tracker); end = System.nanoTime(); this.__time_postCompletion =__time_postCompletion + (end - start); long stageFinish = System.nanoTime(); long totalStageTime = stageFinish - stageStart; LOG.warn("Stage {} finished in: ! {}", nextStage.getStageId(), totalStageTime); if(! commands.isEmpty()) { // TODO: } } } private void trackStageCompletionAsync(Stage stage, Set<Connection> euInvolved) { // Just start the tracker async new Thread(new Runnable() { public void run() { // Wait until stage is completed Set<Integer> euIds = new HashSet<>(); for(Connection c : euInvolved) { euIds.add(c.getId()); } tracker.trackWorkersAndBlock(stage, euIds); } }).start(); } public boolean prepareForStart(Set<Connection> connections) { // Set initial connections in worker this.connections = connections; // Basically change stage status so that SOURCE tasks are ready to run boolean success = true; for(Stage stage : scheduleDescription.getStages()) { if(stage.getStageType().equals(StageType.UNIQUE_STAGE) || stage.getStageType().equals(StageType.SOURCE_STAGE)) { configureInputForInitialStage(connections, stage, scheduleDescription); boolean changed = tracker.setReady(stage); success = success && changed; } } return success; } private void configureInputForInitialStage(Set<Connection> connections, Stage s, ScheduleDescription sd) { // Check whether the stage needs to be configured or whether it comes configured already // such as in the case of a handcrafted schedule if(! s.getInputDataReferences().isEmpty()) { // It's already configured return; } // Get input type from first operator int srcOpId = s.getWrappedOperators().getFirst(); LogicalOperator src = sd.getOperatorWithId(srcOpId); Set<DataReference> refs = new HashSet<>(); // We need to get the DataStore to configure a DataReference DataStore dataStore = null; // We handle here the special case of having a marker source operator, in which case it dissapeared and is null for(UpstreamConnection uc : src.upstreamConnections()) { if (uc.getUpstreamOperator() == null) { dataStore = uc.getDataStore(); } } // If dataStore was not set above, then there is a real source operator, that we set here if(dataStore == null) { dataStore = src.upstreamConnections().iterator().next().getUpstreamOperator().upstreamConnections().iterator().next().getDataStore(); } int streamId = 0; // only one streamId for sources in scheduled mode DataReference dr = DataReference.makeExternalDataReference(dataStore); refs.add(dr); s.addInputDataReference(streamId, refs); } public void newStageStatus(int stageId, int euId, Map<Integer, Set<DataReference>> results, StageStatusCommand.Status status, List<RuntimeEvent> runtimeEvents, DatasetMetadataPackage managedDatasets) { switch(status) { case OK: LOG.info("EU {} finishes stage {}", euId, stageId); tracker.finishStage(euId, stageId, results, runtimeEvents, managedDatasets); break; case FAIL: LOG.info("EU {} has failed executing stage {}", euId, stageId); break; default: LOG.error("Unrecognized STATUS in StageStatusCommand"); } } /** Methods to facilitate testing **/ public ScheduleTracker __tracker_for_testing(){ return tracker; } public Stage __next_stage_scheduler(){ return schedulingStrategy.next(tracker, null); } public void __reset_schedule() { tracker.resetAllStagesTo(StageStatus.WAITING); } }