package uk.ac.imperial.lsds.seepmaster.scheduler.schedulingstrategy; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import uk.ac.imperial.lsds.seep.api.DataReference; import uk.ac.imperial.lsds.seep.api.RuntimeEvent; import uk.ac.imperial.lsds.seep.api.RuntimeEventTypes; import uk.ac.imperial.lsds.seep.api.SeepChooseTask; import uk.ac.imperial.lsds.seep.comm.protocol.Command; import uk.ac.imperial.lsds.seep.scheduler.Stage; import uk.ac.imperial.lsds.seep.scheduler.StageType; import uk.ac.imperial.lsds.seepmaster.scheduler.ClusterDatasetRegistry; import uk.ac.imperial.lsds.seepmaster.scheduler.ScheduleTracker; public class MDFSchedulingStrategy implements SchedulingStrategy { private Map<Integer, Map<Integer, List<Object>>> evaluatedResultsPerChooseStage = new HashMap<>(); private Set<Integer> chooseCandidates = new HashSet<>(); @Override public Stage next(ScheduleTracker tracker, Map<Integer, List<RuntimeEvent>> rEvents) { Stage head = tracker.getHead(); Stage nextToSchedule = nextStageToSchedule(head, tracker); // We want to intercept when the next stage to schedule is CHOOSE, as this means that the explore is done // and that we have already chosen one (CHOOSE has executed eagerly every time an upstream finished running // Here we want to capture the outputs (real data output) of the chosen upstream stage and put them as input of // CHOOSE downstreams, then, make CHOOSE finished and go on with the scheduling process, by calling recursively // to this function if(nextToSchedule.getStageType() == StageType.CHOOSE_STAGE) { // Check whether we have finished the choose stage and we can go ahead // TODO: pick stage according to the currentBestCandidate Set<Stage> upstream = nextToSchedule.getDependencies(); Map<Integer, Set<DataReference>> chosenResultsOfStage = new HashMap<>(); for(Stage s : upstream) { int stageId = s.getStageId(); Set<DataReference> inputs = nextToSchedule.getInputDataReferences().get(stageId); if(chooseCandidates.contains(stageId)) { // Filter out potential inputs of CHOOSE to get only the chosen one chosenResultsOfStage.put(nextToSchedule.getStageId(), inputs); } } // Say that choose is done and assign results to its downstream stages tracker.setFinished(nextToSchedule, chosenResultsOfStage); // Reset CHOOSE structures to support next potential choose evaluatedResultsPerChooseStage = new HashMap<>(); chooseCandidates = new HashSet<>(); // Call recursively to next so that we give worker a stage to schedule nextToSchedule = next(tracker, null); } return nextToSchedule; } private Stage nextStageToSchedule(Stage head, ScheduleTracker tracker) { Stage toReturn = null; if(tracker.isStageReady(head)) { toReturn = head; } else { for(Stage upstream : head.getDependencies()) { if(! tracker.isStageFinished(upstream)) { toReturn = nextStageToSchedule(upstream, tracker); } } } return toReturn; } @Override public List<Command> postCompletion(Stage finishedStage, ScheduleTracker tracker) { // When the recently completed stage is upstream to a Choose task, then we need to store the results it produced and // evaluate them (with the choose task). For that we capture here the runtime events and filter those that contain // results. // IMPORTANT: By results, we mean the results of the EVALUATE function, and not the actual data produced by the // stage that is upstream to the EVALUATE. See "picture" below // SOME_FUNCTION_A ---> EVALUATE ----> CHOOSE_A // SOME_FUNCTION_B ---> EVALUATE ----> CHOOSE_A // SOME_FUNCTION_C ---> EVALUATE ----> CHOOSE_A // That graph depicts a situation where a explore generated three possible instantiations of SOME_FUNCTION, that we // call A, B and C. The three results produced by those are evaluated by EVALUATE, that runs in the cluster in parallel. // In particular, SEEP will pipelines EVALUATE with SOME_FUNCTION. EVALUATE must be implemented by the user of the system // and must generate a RuntimeEvent that contains EvaluatedResults. Then, this scheduler gets those results and uses // them to evaluate CHOOSE. This happens below. // // Some additional detail. SOME_FUNCTION_A, B and C are scheduled sequentially in the cluster (hence each has all // resources available). Whenever one of them finishes, its results are captured here. CHOOSE can be run pairwise. // In that way, as soon as Choose knows whether A or B is better, it can instruct to discard the other results. // This eager evaluation leads to a situation where C and other subsequent tasks will have more memory available. List<Command> commands = new ArrayList<>(); Map<Integer, List<RuntimeEvent>> rEvents = tracker.getRuntimeEventsOfLastStageExecution(); // STORE EVALUATED RESULTS FOR CURRENT STAGE if( ! finishedStage.getDependants().isEmpty()) { StageType st = finishedStage.getDependants().iterator().next().getStageType(); if(st == StageType.CHOOSE_STAGE) { Stage chooseStage = finishedStage.getDependants().iterator().next(); int seepChooseTaskId = chooseStage.getWrappedOperators().iterator().next(); SeepChooseTask sct = (SeepChooseTask) tracker.getScheduleDescription().getOperatorWithId(seepChooseTaskId).getSeepTask(); rEvents = tracker.getRuntimeEventsOfLastStageExecution(); List<Object> evalResult = new ArrayList<>(); for(List<RuntimeEvent> re : rEvents.values()) { for(RuntimeEvent r : re) { if(r.getEvaluateResultsRuntimeEvent().type() == RuntimeEventTypes.EVALUATE_RESULT.ofType()) { evalResult.add(r.getEvaluateResultsRuntimeEvent().getEvaluateResults()); } } } put(seepChooseTaskId, finishedStage.getStageId(), evalResult, evaluatedResultsPerChooseStage); //evaluatedResults.put(finishedStage.getStageId(), evalResult); Map<Integer, List<Object>> evaluatedResults = evaluatedResultsPerChooseStage.get(seepChooseTaskId); // Evaluate choose and get list of stages whose values are still useful this.chooseCandidates = sct.choose(evaluatedResults); ClusterDatasetRegistry cr = tracker.getClusterDatasetRegistry(); // TODO: difference between evaluatedResults and goOn are datasets to evict for(int stageId : evaluatedResults.keySet()) { if(! chooseCandidates.contains(stageId)) { // get upstream of CHOOSE (which is my downstream), then go over output results and get all the // datasets Id, which together in a list are the payload of an eviction command. Stage choose = finishedStage.getDependants().iterator().next(); Map<Integer, Set<DataReference>> badInputs = choose.getInputDataReferences(); for(DataReference drToEvict : badInputs.get(stageId)) { cr.evictDatasetFromCluster(drToEvict.getId()); } } } } } return commands; } private void put(int chooseStageId, int finishedStageId, List<Object> evalResultsOfThisStage, Map<Integer, Map<Integer, List<Object>>> evalResultsPerChooseStage) { if(! evalResultsPerChooseStage.containsKey(chooseStageId)) { evalResultsPerChooseStage.put(chooseStageId, new HashMap<>()); } evalResultsPerChooseStage.get(chooseStageId).put(finishedStageId, evalResultsOfThisStage); } }