/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

/**
 * A {@link TaskScheduler} that implements fair sharing.
 */
public class FairScheduler extends TaskScheduler {
  public static final Log LOG = LogFactory.getLog(
      "org.apache.hadoop.mapred.FairScheduler");

  // How often fair shares are re-calculated
  protected long updateInterval = 500;

  // How often to dump scheduler state to the event log
  protected long dumpInterval = 10000;

  // How often tasks are preempted (must be longer than a couple
  // of heartbeats to give task-kill commands a chance to act).
  protected long preemptionInterval = 15000;

  // Used to iterate through map and reduce task types
  private static final TaskType[] MAP_AND_REDUCE =
      new TaskType[] {TaskType.MAP, TaskType.REDUCE};

  // Maximum locality delay when auto-computing locality delays
  private static final long MAX_AUTOCOMPUTED_LOCALITY_DELAY = 15000;

  protected PoolManager poolMgr;
  protected LoadManager loadMgr;
  protected TaskSelector taskSelector;
  protected WeightAdjuster weightAdjuster; // Can be null for no weight adjuster
  protected Map<JobInProgress, JobInfo> infos = // per-job scheduling variables
      new HashMap<JobInProgress, JobInfo>();
  protected long lastUpdateTime;           // Time when we last updated infos
  protected long lastPreemptionUpdateTime; // Time when we last updated preemption vars
  protected boolean initialized;      // Are we initialized?
  protected volatile boolean running; // Are we running?
  protected boolean assignMultiple;   // Simultaneously assign map and reduce?
  protected int mapAssignCap = -1;    // Max maps to launch per heartbeat
  protected int reduceAssignCap = -1; // Max reduces to launch per heartbeat
  protected long nodeLocalityDelay;   // Time to wait for node locality
  protected long rackLocalityDelay;   // Time to wait for rack locality
  protected boolean autoComputeLocalityDelay = false; // Compute locality delay
                                                      // from heartbeat interval
  protected boolean sizeBasedWeight; // Give larger weights to larger jobs
  protected boolean waitForMapsBeforeLaunchingReduces = true;
  protected boolean preemptionEnabled;
  protected boolean onlyLogPreemption; // Only log when tasks should be killed
  private Clock clock;
  private JobListener jobListener;
  private JobInitializer jobInitializer;
  private boolean mockMode; // Used for unit tests; disables background updates
                            // and scheduler event log
  private FairSchedulerEventLog eventLog;
  protected long lastDumpTime;       // Time when we last dumped state to log
  protected long lastHeartbeatTime;  // Time we last ran assignTasks
  private long lastPreemptCheckTime; // Time we last ran preemptTasksIfNecessary

  /**
   * A configuration property that controls whether jobs may be submitted to
   * pools that are not declared in the scheduler allocation file.
   */
  public final static String ALLOW_UNDECLARED_POOLS_KEY =
      "mapred.fairscheduler.allow.undeclared.pools";

  private boolean allowUndeclaredPools = false;

  /**
   * A class for holding per-job scheduler variables. These always contain the
   * values of the variables at the last update(), and are used along with a
   * time delta to update the map and reduce deficits before a new update().
   */
  static class JobInfo {
    boolean runnable = false; // Can the job run given user/pool limits?
    // Does this job need to be initialized?
    volatile boolean needsInitializing = true;
    public JobSchedulable mapSchedulable;
    public JobSchedulable reduceSchedulable;
    // Variables used for delay scheduling
    LocalityLevel lastMapLocalityLevel; // Locality level of last map launched
    long timeWaitedForLocalMap; // Time waiting for local map since last map
    boolean skippedAtLastHeartbeat; // Was job skipped at previous assignTasks?
                                    // (used to update timeWaitedForLocalMap)

    public JobInfo(JobSchedulable mapSched, JobSchedulable reduceSched) {
      this.mapSchedulable = mapSched;
      this.reduceSchedulable = reduceSched;
      this.lastMapLocalityLevel = LocalityLevel.NODE;
    }
  }

  public FairScheduler() {
    this(new Clock(), false);
  }

  /**
   * Constructor used for tests, which can change the clock and disable updates.
   */
  protected FairScheduler(Clock clock, boolean mockMode) {
    this.clock = clock;
    this.mockMode = mockMode;
    this.jobListener = new JobListener();
  }

  @Override
  public void start() {
    try {
      Configuration conf = getConf();
      // Create scheduling log and initialize it if it is enabled
      eventLog = new FairSchedulerEventLog();
      boolean logEnabled = conf.getBoolean(
          "mapred.fairscheduler.eventlog.enabled", false);
      if (!mockMode && logEnabled) {
        String hostname = "localhost";
        if (taskTrackerManager instanceof JobTracker) {
          hostname = ((JobTracker) taskTrackerManager).getJobTrackerMachine();
        }
        eventLog.init(conf, hostname);
      }
      // Initialize other pieces of the scheduler
      jobInitializer = new JobInitializer(conf, taskTrackerManager);
      taskTrackerManager.addJobInProgressListener(jobListener);
      poolMgr = new PoolManager(this);
      poolMgr.initialize();
      loadMgr = (LoadManager) ReflectionUtils.newInstance(
          conf.getClass("mapred.fairscheduler.loadmanager",
              CapBasedLoadManager.class, LoadManager.class), conf);
      loadMgr.setTaskTrackerManager(taskTrackerManager);
      loadMgr.setEventLog(eventLog);
      loadMgr.start();
      taskSelector = (TaskSelector) ReflectionUtils.newInstance(
          conf.getClass("mapred.fairscheduler.taskselector",
              DefaultTaskSelector.class, TaskSelector.class), conf);
      taskSelector.setTaskTrackerManager(taskTrackerManager);
      taskSelector.start();
      Class<?> weightAdjClass = conf.getClass(
          "mapred.fairscheduler.weightadjuster", null);
      if (weightAdjClass != null) {
        weightAdjuster = (WeightAdjuster) ReflectionUtils.newInstance(
            weightAdjClass, conf);
      }
      updateInterval = conf.getLong(
          "mapred.fairscheduler.update.interval", 500);
      dumpInterval = conf.getLong(
          "mapred.fairscheduler.dump.interval", 10000);
      preemptionInterval = conf.getLong(
          "mapred.fairscheduler.preemption.interval", 15000);
      assignMultiple = conf.getBoolean(
          "mapred.fairscheduler.assignmultiple", true);
      mapAssignCap = conf.getInt(
          "mapred.fairscheduler.assignmultiple.maps", -1);
      reduceAssignCap = conf.getInt(
          "mapred.fairscheduler.assignmultiple.reduces", -1);
      sizeBasedWeight = conf.getBoolean(
          "mapred.fairscheduler.sizebasedweight", false);
      preemptionEnabled = conf.getBoolean(
          "mapred.fairscheduler.preemption", false);
      onlyLogPreemption = conf.getBoolean(
          "mapred.fairscheduler.preemption.only.log", false);
      long defaultDelay = conf.getLong(
          "mapred.fairscheduler.locality.delay", -1);
      nodeLocalityDelay = conf.getLong(
          "mapred.fairscheduler.locality.delay.node", defaultDelay);
      rackLocalityDelay = conf.getLong(
          "mapred.fairscheduler.locality.delay.rack", defaultDelay);
      allowUndeclaredPools = conf.getBoolean(ALLOW_UNDECLARED_POOLS_KEY, true);
      if (defaultDelay == -1 &&
          (nodeLocalityDelay == -1 || rackLocalityDelay == -1)) {
        autoComputeLocalityDelay = true; // Compute from heartbeat interval
      }
      initialized = true;
      running = true;
      lastUpdateTime = clock.getTime();
      // Start a thread to update deficits every UPDATE_INTERVAL
      if (!mockMode) {
        new UpdateThread().start();
      }
      // Register servlet with JobTracker's Jetty server
      if (taskTrackerManager instanceof JobTracker) {
        JobTracker jobTracker = (JobTracker) taskTrackerManager;
        HttpServer infoServer = jobTracker.infoServer;
        infoServer.setAttribute("scheduler", this);
        infoServer.addServlet("scheduler", "/scheduler",
            FairSchedulerServlet.class);
      }
      initMetrics();
      eventLog.log("INITIALIZED");
    } catch (Exception e) {
      // Can't load one of the managers - crash the JobTracker now while it is
      // starting up so that the user notices.
throw new RuntimeException("Failed to start FairScheduler", e); } LOG.info("Successfully configured FairScheduler"); } private MetricsUpdater metricsUpdater; // responsible for pushing hadoop metrics /** * Returns the LoadManager object used by the Fair Share scheduler */ LoadManager getLoadManager() { return loadMgr; } /** * Register metrics for the fair scheduler, and start a thread * to update them periodically. */ private void initMetrics() { MetricsContext context = MetricsUtil.getContext("fairscheduler"); metricsUpdater = new MetricsUpdater(); context.registerUpdater(metricsUpdater); } @Override public void terminate() throws IOException { if (eventLog != null) eventLog.log("SHUTDOWN"); running = false; jobInitializer.terminate(); if (jobListener != null) taskTrackerManager.removeJobInProgressListener(jobListener); if (eventLog != null) eventLog.shutdown(); if (metricsUpdater != null) { MetricsContext context = MetricsUtil.getContext("fairscheduler"); context.unregisterUpdater(metricsUpdater); metricsUpdater = null; } } private class JobInitializer { private final int DEFAULT_NUM_THREADS = 1; private ThreadPoolExecutor threadPool; private TaskTrackerManager ttm; public JobInitializer(Configuration conf, TaskTrackerManager ttm) { int numThreads = conf.getInt("mapred.jobinit.threads", DEFAULT_NUM_THREADS); threadPool = new ThreadPoolExecutor(numThreads, numThreads, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<Runnable>()); // Pre-starting all threads to ensure the threads are executed as JobTracker // instead of the user submitted the job, otherwise job initialization fails // when security is enabled if (threadPool.prestartAllCoreThreads() != numThreads) { throw new RuntimeException("Failed to pre-start threads in JobInitializer"); } this.ttm = ttm; } public void initJob(JobInfo jobInfo, JobInProgress job) { if (!mockMode) { threadPool.execute(new InitJob(jobInfo, job)); } else { new InitJob(jobInfo, job).run(); } } class InitJob implements Runnable { private JobInfo jobInfo; private JobInProgress job; public InitJob(JobInfo jobInfo, JobInProgress job) { this.jobInfo = jobInfo; this.job = job; } public void run() { ttm.initJob(job); } } void terminate() { LOG.info("Shutting down thread pool"); threadPool.shutdownNow(); try { threadPool.awaitTermination(1, TimeUnit.MINUTES); } catch (InterruptedException e) { // Ignore, we are in shutdown anyway. } } } /** * Used to listen for jobs added/removed by our {@link TaskTrackerManager}. 
   */
  private class JobListener extends JobInProgressListener {
    @Override
    public void jobAdded(JobInProgress job) {
      synchronized (FairScheduler.this) {
        eventLog.log("JOB_ADDED", job.getJobID());
        JobSchedulable mapSched = ReflectionUtils.newInstance(
            conf.getClass("mapred.jobtracker.jobSchedulable",
                JobSchedulable.class, JobSchedulable.class), conf);
        mapSched.init(FairScheduler.this, job, TaskType.MAP);

        JobSchedulable redSched = ReflectionUtils.newInstance(
            conf.getClass("mapred.jobtracker.jobSchedulable",
                JobSchedulable.class, JobSchedulable.class), conf);
        redSched.init(FairScheduler.this, job, TaskType.REDUCE);

        JobInfo info = new JobInfo(mapSched, redSched);
        infos.put(job, info);
        poolMgr.addJob(job); // Also adds the job to the right PoolSchedulable
        update();
      }
    }

    @Override
    public void jobRemoved(JobInProgress job) {
      synchronized (FairScheduler.this) {
        eventLog.log("JOB_REMOVED", job.getJobID());
        jobNoLongerRunning(job);
      }
    }

    @Override
    public void jobUpdated(JobChangeEvent event) {
      eventLog.log("JOB_UPDATED", event.getJobInProgress().getJobID());
    }
  }

  /**
   * A thread which calls {@link FairScheduler#update()} every
   * <code>UPDATE_INTERVAL</code> milliseconds.
   */
  private class UpdateThread extends Thread {
    private UpdateThread() {
      super("FairScheduler update thread");
    }

    public void run() {
      while (running) {
        try {
          Thread.sleep(updateInterval);
          update();
          dumpIfNecessary();
          preemptTasksIfNecessary();
        } catch (Exception e) {
          LOG.error("Exception in fair scheduler UpdateThread", e);
        }
      }
    }
  }

  /**
   * Responsible for updating metrics when the metrics context requests it.
   */
  private class MetricsUpdater implements Updater {
    @Override
    public void doUpdates(MetricsContext context) {
      updateMetrics();
    }
  }

  synchronized void updateMetrics() {
    poolMgr.updateMetrics();
  }

  @Override
  public synchronized List<Task> assignTasks(TaskTracker tracker)
      throws IOException {
    if (!initialized) // Don't try to assign tasks if we haven't yet started up
      return null;
    String trackerName = tracker.getTrackerName();
    eventLog.log("HEARTBEAT", trackerName);
    long currentTime = clock.getTime();

    // Compute total runnable maps and reduces, and currently running ones
    int runnableMaps = 0;
    int runningMaps = 0;
    int runnableReduces = 0;
    int runningReduces = 0;
    for (Pool pool: poolMgr.getPools()) {
      runnableMaps += pool.getMapSchedulable().getDemand();
      runningMaps += pool.getMapSchedulable().getRunningTasks();
      runnableReduces += pool.getReduceSchedulable().getDemand();
      runningReduces += pool.getReduceSchedulable().getRunningTasks();
    }

    ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
    // Compute total map/reduce slots
    // In the future we can precompute this if the Scheduler becomes a
    // listener of tracker join/leave events.
    int totalMapSlots = getTotalSlots(TaskType.MAP, clusterStatus);
    int totalReduceSlots = getTotalSlots(TaskType.REDUCE, clusterStatus);

    eventLog.log("RUNNABLE_TASKS",
        runnableMaps, runningMaps, runnableReduces, runningReduces);

    // Update time waited for local maps for jobs skipped on last heartbeat
    updateLocalityWaitTimes(currentTime);

    // Check for JT safe-mode
    if (taskTrackerManager.isInSafeMode()) {
      LOG.info("JobTracker is in safe-mode, not scheduling any tasks.");
      return null;
    }

    TaskTrackerStatus tts = tracker.getStatus();

    int mapsAssigned = 0;    // loop counter for maps in the while loop below
    int reducesAssigned = 0; // loop counter for reduces in the while loop below
    int mapCapacity = maxTasksToAssign(TaskType.MAP, tts);
    int reduceCapacity = maxTasksToAssign(TaskType.REDUCE, tts);
    boolean mapRejected = false;    // flag used for ending the loop
    boolean reduceRejected = false; // flag used for ending the loop

    // Keep track of which jobs were visited for map tasks and which had tasks
    // launched, so that we can later mark skipped jobs for delay scheduling
    Set<JobInProgress> visitedForMap = new HashSet<JobInProgress>();
    Set<JobInProgress> visitedForReduce = new HashSet<JobInProgress>();
    Set<JobInProgress> launchedMap = new HashSet<JobInProgress>();

    ArrayList<Task> tasks = new ArrayList<Task>();

    // Scan jobs to assign tasks until neither maps nor reduces can be assigned
    while (true) {
      // Compute the ending conditions for the loop.
      // Reject a task type if one of the following conditions holds:
      // 1. the number of assigned tasks reaches the per-heartbeat limit
      // 2. the number of running tasks reaches the number of runnable tasks
      // 3. the task is rejected by LoadManager.canAssign
      if (!mapRejected) {
        if (mapsAssigned == mapCapacity ||
            runningMaps == runnableMaps ||
            !loadMgr.canAssignMap(tts, runnableMaps,
                totalMapSlots, mapsAssigned)) {
          eventLog.log("INFO", "Can't assign another MAP to " + trackerName);
          mapRejected = true;
        }
      }
      if (!reduceRejected) {
        if (reducesAssigned == reduceCapacity ||
            runningReduces == runnableReduces ||
            !loadMgr.canAssignReduce(tts, runnableReduces,
                totalReduceSlots, reducesAssigned)) {
          eventLog.log("INFO", "Can't assign another REDUCE to " + trackerName);
          reduceRejected = true;
        }
      }
      // Exit the while (true) loop if
      // 1. neither maps nor reduces can be assigned, or
      // 2. assignMultiple is off and we already assigned one task
      if (mapRejected && reduceRejected ||
          !assignMultiple && tasks.size() > 0) {
        break; // This is the only exit of the while (true) loop
      }

      // Determine which task type to assign this time.
      // First try choosing a task type which is not rejected
      TaskType taskType;
      if (mapRejected) {
        taskType = TaskType.REDUCE;
      } else if (reduceRejected) {
        taskType = TaskType.MAP;
      } else {
        // If both types are available, choose the task type with fewer running
        // tasks on the task tracker to prevent that task type from starving
        if (tts.countMapTasks() + mapsAssigned <=
            tts.countReduceTasks() + reducesAssigned) {
          taskType = TaskType.MAP;
        } else {
          taskType = TaskType.REDUCE;
        }
      }

      // Get the map or reduce schedulables and sort them by fair sharing
      List<PoolSchedulable> scheds = getPoolSchedulables(taskType);
      Collections.sort(scheds, new SchedulingAlgorithms.FairShareComparator());
      boolean foundTask = false;
      for (Schedulable sched: scheds) { // This loop will assign only one task
        eventLog.log("INFO", "Checking for " + taskType +
            " task in " + sched.getName());
        Task task = taskType == TaskType.MAP ?
            sched.assignTask(tts, currentTime, visitedForMap) :
            sched.assignTask(tts, currentTime, visitedForReduce);
        if (task != null) {
          foundTask = true;
          JobInProgress job = taskTrackerManager.getJob(task.getJobID());
          eventLog.log("ASSIGN", trackerName, taskType,
              job.getJobID(), task.getTaskID());
          // Update running task counts, and the job's locality level
          if (taskType == TaskType.MAP) {
            launchedMap.add(job);
            mapsAssigned++;
            runningMaps++;
            updateLastMapLocalityLevel(job, task, tts);
          } else {
            reducesAssigned++;
            runningReduces++;
          }
          // Add task to the list of assignments
          tasks.add(task);
          break; // This break makes this loop assign only one task
        } // end if(task != null)
      } // end for(Schedulable sched: scheds)

      // Reject the task type if we cannot find a task
      if (!foundTask) {
        if (taskType == TaskType.MAP) {
          mapRejected = true;
        } else {
          reduceRejected = true;
        }
      }
    } // end while (true)

    // Mark any jobs that were visited for map tasks but did not launch a task
    // as skipped on this heartbeat
    for (JobInProgress job: visitedForMap) {
      if (!launchedMap.contains(job)) {
        infos.get(job).skippedAtLastHeartbeat = true;
      }
    }

    // If no tasks were found, return null
    return tasks.isEmpty() ? null : tasks;
  }

  /**
   * Get the maximum number of tasks to assign on a TaskTracker on a heartbeat.
   * The scheduler may launch fewer than this many tasks if the LoadManager
   * says not to launch more, but it will never launch more than this number.
   */
  protected int maxTasksToAssign(TaskType type, TaskTrackerStatus tts) {
    if (!assignMultiple)
      return 1;
    int cap = (type == TaskType.MAP) ? mapAssignCap : reduceAssignCap;
    int availableSlots = (type == TaskType.MAP) ?
        tts.getAvailableMapSlots() : tts.getAvailableReduceSlots();
    if (cap == -1) // Infinite cap; use the TaskTracker's slot count
      return availableSlots;
    else
      return Math.min(cap, availableSlots);
  }

  /**
   * Update locality wait times for jobs that were skipped at last heartbeat.
   */
  private void updateLocalityWaitTimes(long currentTime) {
    long timeSinceLastHeartbeat =
        (lastHeartbeatTime == 0 ? 0 : currentTime - lastHeartbeatTime);
    lastHeartbeatTime = currentTime;
    for (JobInfo info: infos.values()) {
      if (info.skippedAtLastHeartbeat) {
        info.timeWaitedForLocalMap += timeSinceLastHeartbeat;
        info.skippedAtLastHeartbeat = false;
      }
    }
  }

  /**
   * Update a job's locality level and locality wait variables given that it
   * has just launched a map task on a given task tracker.
   */
  private void updateLastMapLocalityLevel(JobInProgress job,
      Task mapTaskLaunched, TaskTrackerStatus tracker) {
    JobInfo info = infos.get(job);
    boolean isNodeGroupAware = conf.getBoolean(
        "net.topology.nodegroup.aware", false);
    LocalityLevel localityLevel = LocalityLevel.fromTask(
        job, mapTaskLaunched, tracker, isNodeGroupAware);
    info.lastMapLocalityLevel = localityLevel;
    info.timeWaitedForLocalMap = 0;
    eventLog.log("ASSIGNED_LOC_LEVEL", job.getJobID(), localityLevel);
  }

  /**
   * Get the maximum locality level at which a given job is allowed to
   * launch tasks, based on how long it has been waiting for local tasks.
   * This is used to implement the "delay scheduling" feature of the Fair
   * Scheduler for optimizing data locality.
   * If the job has no locality information (e.g. it does not use HDFS), this
   * method returns LocalityLevel.ANY, allowing tasks at any level.
   * Otherwise, the job can only launch tasks at its current locality level
   * or lower, unless it has waited at least nodeLocalityDelay or
   * rackLocalityDelay milliseconds, depending on the current level.
   * If it has waited (nodeLocalityDelay + rackLocalityDelay) milliseconds,
   * it can go to any level.
   */
  protected LocalityLevel getAllowedLocalityLevel(JobInProgress job,
      long currentTime) {
    JobInfo info = infos.get(job);
    if (info == null) { // Job not in infos (shouldn't happen)
      LOG.error("getAllowedLocalityLevel called on job " + job
          + ", which does not have a JobInfo in infos");
      return LocalityLevel.ANY;
    }
    if (job.nonLocalMaps.size() > 0) { // Job doesn't have locality information
      return LocalityLevel.ANY;
    }
    // Don't wait for locality if the job's pool is starving for maps
    Pool pool = poolMgr.getPool(job);
    PoolSchedulable sched = pool.getMapSchedulable();
    long minShareTimeout = poolMgr.getMinSharePreemptionTimeout(pool.getName());
    long fairShareTimeout = poolMgr.getFairSharePreemptionTimeout();
    if (currentTime - sched.getLastTimeAtMinShare() > minShareTimeout ||
        currentTime - sched.getLastTimeAtHalfFairShare() > fairShareTimeout) {
      eventLog.log("INFO", "No delay scheduling for " + job.getJobID()
          + " because it is being starved");
      return LocalityLevel.ANY;
    }
    // In the common case, compute locality level based on time waited
    switch(info.lastMapLocalityLevel) {
    case NODE: // Last task launched was node-local
      if (info.timeWaitedForLocalMap >= nodeLocalityDelay + rackLocalityDelay)
        return LocalityLevel.ANY;
      else if (info.timeWaitedForLocalMap >= nodeLocalityDelay)
        return LocalityLevel.RACK;
      else
        return LocalityLevel.NODE;
    case RACK: // Last task launched was rack-local
      if (info.timeWaitedForLocalMap >= rackLocalityDelay)
        return LocalityLevel.ANY;
      else
        return LocalityLevel.RACK;
    default: // Last task was non-local; can launch anywhere
      return LocalityLevel.ANY;
    }
  }

  /**
   * Recompute the internal variables used by the scheduler - per-job weights,
   * fair shares, deficits, minimum slot allocations, and numbers of running
   * and needed tasks of each type.
   */
  protected void update() {
    // Use more granular locking so that clusterStatus can be fetched
    // from the JobTracker without locking the scheduler.
    ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();

    // Recompute locality delay from the JobTracker heartbeat interval if
    // enabled. This will also lock the JT, so do it outside of a fair
    // scheduler lock.
    if (autoComputeLocalityDelay) {
      JobTracker jobTracker = (JobTracker) taskTrackerManager;
      nodeLocalityDelay = Math.min(MAX_AUTOCOMPUTED_LOCALITY_DELAY,
          (long) (1.5 * jobTracker.getNextHeartbeatInterval()));
      rackLocalityDelay = nodeLocalityDelay;
    }

    // We have the clusterStatus, so acquire the scheduler lock now.
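    // Everything below mutates per-job and per-pool scheduling state, so it
    // runs under the scheduler lock: finished jobs are removed, runnability
    // and demands are refreshed, fair shares are recomputed, and (if enabled)
    // the preemption bookkeeping is updated.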
    synchronized (this) {
      // Reload allocations file if it hasn't been loaded in a while
      poolMgr.reloadAllocsIfNecessary();

      // Remove any jobs that have stopped running
      List<JobInProgress> toRemove = new ArrayList<JobInProgress>();
      for (JobInProgress job: infos.keySet()) {
        int runState = job.getStatus().getRunState();
        if (runState == JobStatus.SUCCEEDED || runState == JobStatus.FAILED
            || runState == JobStatus.KILLED) {
          toRemove.add(job);
        }
      }
      for (JobInProgress job: toRemove) {
        jobNoLongerRunning(job);
      }

      updateRunnability(); // Set job runnability based on user/pool limits

      // Update demands of jobs and pools
      for (Pool pool: poolMgr.getPools()) {
        pool.getMapSchedulable().updateDemand();
        pool.getReduceSchedulable().updateDemand();
      }

      // Compute fair shares based on updated demands
      List<PoolSchedulable> mapScheds = getPoolSchedulables(TaskType.MAP);
      List<PoolSchedulable> reduceScheds = getPoolSchedulables(TaskType.REDUCE);
      SchedulingAlgorithms.computeFairShares(
          mapScheds, clusterStatus.getMaxMapTasks());
      SchedulingAlgorithms.computeFairShares(
          reduceScheds, clusterStatus.getMaxReduceTasks());

      // Use the computed shares to assign shares within each pool
      for (Pool pool: poolMgr.getPools()) {
        pool.getMapSchedulable().redistributeShare();
        pool.getReduceSchedulable().redistributeShare();
      }

      if (preemptionEnabled)
        updatePreemptionVariables();
    }
  }

  private void jobNoLongerRunning(JobInProgress job) {
    assert Thread.holdsLock(this);
    JobInfo info = infos.remove(job);
    if (info != null) {
      info.mapSchedulable.cleanupMetrics();
      info.reduceSchedulable.cleanupMetrics();
    }
    poolMgr.removeJob(job);
  }

  public List<PoolSchedulable> getPoolSchedulables(TaskType type) {
    List<PoolSchedulable> scheds = new ArrayList<PoolSchedulable>();
    for (Pool pool: poolMgr.getPools()) {
      scheds.add(pool.getSchedulable(type));
    }
    return scheds;
  }

  private void updateRunnability() {
    // Start by marking everything as not runnable
    for (JobInfo info: infos.values()) {
      info.runnable = false;
    }
    // Create a list of sorted jobs in order of start time and priority
    List<JobInProgress> jobs = new ArrayList<JobInProgress>(infos.keySet());
    Collections.sort(jobs, new FifoJobComparator());
    // Mark jobs as runnable in order of start time and priority, until
    // user or pool limits have been reached.
    Map<String, Integer> userJobs = new HashMap<String, Integer>();
    Map<String, Integer> poolJobs = new HashMap<String, Integer>();
    for (JobInProgress job: jobs) {
      String user = job.getJobConf().getUser();
      String pool = poolMgr.getPoolName(job);
      int userCount = userJobs.containsKey(user) ? userJobs.get(user) : 0;
      int poolCount = poolJobs.containsKey(pool) ? poolJobs.get(pool) : 0;
      if (userCount < poolMgr.getUserMaxJobs(user) &&
          poolCount < poolMgr.getPoolMaxJobs(pool)) {
        if (job.getStatus().getRunState() == JobStatus.RUNNING ||
            job.getStatus().getRunState() == JobStatus.PREP) {
          userJobs.put(user, userCount + 1);
          poolJobs.put(pool, poolCount + 1);
          JobInfo jobInfo = infos.get(job);
          if (job.getStatus().getRunState() == JobStatus.RUNNING) {
            jobInfo.runnable = true;
          } else {
            // The job is in the PREP state. Give it to the job initializer
            // for initialization if we have not already done it.
            if (jobInfo.needsInitializing) {
              jobInfo.needsInitializing = false;
              jobInitializer.initJob(jobInfo, job);
            }
          }
        }
      }
    }
  }

  public double getJobWeight(JobInProgress job, TaskType taskType) {
    if (!isRunnable(job)) {
      // Job won't launch tasks, but don't return 0 to avoid division errors
      return 1.0;
    } else {
      double weight = 1.0;
      if (sizeBasedWeight) {
        // Set weight based on runnable tasks
        JobInfo info = infos.get(job);
        int runnableTasks = (taskType == TaskType.MAP) ?
            info.mapSchedulable.getDemand() :
            info.reduceSchedulable.getDemand();
        weight = Math.log1p(runnableTasks) / Math.log(2);
      }
      weight *= getPriorityFactor(job.getPriority());
      if (weightAdjuster != null) {
        // Run weight through the user-supplied weightAdjuster
        weight = weightAdjuster.adjustWeight(job, taskType, weight);
      }
      return weight;
    }
  }

  private double getPriorityFactor(JobPriority priority) {
    switch (priority) {
    case VERY_HIGH: return 4.0;
    case HIGH:      return 2.0;
    case NORMAL:    return 1.0;
    case LOW:       return 0.5;
    default:        return 0.25; // priority = VERY_LOW
    }
  }

  public PoolManager getPoolManager() {
    return poolMgr;
  }

  private int getTotalSlots(TaskType type, ClusterStatus clusterStatus) {
    return (type == TaskType.MAP ?
        clusterStatus.getMaxMapTasks() : clusterStatus.getMaxReduceTasks());
  }

  /**
   * Update the preemption fields for all PoolSchedulables, i.e. the times
   * since each pool was last at its guaranteed share and at > 1/2 of its
   * fair share for each type of task.
   */
  private void updatePreemptionVariables() {
    long now = clock.getTime();
    lastPreemptionUpdateTime = now;
    for (TaskType type: MAP_AND_REDUCE) {
      for (PoolSchedulable sched: getPoolSchedulables(type)) {
        if (!isStarvedForMinShare(sched)) {
          sched.setLastTimeAtMinShare(now);
        }
        if (!isStarvedForFairShare(sched)) {
          sched.setLastTimeAtHalfFairShare(now);
        }
        eventLog.log("PREEMPT_VARS", sched.getName(), type,
            now - sched.getLastTimeAtMinShare(),
            now - sched.getLastTimeAtHalfFairShare());
      }
    }
  }

  /**
   * Is a pool below its min share for the given task type?
   */
  boolean isStarvedForMinShare(PoolSchedulable sched) {
    int desiredShare = Math.min(sched.getMinShare(), sched.getDemand());
    return (sched.getRunningTasks() < desiredShare);
  }

  /**
   * Is a pool being starved for fair share for the given task type?
   * This is defined as being below half its fair share.
   */
  boolean isStarvedForFairShare(PoolSchedulable sched) {
    int desiredFairShare = (int) Math.floor(Math.min(
        sched.getFairShare() / 2, sched.getDemand()));
    return (sched.getRunningTasks() < desiredFairShare);
  }

  /**
   * Check for pools that need tasks preempted, either because they have been
   * below their guaranteed share for minSharePreemptionTimeout or they
   * have been below half their fair share for the fairSharePreemptionTimeout.
   * If such pools exist, compute how many tasks of each type need to be
   * preempted and then select the right ones using preemptTasks.
   *
   * This method computes and logs the number of tasks we want to preempt even
   * if preemption is disabled, for debugging purposes.
   */
  protected void preemptTasksIfNecessary() {
    if (!preemptionEnabled)
      return;

    long curTime = clock.getTime();
    if (curTime - lastPreemptCheckTime < preemptionInterval)
      return;
    lastPreemptCheckTime = curTime;

    // Acquire locks on both the JobTracker (task tracker manager) and this
    // because we might need to call some JobTracker methods (killTask).
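    // Lock-ordering note: the JobTracker lock is taken before the scheduler
    // lock, mirroring the heartbeat path where the JobTracker calls into the
    // synchronized assignTasks(); keeping the order consistent avoids
    // deadlock between preemption and task assignment.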
    synchronized (taskTrackerManager) {
      synchronized (this) {
        for (TaskType type: MAP_AND_REDUCE) {
          List<PoolSchedulable> scheds = getPoolSchedulables(type);
          int tasksToPreempt = 0;
          for (PoolSchedulable sched: scheds) {
            tasksToPreempt += tasksToPreempt(sched, curTime);
          }
          if (tasksToPreempt > 0) {
            eventLog.log("SHOULD_PREEMPT", type, tasksToPreempt);
            if (!onlyLogPreemption) {
              preemptTasks(scheds, tasksToPreempt);
            }
          }
        }
      }
    }
  }

  /**
   * Preempt a given number of tasks from a list of PoolSchedulables.
   * The policy for this is to pick tasks from pools that are over their fair
   * share, but make sure that no pool is placed below its fair share in the
   * process. Furthermore, we want to minimize the amount of computation
   * wasted by preemption, so out of the tasks in over-scheduled pools, we
   * prefer to preempt tasks that started most recently.
   */
  private void preemptTasks(List<PoolSchedulable> scheds, int tasksToPreempt) {
    if (scheds.isEmpty() || tasksToPreempt == 0)
      return;

    TaskType taskType = scheds.get(0).getTaskType();

    // Collect running tasks of our type from over-scheduled pools
    List<TaskStatus> runningTasks = new ArrayList<TaskStatus>();
    for (PoolSchedulable sched: scheds) {
      if (sched.getRunningTasks() > sched.getFairShare())
        for (JobSchedulable js: sched.getJobSchedulables()) {
          runningTasks.addAll(getRunningTasks(js.getJob(), taskType));
        }
    }

    // Sort tasks into reverse order of start time
    Collections.sort(runningTasks, new Comparator<TaskStatus>() {
      public int compare(TaskStatus t1, TaskStatus t2) {
        if (t1.getStartTime() < t2.getStartTime())
          return 1;
        else if (t1.getStartTime() == t2.getStartTime())
          return 0;
        else
          return -1;
      }
    });

    // Maintain a count of tasks left in each pool; this is a bit
    // faster than calling runningTasks() on the pool repeatedly
    // because the latter must scan through jobs in the pool
    HashMap<Pool, Integer> tasksLeft = new HashMap<Pool, Integer>();
    for (Pool p: poolMgr.getPools()) {
      tasksLeft.put(p, p.getSchedulable(taskType).getRunningTasks());
    }

    // Scan down the sorted list of task statuses until we've killed enough
    // tasks, making sure we don't kill too many from any pool
    for (TaskStatus status: runningTasks) {
      JobID jobID = status.getTaskID().getJobID();
      JobInProgress job = taskTrackerManager.getJob(jobID);
      Pool pool = poolMgr.getPool(job);
      PoolSchedulable sched = pool.getSchedulable(taskType);
      int tasksLeftForPool = tasksLeft.get(pool);
      if (tasksLeftForPool > sched.getFairShare()) {
        eventLog.log("PREEMPT", status.getTaskID(), status.getTaskTracker());
        try {
          taskTrackerManager.killTask(status.getTaskID(), false);
          tasksToPreempt--;
          if (tasksToPreempt == 0)
            break;
          // reduce tasks left for pool
          tasksLeft.put(pool, --tasksLeftForPool);
        } catch (IOException e) {
          LOG.error("Failed to kill task " + status.getTaskID(), e);
        }
      }
    }
  }

  /**
   * Count how many tasks of a given type the pool needs to preempt, if any.
   * If the pool has been below its min share for at least its preemption
   * timeout, it should preempt the difference between its current share and
   * this min share. If it has been below half its fair share for at least the
   * fairSharePreemptionTimeout, it should preempt enough tasks to get up to
   * its full fair share. If both conditions hold, we preempt the max of the
   * two amounts (this shouldn't happen unless someone sets the timeouts to
   * be identical for some reason).
   */
  protected int tasksToPreempt(PoolSchedulable sched, long curTime) {
    String pool = sched.getName();
    long minShareTimeout = poolMgr.getMinSharePreemptionTimeout(pool);
    long fairShareTimeout = poolMgr.getFairSharePreemptionTimeout();
    int tasksDueToMinShare = 0;
    int tasksDueToFairShare = 0;
    if (curTime - sched.getLastTimeAtMinShare() > minShareTimeout) {
      int target = Math.min(sched.getMinShare(), sched.getDemand());
      tasksDueToMinShare = Math.max(0, target - sched.getRunningTasks());
    }
    if (curTime - sched.getLastTimeAtHalfFairShare() > fairShareTimeout) {
      int target = (int) Math.min(sched.getFairShare(), sched.getDemand());
      tasksDueToFairShare = Math.max(0, target - sched.getRunningTasks());
    }
    int tasksToPreempt = Math.max(tasksDueToMinShare, tasksDueToFairShare);
    if (tasksToPreempt > 0) {
      String message = "Should preempt " + tasksToPreempt + " "
          + sched.getTaskType() + " tasks for pool " + sched.getName()
          + ": tasksDueToMinShare = " + tasksDueToMinShare
          + ", tasksDueToFairShare = " + tasksDueToFairShare;
      eventLog.log("INFO", message);
      LOG.info(message);
    }
    return tasksToPreempt;
  }

  private List<TaskStatus> getRunningTasks(JobInProgress job, TaskType type) {
    // Create a list of all running TaskInProgress'es in the job
    Set<TaskInProgress> tips = new HashSet<TaskInProgress>();
    if (type == TaskType.MAP) {
      // Jobs may have both "non-local maps" which have a split with no
      // locality info (e.g. the input file is not in HDFS), and maps with
      // locality info, which are stored in the runningMapCache map from
      // location to task list
      tips.addAll(job.nonLocalRunningMaps);
      for (Set<TaskInProgress> set: job.runningMapCache.values()) {
        tips.addAll(set);
      }
    } else {
      tips.addAll(job.runningReduces);
    }
    // Get the active TaskStatus'es for each TaskInProgress (there may be
    // more than one if the task has multiple copies active due to speculation)
    List<TaskStatus> statuses = new ArrayList<TaskStatus>();
    for (TaskInProgress tip: tips) {
      for (TaskAttemptID id: tip.getActiveTasks().keySet()) {
        TaskStatus stat = tip.getTaskStatus(id);
        // status is null when the task has been scheduled but not yet running
        if (stat != null) {
          statuses.add(stat);
        }
      }
    }
    return statuses;
  }

  protected boolean isRunnable(JobInProgress job) {
    JobInfo info = infos.get(job);
    if (info == null)
      return false;
    return info.runnable;
  }

  @Override
  public synchronized Collection<JobInProgress> getJobs(String queueName) {
    if (queueName == null) {
      return null;
    }
    Pool myJobPool = poolMgr.getPool(queueName);
    return myJobPool.getJobs();
  }

  protected void dumpIfNecessary() {
    long now = clock.getTime();
    long timeDelta = now - lastDumpTime;
    if (timeDelta > dumpInterval && eventLog.isEnabled()) {
      dump();
      lastDumpTime = now;
    }
  }

  /**
   * Dump scheduler state to the fairscheduler log.
   */
  private synchronized void dump() {
    synchronized (eventLog) {
      eventLog.log("BEGIN_DUMP");
      // List jobs in order of submit time
      ArrayList<JobInProgress> jobs =
          new ArrayList<JobInProgress>(infos.keySet());
      Collections.sort(jobs, new Comparator<JobInProgress>() {
        public int compare(JobInProgress j1, JobInProgress j2) {
          return (int) Math.signum(j1.getStartTime() - j2.getStartTime());
        }
      });
      // Dump info for each job
      for (JobInProgress job: jobs) {
        JobProfile profile = job.getProfile();
        JobInfo info = infos.get(job);
        Schedulable ms = info.mapSchedulable;
        Schedulable rs = info.reduceSchedulable;
        eventLog.log("JOB", profile.getJobID(), profile.name, profile.user,
            job.getPriority(), poolMgr.getPoolName(job),
            job.numMapTasks, ms.getRunningTasks(),
            ms.getDemand(), ms.getFairShare(), ms.getWeight(),
            job.numReduceTasks, rs.getRunningTasks(),
            rs.getDemand(), rs.getFairShare(), rs.getWeight());
      }
      // List pools in alphabetical order
      List<Pool> pools = new ArrayList<Pool>(poolMgr.getPools());
      Collections.sort(pools, new Comparator<Pool>() {
        public int compare(Pool p1, Pool p2) {
          if (p1.isDefaultPool())
            return 1;
          else if (p2.isDefaultPool())
            return -1;
          else
            return p1.getName().compareTo(p2.getName());
        }});
      // Dump info for each pool
      for (Pool pool: pools) {
        int runningMaps = pool.getMapSchedulable().getRunningTasks();
        int runningReduces = pool.getReduceSchedulable().getRunningTasks();
        String name = pool.getName();
        eventLog.log("POOL", name, poolMgr.getPoolWeight(name),
            pool.getJobs().size(),
            poolMgr.getAllocation(name, TaskType.MAP), runningMaps,
            poolMgr.getAllocation(name, TaskType.REDUCE), runningReduces);
      }
      eventLog.log("END_DUMP");
    }
  }

  public Clock getClock() {
    return clock;
  }

  public FairSchedulerEventLog getEventLog() {
    return eventLog;
  }

  public JobInfo getJobInfo(JobInProgress job) {
    return infos.get(job);
  }

  boolean isPreemptionEnabled() {
    return preemptionEnabled;
  }

  long getLastPreemptionUpdateTime() {
    return lastPreemptionUpdateTime;
  }

  /**
   * Examines the job's pool name to determine if it is a declared pool name
   * (in the scheduler allocation file).
   */
  @Override
  public void checkJobSubmission(JobInProgress job)
      throws UndeclaredPoolException {
    Set<String> declaredPools = poolMgr.getDeclaredPools();
    if (!this.allowUndeclaredPools
        && !declaredPools.contains(poolMgr.getPoolName(job)))
      throw new UndeclaredPoolException("Pool name: '"
          + poolMgr.getPoolName(job)
          + "' is invalid. Add pool name to the fair scheduler allocation "
          + "file. Valid pools are: "
          + StringUtils.join(", ", declaredPools));
  }
}