/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RPC.Server;
import org.apache.hadoop.mapred.JobInProgress.DataStatistics;
import org.apache.hadoop.mapred.protocal.FairSchedulerProtocol;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.util.ReflectionUtils;
/**
* A {@link TaskScheduler} that implements fair sharing.
*/
public class FairScheduler extends TaskScheduler
implements FairSchedulerProtocol {
/** How often fair shares are re-calculated */
public static long updateInterval = 500;
public static final Log LOG = LogFactory.getLog(
"org.apache.hadoop.mapred.FairScheduler");
// Maximum locality delay when auto-computing locality delays
private static final long MAX_AUTOCOMPUTED_LOCALITY_DELAY = 15000;
private static final double FIFO_WEIGHT_DECAY_FACTOR = 0.5;
private long dumpStatusPeriod = 300000; // 5 minutes
private long lastDumpStatusTime = 0L;
protected int mapPerHeartBeat = 1;
protected int reducePerHeartBeat = 1;
protected PoolManager poolMgr;
protected LoadManager loadMgr;
protected TaskSelector taskSelector;
protected WeightAdjuster weightAdjuster; // Can be null for no weight adjuster
protected Map<JobInProgress, JobInfo> infos = // per-job scheduling variables
new HashMap<JobInProgress, JobInfo>();
protected JobInfoSummary infosummary = new JobInfoSummary();
protected LinkedList<JobInProgress> sortedJobsByMapNeed, sortedJobsByReduceNeed;
protected Comparator<JobInProgress> mapComparator, reduceComparator;
protected long lastUpdateTime; // Time when we last updated infos
protected boolean initialized; // Are we initialized?
protected volatile boolean running; // Are we running?
protected JobComparator jobComparator; // How to sort the jobs
protected boolean assignMultiple; // Simultaneously assign map and reduce?
protected boolean sizeBasedWeight; // Give larger weights to larger jobs
protected boolean waitForMapsBeforeLaunchingReduces = true;
private Clock clock;
private boolean runBackgroundUpdates; // Can be set to false for testing
private JobListener jobListener;
private JobInitializer jobInitializer;
protected long lastHeartbeatTime; // Time we last ran assignTasks
protected long localityDelayNodeLocal; // Time to wait for node locality
protected long localityDelayRackLocal; // Time to wait for rack locality
protected boolean autoComputeLocalityDelay = false; // Compute locality delay
// from heartbeat interval
private Thread updateThread;
protected LocalityLevelManager localManager = null;
// Helper that determines the locality level of launched tasks
// How often tasks are preempted (must be longer than a couple
// of heartbeats to give task-kill commands a chance to act).
protected long preemptionInterval = 15000;
protected boolean preemptionEnabled;
private long lastPreemptCheckTime; // Time we last ran preemptTasksIfNecessary
// Used to iterate through map and reduce task types
private static final TaskType[] MAP_AND_REDUCE =
new TaskType[] {TaskType.MAP, TaskType.REDUCE};
// Default parameters for RPC
public static final int DEFAULT_PORT = 50083;
/** RPC server */
Server server = null;
private FairSchedulerMetricsInst fairSchedulerMetrics = null;
/**
* Class holding summary computations over all JobInfo objects
*/
static class JobInfoSummary {
int totalRunningMaps = 0; // sum over all infos.runningMaps
int totalRunningReduces = 0; // sum over all infos.runningReduces
int totalNeededMaps = 0; // sum over all infos.neededMaps
int totalNeededReduces = 0; // sum over all infos.neededReduces
public void reset() {
totalRunningMaps = 0;
totalRunningReduces = 0;
totalNeededMaps = 0;
totalNeededReduces = 0;
}
}
/**
* A class for holding per-job scheduler variables. These always contain the
* values of the variables at the last update(), and are used along with a
* time delta to update the map and reduce deficits before a new update().
*/
static class JobInfo {
boolean runnable = false; // Can the job run given user/pool limits?
// Does this job need to be initialized?
volatile boolean needsInitializing = true;
String poolName = ""; // The pool this job belongs to
double mapWeight = 0; // Weight of job in calculation of map share
double reduceWeight = 0; // Weight of job in calculation of reduce share
long mapDeficit = 0; // Time deficit for maps
long reduceDeficit = 0; // Time deficit for reduces
int totalInitedTasks = 0; // Total initialized tasks
int runningMaps = 0; // Maps running at last update
int runningReduces = 0; // Reduces running at last update
int neededMaps; // Maps needed at last update
int neededReduces; // Reduces needed at last update
int minMaps = 0; // Minimum maps as guaranteed by pool
int minReduces = 0; // Minimum reduces as guaranteed by pool
int maxMaps = 0; // Maximum maps allowed to run
int maxReduces = 0; // Maximum reduces allowed to run
double mapFairShare = 0; // Fair share of map slots at last update
double reduceFairShare = 0; // Fair share of reduce slots at last update
int neededSpeculativeMaps; // Speculative maps needed at last update
int neededSpeculativeReduces; // Speculative reduces needed at last update
// Variables used for delay scheduling
LocalityLevel lastMapLocalityLevel = LocalityLevel.NODE;
// Locality level of last map launched
long timeWaitedForLocalMap; // Time waiting for local map since last map
boolean skippedAtLastHeartbeat; // Was job skipped at previous assignTasks?
// (used to update timeWaitedForLocalMap)
// Variables used for preemption
long lastTimeAtMapMinShare; // When was the job last at its min maps?
long lastTimeAtReduceMinShare; // Similar for reduces.
long lastTimeAtMapHalfFairShare; // When was the job last at half fair maps?
long lastTimeAtReduceHalfFairShare; // Similar for reduces.
public JobInfo(long currentTime) {
lastTimeAtMapMinShare = currentTime;
lastTimeAtReduceMinShare = currentTime;
lastTimeAtMapHalfFairShare = currentTime;
lastTimeAtReduceHalfFairShare = currentTime;
}
}
/**
* A helper class that determines the locality level of tasks.
*/
static class LocalityLevelManager {
/**
* Obtain LocalityLevel of a task from its job and tasktracker.
*/
public LocalityLevel taskToLocalityLevel(JobInProgress job,
Task mapTask, TaskTrackerStatus tracker) {
TaskInProgress tip = getTaskInProgress(job, mapTask);
switch (job.getLocalityLevel(tip, tracker)) {
case 0: return LocalityLevel.NODE;
case 1: return LocalityLevel.RACK;
default: return LocalityLevel.ANY;
}
}
private TaskInProgress getTaskInProgress(JobInProgress job, Task mapTask) {
if (!job.inited()) {
return null;
}
TaskID tipId = mapTask.getTaskID().getTaskID();
for (int i = 0; i < job.maps.length; i++) {
if (tipId.equals(job.maps[i].getTIPId())) {
return job.maps[i];
}
}
return null;
}
}
/**
* Represents the level of data-locality at which a job in the fair scheduler
* is allowed to launch tasks. By default, jobs are not allowed to launch
* non-data-local tasks until they have waited a small number of seconds to
* find a slot on a node that they have data on. If a job has waited this
* long, it is allowed to launch rack-local tasks as well (on nodes that may
* not have the task's input data, but share a rack with a node that does).
* Finally, after a further wait, jobs are allowed to launch tasks anywhere
* in the cluster.
*/
public enum LocalityLevel {
NODE (1),
RACK (2),
ANY (Integer.MAX_VALUE);
private final int cacheLevelCap;
LocalityLevel(int cacheLevelCap) {
this.cacheLevelCap = cacheLevelCap;
}
/**
* Obtain a JobInProgress cache level cap to pass to
* {@link JobInProgress#obtainNewMapTask(TaskTrackerStatus, int, int, int)}
* to ensure that only tasks of this locality level and lower are launched.
*/
public int getCacheLevelCap() {
return cacheLevelCap;
}
}
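// Illustration (hypothetical scenario): a job whose last map was node-local
// and that has not yet waited out its node-local delay stays at
// LocalityLevel.NODE, so obtainNewMapTask() is called with a cache level cap
// of 1 and only node-local tasks are launched; once the job is promoted to
// RACK the cap becomes 2, and at ANY the cap is effectively unlimited.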
/**
* A clock class - can be mocked out for testing.
*/
static class Clock {
long getTime() {
return System.currentTimeMillis();
}
}
public FairScheduler() {
this(new Clock(), true, new LocalityLevelManager());
}
/**
* Constructor used for tests, which can change the clock, disable updates
* and change locality.
*/
protected FairScheduler(Clock clock, boolean runBackgroundUpdates,
LocalityLevelManager localManager) {
this.clock = clock;
this.runBackgroundUpdates = runBackgroundUpdates;
this.jobListener = new JobListener();
this.localManager = localManager;
}
@Override
public void start() {
try {
Configuration conf = getConf();
jobInitializer = new JobInitializer(conf, taskTrackerManager);
taskTrackerManager.addJobInProgressListener(jobListener);
poolMgr = new PoolManager(conf);
loadMgr = (LoadManager) ReflectionUtils.newInstance(
conf.getClass("mapred.fairscheduler.loadmanager",
CapBasedLoadManager.class, LoadManager.class), conf);
loadMgr.setTaskTrackerManager(taskTrackerManager);
loadMgr.start();
taskSelector = (TaskSelector) ReflectionUtils.newInstance(
conf.getClass("mapred.fairscheduler.taskselector",
DefaultTaskSelector.class, TaskSelector.class), conf);
taskSelector.setTaskTrackerManager(taskTrackerManager);
taskSelector.start();
Class<?> weightAdjClass = conf.getClass(
"mapred.fairscheduler.weightadjuster", null);
if (weightAdjClass != null) {
weightAdjuster = (WeightAdjuster) ReflectionUtils.newInstance(
weightAdjClass, conf);
}
updateInterval = conf.getLong(
"mapred.fairscheduler.update.interval", updateInterval);
preemptionInterval = conf.getLong(
"mapred.fairscheduler.preemption.interval", preemptionInterval);
assignMultiple = conf.getBoolean(
"mapred.fairscheduler.assignmultiple", false);
sizeBasedWeight = conf.getBoolean(
"mapred.fairscheduler.sizebasedweight", false);
preemptionEnabled = conf.getBoolean(
"mapred.fairscheduler.preemption", false);
mapPerHeartBeat =
conf.getInt("mapred.fairscheduler.mapsperheartbeat", 1);
reducePerHeartBeat =
conf.getInt("mapred.fairscheduler.reducesperheartbeat", 1);
jobComparator = JobComparator.valueOf(
conf.get("mapred.fairscheduler.jobcomparator",
JobComparator.DEFICIT.toString()));
long defaultDelay = conf.getLong(
"mapred.fairscheduler.locality.delay", -1);
localityDelayNodeLocal = conf.getLong(
"mapred.fairscheduler.locality.delay.nodelocal", defaultDelay);
localityDelayRackLocal = conf.getLong(
"mapred.fairscheduler.locality.delay.racklocal", defaultDelay);
dumpStatusPeriod = conf.getLong(
"mapred.fairscheduler.dump.status.period", dumpStatusPeriod);
if (defaultDelay == -1 &&
(localityDelayNodeLocal == -1 || localityDelayRackLocal == -1)) {
autoComputeLocalityDelay = true; // Compute from heartbeat interval
}
initialized = true;
running = true;
lastUpdateTime = clock.getTime();
// Start a thread to update deficits every updateInterval
if (runBackgroundUpdates) {
updateThread = new UpdateThread();
updateThread.start();
}
// Register servlet with JobTracker's Jetty server
if (taskTrackerManager instanceof JobTracker) {
JobTracker jobTracker = (JobTracker) taskTrackerManager;
HttpServer infoServer = jobTracker.infoServer;
infoServer.setAttribute("scheduler", this);
infoServer.addServlet("scheduler", "/scheduler",
FairSchedulerServlet.class);
fairSchedulerMetrics = new FairSchedulerMetricsInst(this, conf);
}
// Start RPC server
InetSocketAddress socAddr = FairScheduler.getAddress(conf);
server = RPC.getServer(
this, socAddr.getHostName(), socAddr.getPort(), conf);
LOG.info("FairScheduler RPC server started at " +
server.getListenerAddress());
server.start();
} catch (Exception e) {
// Can't load one of the managers - crash the JobTracker now while it is
// starting up so that the user notices.
throw new RuntimeException("Failed to start FairScheduler", e);
}
LOG.info("Successfully configured FairScheduler");
}
public static InetSocketAddress getAddress(Configuration conf) {
String nodeport = conf.get("mapred.fairscheduler.server.address");
if (nodeport == null) {
nodeport = "localhost:" + DEFAULT_PORT;
}
return NetUtils.createSocketAddr(nodeport);
}
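// For example (hypothetical host), setting mapred.fairscheduler.server.address
// to "jt.example.com:50083" makes the scheduler's RPC server bind there;
// when the property is unset, "localhost:" + DEFAULT_PORT (50083) is used.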
@Override
public void terminate() throws IOException {
running = false;
jobInitializer.terminate();
if (jobListener != null)
taskTrackerManager.removeJobInProgressListener(jobListener);
if (server != null)
server.stop();
}
private class JobInitializer {
private final int DEFAULT_NUM_THREADS = 1;
private ExecutorService threadPool;
private TaskTrackerManager ttm;
public JobInitializer(Configuration conf, TaskTrackerManager ttm) {
int numThreads = conf.getInt("mapred.jobinit.threads",
DEFAULT_NUM_THREADS);
threadPool = Executors.newFixedThreadPool(numThreads);
this.ttm = ttm;
}
public void initJob(JobInfo jobInfo, JobInProgress job) {
if (runBackgroundUpdates) {
threadPool.execute(new InitJob(jobInfo, job));
} else {
new InitJob(jobInfo, job).run();
}
}
class InitJob implements Runnable {
private JobInfo jobInfo;
private JobInProgress job;
public InitJob(JobInfo jobInfo, JobInProgress job) {
this.jobInfo = jobInfo;
this.job = job;
}
public void run() {
ttm.initJob(job);
}
}
void terminate() {
LOG.info("Shutting down thread pool");
threadPool.shutdownNow();
try {
threadPool.awaitTermination(1, TimeUnit.MINUTES);
} catch (InterruptedException e) {
// Ignore, we are in shutdown anyway.
}
}
}
/**
* Used to listen for jobs added/removed by our {@link TaskTrackerManager}.
*/
private class JobListener extends JobInProgressListener {
@Override
public void jobAdded(JobInProgress job) {
synchronized (FairScheduler.this) {
poolMgr.addJob(job);
JobInfo info = new JobInfo(clock.getTime());
info.poolName = poolMgr.getPoolName(job);
infos.put(job, info);
if (updateThread != null)
updateThread.interrupt();
else
update();
}
}
@Override
public void jobRemoved(JobInProgress job) {
synchronized (FairScheduler.this) {
poolMgr.removeJob(job);
infos.remove(job);
if(sortedJobsByMapNeed != null)
sortedJobsByMapNeed.remove(job);
if(sortedJobsByReduceNeed != null)
sortedJobsByReduceNeed.remove(job);
}
}
@Override
public void jobUpdated(JobChangeEvent event) {
}
}
/**
* A thread which calls {@link FairScheduler#update()} every
* <code>updateInterval</code> milliseconds.
*/
private class UpdateThread extends Thread {
private UpdateThread() {
super("FairScheduler update thread");
}
public void run() {
while (running) {
try {
try {
Thread.sleep(updateInterval);
} catch (InterruptedException e) {
// ignore
}
update();
preemptTasksIfNecessary();
} catch (Exception e) {
LOG.error("Exception in fair scheduler UpdateThread", e);
}
}
}
}
@Override
public synchronized List<Task> assignTasks(TaskTracker tracker)
throws IOException {
if (!initialized) // Don't try to assign tasks if we haven't yet started up
return null;
int totalRunnableMaps = infosummary.totalRunningMaps +
infosummary.totalNeededMaps;
int totalRunnableReduces = infosummary.totalRunningReduces +
infosummary.totalNeededReduces;
ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
// Compute total map/reduce slots
// In the future we can precompute this if the Scheduler becomes a
// listener of tracker join/leave events.
int totalMapSlots = getTotalSlots(TaskType.MAP, clusterStatus);
int totalReduceSlots = getTotalSlots(TaskType.REDUCE, clusterStatus);
if (LOG.isDebugEnabled()) {
LOG.debug("totalMapSlots:" + totalMapSlots +
" totalReduceSlots:" + totalReduceSlots);
}
// Scan to see whether any job needs to run a map, then a reduce
ArrayList<Task> tasks = new ArrayList<Task>();
long currentTime = clock.getTime();
// Update time waited for local maps for jobs skipped on last heartbeat
updateLocalityWaitTimes(currentTime);
TaskTrackerStatus trackerStatus = tracker.getStatus();
int runningMapsOnTT = occupiedSlotsAfterKill(trackerStatus, TaskType.MAP);
int runningReducesOnTT =
occupiedSlotsAfterKill(trackerStatus, TaskType.REDUCE);
int availableMapsOnTT = getAvailableSlots(trackerStatus, TaskType.MAP);
int availableReducesOnTT = getAvailableSlots(trackerStatus, TaskType.REDUCE);
if (LOG.isDebugEnabled()) {
LOG.debug("tracker:" + trackerStatus.getTrackerName() +
" runMaps:" + runningMapsOnTT +
" runReduces:" + runningReducesOnTT +
" availMaps:" + availableMapsOnTT +
" availReduces:" + availableReducesOnTT);
}
for (TaskType taskType: MAP_AND_REDUCE) {
boolean canAssign = (taskType == TaskType.MAP) ?
loadMgr.canAssignMap(trackerStatus, totalRunnableMaps,
totalMapSlots) :
loadMgr.canAssignReduce(trackerStatus, totalRunnableReduces,
totalReduceSlots);
boolean hasAvailableSlots =
(availableMapsOnTT > 0 && taskType == TaskType.MAP) ||
(availableReducesOnTT > 0 && taskType == TaskType.REDUCE);
if (LOG.isDebugEnabled()) {
LOG.debug("type:" + taskType +
" canAssign:" + canAssign +
" hasAvailableSlots:" + hasAvailableSlots);
}
if (!canAssign || !hasAvailableSlots) {
continue; // Go to the next task type
}
int numTasks = 0;
LinkedList<JobInProgress> candidates = (taskType == TaskType.MAP) ?
sortedJobsByMapNeed : sortedJobsByReduceNeed;
if (candidates == null) {
// There are no candidate jobs
// Only happens when the cluster is empty
break;
}
LinkedList<JobInProgress> jobsToReinsert = new LinkedList<JobInProgress>();
Iterator<JobInProgress> iterator = candidates.iterator();
while (iterator.hasNext()) {
JobInProgress job = iterator.next();
if (LOG.isDebugEnabled()) {
LOG.debug("job:" + job + " numTasks:" + numTasks);
}
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
if (LOG.isDebugEnabled()) {
LOG.debug("Job run state is not running. Skip job");
}
iterator.remove();
continue;
}
if (!loadMgr.canLaunchTask(trackerStatus, job, taskType)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Load manager canLaunchTask returns false. Skip job");
}
continue;
}
// Do not schedule if the pool has reached its maximum number of slots.
JobInfo info = infos.get(job);
if (poolMgr.isMaxTasks(info.poolName, taskType)) {
if (LOG.isDebugEnabled()) {
LOG.debug("pool:" + info.poolName + " is full. Skip job");
}
continue;
}
// Try obtaining a suitable task for this job
Task task = null;
if (taskType == TaskType.MAP) {
LocalityLevel level = getAllowedLocalityLevel(job, currentTime);
if (LOG.isDebugEnabled()) {
LOG.debug("level:" + level);
}
task = job.obtainNewMapTask(trackerStatus,
clusterStatus.getTaskTrackers(),
taskTrackerManager.getNumberOfUniqueHosts(),
level.getCacheLevelCap());
if (task == null) {
info.skippedAtLastHeartbeat = true;
} else {
updateLastMapLocalityLevel(job, task, trackerStatus);
}
} else {
task = job.obtainNewReduceTask(trackerStatus,
clusterStatus.getTaskTrackers(),
taskTrackerManager.getNumberOfUniqueHosts());
}
if (LOG.isDebugEnabled()) {
LOG.debug("task:" + task);
}
// Update scheduling information when a task was obtained
if (task != null) {
// Update the JobInfo for this job so we account for the launched
// tasks during this update interval and don't try to launch more
// tasks than the job needed on future heartbeats
if (taskType == TaskType.MAP) {
info.runningMaps++;
info.neededMaps--;
infosummary.totalRunningMaps++;
infosummary.totalNeededMaps--;
runningMapsOnTT++;
} else {
info.runningReduces++;
info.neededReduces--;
infosummary.totalRunningReduces++;
infosummary.totalNeededReduces--;
runningReducesOnTT++;
}
poolMgr.incRunningTasks(info.poolName, taskType, 1);
tasks.add(task);
numTasks++;
// Remove the scheduled job from the sorted list
iterator.remove();
// Keep track that it needs to be reinserted; we reinsert in LIFO order
// to minimize comparisons
if (neededTasks(info, taskType) > 0)
jobsToReinsert.push(job);
if (!assignMultiple) {
if (jobsToReinsert.size() > 0) {
mergeJobs(jobsToReinsert, taskType);
}
return tasks;
}
if (numTasks >= ((taskType == TaskType.MAP)
? mapPerHeartBeat : reducePerHeartBeat)) {
if (LOG.isDebugEnabled()) {
LOG.debug("numTasks:" + numTasks + " reached tasks per heart beat");
}
break;
}
if (numTasks >= ((taskType == TaskType.MAP)
? availableMapsOnTT : availableReducesOnTT)) {
if (LOG.isDebugEnabled()) {
LOG.debug("numTasks:" + numTasks + " reached available slots.");
}
break;
}
}
}
if (jobsToReinsert.size() > 0)
mergeJobs(jobsToReinsert, taskType);
}
// If no tasks were found, return null
return tasks.isEmpty() ? null : tasks;
}
/**
* Obtain how many more slots can be scheduled on this tasktracker
* @param tts The status of the tasktracker
* @param type The type of the task to be scheduled
* @return the number of tasks that can be scheduled
*/
private int getAvailableSlots(TaskTrackerStatus tts, TaskType type) {
return getMaxSlots(tts, type) - occupiedSlotsAfterKill(tts, type);
}
/**
* Obtain the number of occupied slots after the scheduled kills are done
* @param tts The status of the tasktracker
* @param type The type of the task
* @return the number of occupied slots after kill actions
*/
private int occupiedSlotsAfterKill(TaskTrackerStatus tts, TaskType type) {
int occupied = (type == TaskType.MAP) ?
tts.countOccupiedMapSlots() - tts.getMapsKilled() :
tts.countOccupiedReduceSlots() - tts.getReducesKilled();
return occupied;
}
/**
* Reinsert a set of jobs into the sorted job list for a given task type
* (MAP/REDUCE). The re-insertion happens in place. We exploit the property
* that the jobs being reinserted will most likely end up near the head of
* the sorted list and therefore require few comparisons; a small
* illustration follows the method.
*/
private void mergeJobs(LinkedList<JobInProgress> jobsToReinsert, TaskType taskType) {
LinkedList<JobInProgress> sortedJobs = (taskType == TaskType.MAP) ?
sortedJobsByMapNeed : sortedJobsByReduceNeed;
Comparator<JobInProgress> comparator = (taskType == TaskType.MAP) ?
mapComparator : reduceComparator;
// for each job to be reinserted
for(JobInProgress jobToReinsert: jobsToReinsert) {
// look at existing jobs in the sorted list starting with the head
boolean reinserted = false;
ListIterator<JobInProgress> iter = sortedJobs.listIterator(0);
while (iter.hasNext()) {
JobInProgress job = iter.next();
if (comparator.compare(jobToReinsert, job) < 0) {
// found the point of insertion, move the iterator back one step
iter.previous();
// now we are positioned before the job we compared against
// insert it before this job
iter.add(jobToReinsert);
reinserted = true;
break;
}
}
if (!reinserted) {
sortedJobs.add(jobToReinsert);
}
}
}
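// Illustration (hypothetical jobs, already ordered by the comparator): if the
// sorted list is [B, C, D] and job A, which sorts before B, is reinserted,
// the scan compares A against B, steps the iterator back, and inserts A at
// the head after a single comparison, which is the common case this method
// is optimized for.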
public enum JobComparator {
DEFICIT, FAIR, FIFO;
}
public synchronized JobComparator getJobComparator() {
return jobComparator;
}
public synchronized void setJobComparator(JobComparator jobComparator) {
if (jobComparator != null) {
this.jobComparator = jobComparator;
}
}
/**
* Compare jobs by deficit for a given task type, putting jobs whose current
* allocation is less than their minimum share always ahead of others. This is
* the default job comparator used for Fair Sharing.
*/
private class DeficitComparator implements Comparator<JobInProgress> {
private final TaskType taskType;
private DeficitComparator(TaskType taskType) {
this.taskType = taskType;
}
public int compare(JobInProgress j1, JobInProgress j2) {
// Put needy jobs ahead of non-needy jobs (where needy means must receive
// new tasks to meet slot minimum), comparing among jobs of the same type
// by deficit so as to put jobs with higher deficit ahead.
JobInfo j1Info = infos.get(j1);
JobInfo j2Info = infos.get(j2);
double deficitDif;
boolean job1BelowMinSlots, job2BelowMinSlots;
if (taskType == TaskType.MAP) {
job1BelowMinSlots = j1.runningMaps() < j1Info.minMaps;
job2BelowMinSlots = j2.runningMaps() < j2Info.minMaps;
deficitDif = j2Info.mapDeficit - j1Info.mapDeficit;
} else {
job1BelowMinSlots = j1.runningReduces() < j1Info.minReduces;
job2BelowMinSlots = j2.runningReduces() < j2Info.minReduces;
deficitDif = j2Info.reduceDeficit - j1Info.reduceDeficit;
}
// Compute if the pool minimum slots limit has been achieved
String pool1 = j1Info.poolName;
String pool2 = j2Info.poolName;
boolean pool1BelowMinSlots = poolMgr.getRunningTasks(pool1, taskType) <
poolMgr.getAllocation(pool1, taskType);
boolean pool2BelowMinSlots = poolMgr.getRunningTasks(pool2, taskType) <
poolMgr.getAllocation(pool2, taskType);
// A job is needy only when both the job and pool minimum slots have
// not been reached.
boolean job1Needy = pool1BelowMinSlots && job1BelowMinSlots;
boolean job2Needy = pool2BelowMinSlots && job2BelowMinSlots;
if (job1Needy && !job2Needy) {
return -1;
} else if (job2Needy && !job1Needy) {
return 1;
} else { // Both needy or both non-needy; compare by deficit
return (int) Math.signum(deficitDif);
}
}
}
/**
* Compare jobs by currently running tasks for a given task type. A job that
* is below both its own and its pool's minimum slots is ranked ahead of
* jobs that are not; remaining ties are broken by the ratio of running
* tasks to minimum slots (for jobs below their minimum) or to job weight.
*/
private class FairComparator implements Comparator<JobInProgress> {
private final TaskType taskType;
private FairComparator(TaskType taskType) {
this.taskType = taskType;
}
@Override
public int compare(JobInProgress j1, JobInProgress j2) {
JobInfo j1Info = infos.get(j1);
JobInfo j2Info = infos.get(j2);
int job1RunningTasks, job2RunningTasks;
int job1MinTasks, job2MinTasks;
double job1Weight, job2Weight;
// Get running tasks, minimum tasks and weight based on task type.
if (taskType == TaskType.MAP) {
job1RunningTasks = j1Info.runningMaps;
job1MinTasks = j1Info.minMaps;
job1Weight = j1Info.mapWeight;
job2RunningTasks = j2Info.runningMaps;
job2MinTasks = j2Info.minMaps;
job2Weight = j2Info.mapWeight;
} else {
job1RunningTasks = j1Info.runningReduces;
job1MinTasks = j1Info.minReduces;
job1Weight = j1Info.reduceWeight;
job2RunningTasks = j2Info.runningReduces;
job2MinTasks = j2Info.minReduces;
job2Weight = j2Info.reduceWeight;
}
// Compute the ratio between running tasks and fairshare (or minslots)
boolean job1BelowMinSlots = false, job2BelowMinSlots = false;
double job1RunningTaskRatio, job2RunningTaskRatio;
if (job1RunningTasks < job1MinTasks) {
job1BelowMinSlots = true;
job1RunningTaskRatio = (double)job1RunningTasks /
(double)job1MinTasks;
} else {
job1RunningTaskRatio = (double)job1RunningTasks /
job1Weight;
}
if (job2RunningTasks < job2MinTasks) {
job2BelowMinSlots = true;
job2RunningTaskRatio = (double)job2RunningTasks /
(double)job2MinTasks;
} else {
job2RunningTaskRatio = (double)job2RunningTasks /
job2Weight;
}
// Compute if the pool minimum slots limit has been achieved
String pool1 = j1Info.poolName;
String pool2 = j2Info.poolName;
boolean pool1BelowMinSlots = poolMgr.getRunningTasks(pool1, taskType) <
poolMgr.getAllocation(pool1, taskType);
boolean pool2BelowMinSlots = poolMgr.getRunningTasks(pool2, taskType) <
poolMgr.getAllocation(pool2, taskType);
// A job is needy only when both the job and pool minimum slots have
// not been reached.
boolean job1Needy = pool1BelowMinSlots && job1BelowMinSlots;
boolean job2Needy = pool2BelowMinSlots && job2BelowMinSlots;
if (job1Needy && !job2Needy) {
return -1;
}
if (job2Needy && !job1Needy) {
return 1;
}
// Both needy or both non-needy; compare by running task ratio
if (job1RunningTaskRatio == job2RunningTaskRatio) {
return j1.getJobID().toString().compareTo(j2.getJobID().toString());
}
return job1RunningTaskRatio < job2RunningTaskRatio ? -1 : 1;
}
}
/**
* Update locality wait times for jobs that were skipped at last heartbeat.
*/
private void updateLocalityWaitTimes(long currentTime) {
long timeSinceLastHeartbeat =
(lastHeartbeatTime == 0 ? 0 : currentTime - lastHeartbeatTime);
lastHeartbeatTime = currentTime;
for (JobInfo info: infos.values()) {
if (info.skippedAtLastHeartbeat) {
info.timeWaitedForLocalMap += timeSinceLastHeartbeat;
// We reset the flag so that timeWaitedForLocalMap is incremented only
// once. It will be incremented again if skippedAtLastHeartbeat is set
// to true next time.
info.skippedAtLastHeartbeat = false;
}
}
}
/**
* Update a job's locality level and locality wait variables given that it
* has just launched a map task on a given task tracker.
*/
private void updateLastMapLocalityLevel(JobInProgress job,
Task mapTaskLaunched, TaskTrackerStatus tracker) {
JobInfo info = infos.get(job);
LocalityLevel localityLevel = localManager.taskToLocalityLevel(
job, mapTaskLaunched, tracker);
info.lastMapLocalityLevel = localityLevel;
info.timeWaitedForLocalMap = 0;
}
/**
* Get the maximum locality level at which a given job is allowed to
* launch tasks, based on how long it has been waiting for local tasks.
* This is used to implement the "delay scheduling" feature of the Fair
* Scheduler for optimizing data locality.
* If the job has no locality information (e.g. it does not use HDFS), this
* method returns LocalityLevel.ANY, allowing tasks at any level.
* Otherwise, the job can only launch tasks at its current locality level
* or lower, unless it has waited at least localityDelayNodeLocal or
* localityDelayRackLocal milliseconds, depending on the current level. If it
* has waited (localityDelayNodeLocal + localityDelayRackLocal) milliseconds,
* it can go to any level.
*/
protected LocalityLevel getAllowedLocalityLevel(JobInProgress job,
long currentTime) {
JobInfo info = infos.get(job);
if (info == null) { // Job not in infos (shouldn't happen)
LOG.error("getAllowedLocalityLevel called on job " + job
+ ", which does not have a JobInfo in infos");
return LocalityLevel.ANY;
}
if (job.nonLocalMaps.size() > 0) { // Job doesn't have locality information
return LocalityLevel.ANY;
}
// In the common case, compute locality level based on time waited
switch(info.lastMapLocalityLevel) {
case NODE: // Last task launched was node-local
if (info.timeWaitedForLocalMap >=
(localityDelayNodeLocal + localityDelayRackLocal))
return LocalityLevel.ANY;
else if (info.timeWaitedForLocalMap >= localityDelayNodeLocal)
return LocalityLevel.RACK;
else
return LocalityLevel.NODE;
case RACK: // Last task launched was rack-local
if (info.timeWaitedForLocalMap >= localityDelayRackLocal)
return LocalityLevel.ANY;
else
return LocalityLevel.RACK;
default: // Last task was non-local; can launch anywhere
return LocalityLevel.ANY;
}
}
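// Worked example (hypothetical delays): with localityDelayNodeLocal = 5000ms
// and localityDelayRackLocal = 5000ms, a job whose last map was node-local
// may only launch NODE tasks until it has waited 5000ms, is then allowed
// RACK tasks, and after 10000ms of total waiting may launch at ANY level.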
/**
* Recompute the internal variables used by the scheduler - per-job weights,
* fair shares, deficits, minimum slot allocations, and numbers of running
* and needed tasks of each type.
*/
protected void update() {
// Use finer-grained locking so that clusterStatus can be fetched from the JobTracker.
ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
// Recompute locality delay from JobTracker heartbeat interval if enabled.
// This will also lock the JT, so do it outside of a fair scheduler lock.
if (autoComputeLocalityDelay) {
JobTracker jobTracker = (JobTracker) taskTrackerManager;
localityDelayNodeLocal = Math.min(MAX_AUTOCOMPUTED_LOCALITY_DELAY,
(long) (1.5 * jobTracker.getNextHeartbeatInterval()));
localityDelayRackLocal = localityDelayNodeLocal;
}
// Got clusterStatus, so acquire the scheduler lock now
// Remove non-running jobs
synchronized (this) {
// Reload allocations file if it hasn't been loaded in a while
if (poolMgr.reloadAllocsIfNecessary()) {
// Check if the cluster has enough slots to reserve
poolMgr.checkMinimumSlotsAvailable(clusterStatus, TaskType.MAP);
poolMgr.checkMinimumSlotsAvailable(clusterStatus, TaskType.REDUCE);
}
List<JobInProgress> toRemove = new ArrayList<JobInProgress>();
for (JobInProgress job: infos.keySet()) {
int runState = job.getStatus().getRunState();
if (runState == JobStatus.SUCCEEDED || runState == JobStatus.FAILED
|| runState == JobStatus.KILLED) {
toRemove.add(job);
}
}
for (JobInProgress job: toRemove) {
infos.remove(job);
poolMgr.removeJob(job);
}
// Update running jobs with deficits since last update, and compute new
// slot allocations, weights, shares and task counts
long now = clock.getTime();
long timeDelta = now - lastUpdateTime;
updateDeficits(timeDelta);
updateRunnability();
updateTaskCounts();
updateWeights();
updateMinAndMaxSlots();
updateFairShares(clusterStatus);
if (preemptionEnabled) {
updatePreemptionVariables();
}
sortJobs();
dumpStatus(now);
lastUpdateTime = now;
}
}
/**
* Output some scheduling information to LOG
* @param now current unix time
*/
private void dumpStatus(long now) {
if (now - lastDumpStatusTime < dumpStatusPeriod) {
return;
}
lastDumpStatusTime = now;
dumpSpeculationStatus(now);
}
private void dumpSpeculationStatus(long now) {
final long TASK_INFO_DUMP_DELAY = 1200000; // 20 minutes
for (JobInProgress job : infos.keySet()) {
for (TaskType type : MAP_AND_REDUCE) {
boolean isMap = (type == TaskType.MAP);
if (!isMap && job.desiredReduces() <= 0)
continue;
if ((isMap && !job.hasSpeculativeMaps()) ||
(!isMap && !job.hasSpeculativeReduces()))
continue;
DataStatistics taskStats =
job.getRunningTaskStatistics(isMap);
LOG.info(job.getJobID().toString() + " taskStats : " + taskStats);
for (TaskInProgress tip :
job.getTasks(isMap ? org.apache.hadoop.mapreduce.TaskType.MAP :
org.apache.hadoop.mapreduce.TaskType.REDUCE)) {
if (!tip.isComplete() &&
now - tip.getLastDispatchTime() > TASK_INFO_DUMP_DELAY) {
double currProgRate = tip.getProgressRate();
TreeMap<TaskAttemptID, String> activeTasks = tip.getActiveTasks();
if (activeTasks.isEmpty()) {
continue;
}
boolean canBeSpeculated = tip.canBeSpeculated(now);
LOG.info(activeTasks.firstKey() +
" activeTasks.size():" + activeTasks.size() +
" task's progressrate:" + currProgRate +
" canBeSpeculated:" + canBeSpeculated);
}
}
}
}
}
private void sortJobs() {
for (TaskType taskType: MAP_AND_REDUCE) {
// Sort jobs by deficit (for Fair Sharing), submit time (for FIFO) or
// current running task ratio
Comparator<JobInProgress> comparator;
switch(jobComparator) {
case FAIR:
comparator = new FairComparator(taskType);
break;
case FIFO:
comparator = new FifoJobComparator();
break;
default:
comparator = new DeficitComparator(taskType);
}
// Figure out the jobs that need this type of task
LinkedList<JobInProgress> sortedJobs = new LinkedList<JobInProgress>();
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo jobInfo = entry.getValue();
if (job.getStatus().getRunState() == JobStatus.RUNNING &&
neededTasks(jobInfo, taskType) > 0) {
sortedJobs.add(job);
}
}
Collections.sort(sortedJobs, comparator);
if (taskType == TaskType.MAP) {
sortedJobsByMapNeed = sortedJobs;
mapComparator = comparator;
} else {
sortedJobsByReduceNeed = sortedJobs;
reduceComparator = comparator;
}
}
}
private void logJobStats(List<JobInProgress> jobs, TaskType type) {
if (jobs.isEmpty())
return;
StringBuilder sb = new StringBuilder("JobStats for type:" + type + "\t");
for (JobInProgress job: jobs) {
JobInfo info = infos.get(job);
sb.append("Job:" + job.getJobID().toString());
sb.append(",runningTasks:" + runningTasks(info, type));
sb.append(",minTasks:" + minTasks(info, type));
sb.append(",weight:" + weight(info, type));
sb.append(",fairTasks:" + fairTasks(info, type));
sb.append(",neededTasks:" + neededTasks(info, type));
sb.append("\t");
}
LOG.info(sb.toString());
}
private void updateDeficits(long timeDelta) {
for (JobInfo info: infos.values()) {
info.mapDeficit +=
(info.mapFairShare - info.runningMaps) * timeDelta;
info.reduceDeficit +=
(info.reduceFairShare - info.runningReduces) * timeDelta;
}
}
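// Worked example (hypothetical numbers): a job with mapFairShare = 10 but
// only 4 running maps gains (10 - 4) * timeDelta of map deficit, i.e.
// 3000 slot-ms over a 500ms update interval, moving it ahead of jobs that
// have been running at or above their fair share under DeficitComparator.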
private void updateRunnability() {
// Start by marking everything as not runnable
for (JobInfo info: infos.values()) {
info.runnable = false;
}
// Create a list of sorted jobs in order of start time and priority
List<JobInProgress> jobs = new ArrayList<JobInProgress>(infos.keySet());
Collections.sort(jobs, new FifoJobComparator());
// Mark jobs as runnable in order of start time and priority, until
// user or pool limits have been reached.
Map<String, Integer> userJobs = new HashMap<String, Integer>();
Map<String, Integer> poolJobs = new HashMap<String, Integer>();
Map<String, Integer> poolTasks = new HashMap<String, Integer>();
Set<JobInProgress> couldBeInitialized = new HashSet<JobInProgress>();
for (JobInProgress job: jobs) {
String user = job.getJobConf().getUser();
String pool = poolMgr.getPoolName(job);
int userCount = userJobs.containsKey(user) ? userJobs.get(user) : 0;
int poolCount = poolJobs.containsKey(pool) ? poolJobs.get(pool) : 0;
int poolTaskCount = poolTasks.containsKey(pool) ? poolTasks.get(pool) : 0;
if (userCount < poolMgr.getUserMaxJobs(user) &&
poolCount < poolMgr.getPoolMaxJobs(pool) &&
poolTaskCount < poolMgr.getPoolMaxInitedTasks(pool)) {
if (job.getStatus().getRunState() == JobStatus.RUNNING ||
job.getStatus().getRunState() == JobStatus.PREP) {
userJobs.put(user, userCount + 1);
poolJobs.put(pool, poolCount + 1);
poolTasks.put(pool, poolTaskCount + infos.get(job).totalInitedTasks);
JobInfo jobInfo = infos.get(job);
if (job.getStatus().getRunState() == JobStatus.RUNNING) {
jobInfo.runnable = true;
} else {
// The job is in the PREP state. Give it to the job initializer
// for initialization if we have not already done it.
if (jobInfo.needsInitializing) {
jobInfo.needsInitializing = false;
jobInitializer.initJob(jobInfo, job);
}
}
}
}
}
}
private void updateTaskCounts() {
poolMgr.resetRunningTasks(TaskType.MAP);
poolMgr.resetRunningTasks(TaskType.REDUCE);
infosummary.reset();
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
if (job.getStatus().getRunState() != JobStatus.RUNNING)
continue; // Job is still in PREP state and tasks aren't initialized
// Count maps
int totalMaps = job.numMapTasks;
int finishedMaps = 0;
int runningMaps = 0;
int runningMapTips = 0;
for (TaskInProgress tip :
job.getTasks(org.apache.hadoop.mapreduce.TaskType.MAP)) {
if (tip.isComplete()) {
finishedMaps += 1;
} else if (tip.isRunning()) {
runningMaps += tip.getActiveTasks().size();
runningMapTips += 1;
}
}
info.totalInitedTasks = job.numMapTasks + job.numReduceTasks;
info.runningMaps = runningMaps;
infosummary.totalRunningMaps += runningMaps;
poolMgr.incRunningTasks(info.poolName, TaskType.MAP, runningMaps);
info.neededSpeculativeMaps = taskSelector.neededSpeculativeMaps(job);
info.neededMaps = (totalMaps - runningMapTips - finishedMaps
+ info.neededSpeculativeMaps);
// Count reduces
int totalReduces = job.numReduceTasks;
int finishedReduces = 0;
int runningReduces = 0;
int runningReduceTips = 0;
for (TaskInProgress tip :
job.getTasks(org.apache.hadoop.mapreduce.TaskType.REDUCE)) {
if (tip.isComplete()) {
finishedReduces += 1;
} else if (tip.isRunning()) {
runningReduces += tip.getActiveTasks().size();
runningReduceTips += 1;
}
}
info.runningReduces = runningReduces;
infosummary.totalRunningReduces += runningReduces;
poolMgr.incRunningTasks(info.poolName, TaskType.REDUCE, runningReduces);
if (job.scheduleReduces()) {
info.neededSpeculativeReduces =
taskSelector.neededSpeculativeReduces(job);
info.neededReduces = (totalReduces - runningReduceTips - finishedReduces
+ info.neededSpeculativeReduces);
} else {
info.neededReduces = 0;
}
// If the job was marked as not runnable due to its user or pool having
// too many active jobs, set the neededMaps/neededReduces to 0. We still
// count runningMaps/runningReduces however so we can give it a deficit.
if (!info.runnable) {
info.neededMaps = 0;
info.neededReduces = 0;
}
infosummary.totalNeededMaps += info.neededMaps;
infosummary.totalNeededReduces += info.neededReduces;
}
}
private void updateWeights() {
// First, calculate raw weights for each job
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
info.mapWeight = calculateRawWeight(job, TaskType.MAP);
info.reduceWeight = calculateRawWeight(job, TaskType.REDUCE);
}
// Adjust job weights toward FIFO within pools configured for it
for (Pool pool : poolMgr.getPools()) {
if (poolMgr.fifoWeight(pool.getName())) {
fifoWeightAdjust(pool);
}
}
// Now calculate job weight sums for each pool
Map<String, Double> mapWeightSums = new HashMap<String, Double>();
Map<String, Double> reduceWeightSums = new HashMap<String, Double>();
for (Pool pool: poolMgr.getPools()) {
double mapWeightSum = 0;
double reduceWeightSum = 0;
for (JobInProgress job: pool.getJobs()) {
JobInfo info = infos.get(job);
if (isRunnable(info)) {
if (runnableTasks(info, TaskType.MAP) > 0) {
mapWeightSum += info.mapWeight;
}
if (runnableTasks(info, TaskType.REDUCE) > 0) {
reduceWeightSum += info.reduceWeight;
}
}
}
mapWeightSums.put(pool.getName(), mapWeightSum);
reduceWeightSums.put(pool.getName(), reduceWeightSum);
}
// And normalize the weights based on pool sums and pool weights
// to share fairly across pools (proportional to their weights)
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
String pool = poolMgr.getPoolName(job);
double poolWeight = poolMgr.getPoolWeight(pool);
double mapWeightSum = mapWeightSums.get(pool);
double reduceWeightSum = reduceWeightSums.get(pool);
if (mapWeightSum == 0)
info.mapWeight = 0;
else
info.mapWeight *= (poolWeight / mapWeightSum);
if (reduceWeightSum == 0)
info.reduceWeight = 0;
else
info.reduceWeight *= (poolWeight / reduceWeightSum);
}
}
/**
* Boost the weights of older jobs in the pool.
*/
private void fifoWeightAdjust(Pool pool) {
List<JobInProgress> jobs = new ArrayList<JobInProgress>();
jobs.addAll(pool.getJobs());
Collections.sort(jobs, new FifoJobComparator());
double factor = 1.0;
for (JobInProgress job : jobs) {
JobInfo info = infos.get(job);
info.mapWeight *= factor;
info.reduceWeight *= factor;
factor *= FIFO_WEIGHT_DECAY_FACTOR;
}
}
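// Illustration: with FIFO_WEIGHT_DECAY_FACTOR = 0.5, the oldest job in the
// pool keeps its raw weight (factor 1.0), the second-oldest gets 0.5x, the
// third 0.25x, and so on, approximating FIFO while still giving newer jobs
// a small share.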
private void updateMinAndMaxSlots() {
for (TaskType type : MAP_AND_REDUCE) {
for (Pool pool : poolMgr.getPools()) {
updateMinSlots(pool, type);
updateMaxSlots(pool, type);
}
}
}
/**
* Compute the min slots for each job. This is done by distributing the
* configured minSlots of each pool to the jobs inside that pool.
*/
private void updateMinSlots(final Pool pool, final TaskType type) {
// Find the proper ratio of (# of minSlots / weight) by binary search
BinarySearcher searcher = new BinarySearcher() {
@Override
double targetFunction(double x) {
return poolSlotsUsedWithWeightToSlotRatio(pool, x, type, false);
}
};
int total = poolMgr.getAllocation(pool.getName(), type);
double ratio = searcher.getSolution(total);
int leftOver = total;
List<JobInfo> candidates = new LinkedList<JobInfo>();
for (JobInProgress job : pool.getJobs()) {
JobInfo info = infos.get(job);
int slots = (int)Math.floor(computeShare(info, ratio, type, false));
candidates.add(info);
leftOver -= slots;
setMinSlots(info, slots, type);
}
// Assign the left over slots
for (int i = 0; i < leftOver && !candidates.isEmpty(); ++i) {
JobInfo info = candidates.remove(0);
if (incMinSlots(info, type)) {
candidates.add(info);
}
}
}
private static void setMinSlots(JobInfo info, int slots, TaskType type) {
if (type == TaskType.MAP) {
info.minMaps = slots;
} else {
info.minReduces = slots;
}
}
private boolean incMinSlots(JobInfo info, TaskType type) {
if (type == TaskType.MAP) {
if (info.minMaps < runnableTasks(info, type)) {
info.minMaps += 1;
return true;
}
} else {
if (info.minReduces < runnableTasks(info, type)) {
info.minReduces += 1;
return true;
}
}
return false;
}
/**
* Compute the max slots for each job. This is done by distributing the
* configured max of each pool to the jobs inside that pool.
*/
private void updateMaxSlots(final Pool pool, final TaskType type) {
// Find the proper ratio of (# of maxSlots / weight) by binary search
BinarySearcher searcher = new BinarySearcher() {
@Override
double targetFunction(double x) {
return poolSlotsUsedWithWeightToSlotRatio(pool, x, type, false);
}
};
int total = poolMgr.getMaxSlots(pool.getName(), type);
double ratio = searcher.getSolution(total);
int leftOver = total;
List<JobInfo> candidates = new LinkedList<JobInfo>();
for (JobInProgress job : pool.getJobs()) {
JobInfo info = infos.get(job);
int slots = (int)Math.floor(computeShare(info, ratio, type, false));
candidates.add(info);
leftOver -= slots;
setMaxSlots(info, slots, type);
}
// Assign the left over slots
for (int i = 0; i < leftOver && !candidates.isEmpty(); ++i) {
JobInfo info = candidates.remove(0);
if (incMaxSlots(info, type)) {
candidates.add(info);
}
}
}
private static void setMaxSlots(JobInfo info, int slots, TaskType type) {
if (type == TaskType.MAP) {
info.maxMaps = slots;
} else {
info.maxReduces = slots;
}
}
private boolean incMaxSlots(JobInfo info, TaskType type) {
if (type == TaskType.MAP) {
if (info.maxMaps < runnableTasks(info, type)) {
info.maxMaps += 1;
return true;
}
} else {
if (info.maxReduces < runnableTasks(info, type)) {
info.maxReduces += 1;
return true;
}
}
return false;
}
/**
* Compute the number of slots that would be used given a weight-to-slot
* ratio w2sRatio.
*/
private double poolSlotsUsedWithWeightToSlotRatio(
Pool pool, double w2sRatio, TaskType type, boolean considerMinMax) {
double slotsTaken = 0;
for (JobInProgress job : pool.getJobs()) {
JobInfo info = infos.get(job);
slotsTaken += computeShare(
info, w2sRatio, type, considerMinMax);
}
return slotsTaken;
}
private void updateFairShares(ClusterStatus clusterStatus) {
double totalMaps = getTotalSlots(TaskType.MAP, clusterStatus);
updateFairShares(totalMaps, TaskType.MAP);
double totalReduces = getTotalSlots(TaskType.REDUCE, clusterStatus);
updateFairShares(totalReduces, TaskType.REDUCE);
}
/**
* Update the fair share for each JobInfo based on its weight, neededTasks,
* minTasks, and the size of the pool. We compute the share by finding the
* ratio of (# of slots / weight) using binary search.
*/
private void updateFairShares(double totalSlots, final TaskType type) {
// Find the proper ratio of (# of slots / weight) by binary search
BinarySearcher searcher = new BinarySearcher() {
@Override
double targetFunction(double x) {
return slotsUsedWithWeightToSlotRatio(x, type);
}
};
double ratio = searcher.getSolution(totalSlots);
// Set the fair shares based on the value of R we've converged to
for (JobInfo info : infos.values()) {
if (type == TaskType.MAP) {
info.mapFairShare = computeShare(info, ratio, type);
} else {
info.reduceFairShare = computeShare(info, ratio, type);
}
}
}
/**
* Compute the number of slots that would be used given a weight-to-slot
* ratio w2sRatio.
*/
private double slotsUsedWithWeightToSlotRatio(double w2sRatio, TaskType type) {
double slotsTaken = 0;
for (JobInfo info : infos.values()) {
slotsTaken += computeShare(info, w2sRatio, type);
}
return slotsTaken;
}
private double computeShare(
JobInfo info, double w2sRatio, TaskType type) {
return computeShare(info, w2sRatio, type, true);
}
/**
* Compute the number of slots assigned to a job given a particular
* weight-to-slot ratio w2sRatio.
*/
private double computeShare(JobInfo info, double w2sRatio,
TaskType type, boolean considerMinMax) {
if (!isRunnable(info)) {
return 0;
}
double share = type == TaskType.MAP ? info.mapWeight : info.reduceWeight;
share *= w2sRatio;
if (considerMinMax) {
int minSlots = type == TaskType.MAP ? info.minMaps : info.minReduces;
share = Math.max(share, minSlots);
int maxSlots = type == TaskType.MAP ? info.maxMaps : info.maxReduces;
share = Math.min(share, maxSlots);
}
share = Math.min(share, runnableTasks(info, type));
return share;
}
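// Worked example (hypothetical numbers, assuming maxMaps is not the binding
// limit): with w2sRatio = 4.0 and mapWeight = 2.0 the raw share is 8 slots;
// a minMaps of 10 raises it to 10, and a job with only 6 runnable maps is
// finally capped at a share of 6.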
/**
* Given a targetFunction and a targetValue, find a positive number x so that
* targetFunction(x) == targetValue approximately
*/
abstract class BinarySearcher {
final static int MAXIMUM_ITERATION = 25;
final static double ERROR_ALLOW_WHEN_COMPARE_FLOATS = 1e-8;
abstract double targetFunction(double x);
double getSolution(double targetValue) {
double rMax = 1.0;
double oldValue = -1;
for (int i = 0; i < MAXIMUM_ITERATION; ++i) {
double value = targetFunction(rMax);
if (value >= targetValue) {
break;
}
if (equals(value, oldValue)) {
return rMax; // Target value is not feasible. Just return rMax
}
oldValue = value;
rMax *= 2;
}
double left = 0, right = rMax;
for (int i = 0; i < MAXIMUM_ITERATION; ++i) {
double mid = (left + right) / 2.0;
double value = targetFunction(mid);
if (equals(value, targetValue)) {
return mid;
}
if (value < targetValue) {
left = mid;
} else {
right = mid;
}
}
return right;
}
private boolean equals(double x, double y) {
return Math.abs(x - y) < ERROR_ALLOW_WHEN_COMPARE_FLOATS;
}
}
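// A minimal usage sketch (hypothetical target function, mirroring how
// updateFairShares() instantiates this class):
//
//   BinarySearcher doubler = new BinarySearcher() {
//     @Override
//     double targetFunction(double x) { return 2 * x; }
//   };
//   double x = doubler.getSolution(10.0); // converges to ~5.0
//
// getSolution() first doubles rMax until targetFunction(rMax) reaches the
// target (or stops changing), then bisects on [0, rMax] for up to
// MAXIMUM_ITERATION rounds.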
private double calculateRawWeight(JobInProgress job, TaskType taskType) {
if (!isRunnable(job)) {
return 0;
} else {
double weight = 1.0;
if (sizeBasedWeight) {
// Set weight based on runnable tasks
weight = Math.log1p(runnableTasks(job, taskType)) / Math.log(2);
}
weight *= job.getPriority().getFactor();
if (weightAdjuster != null) {
// Run weight through the user-supplied weightAdjuster
weight = weightAdjuster.adjustWeight(job, taskType, weight);
}
return weight;
}
}
/**
* Returns the LoadManager object used by the Fair Share scheduler
*/
public LoadManager getLoadManager() {
return loadMgr;
}
public PoolManager getPoolManager() {
return poolMgr;
}
private int getTotalSlots(TaskType type, ClusterStatus clusterStatus) {
return (type == TaskType.MAP ?
clusterStatus.getMaxMapTasks() : clusterStatus.getMaxReduceTasks());
}
// Getter methods for reading JobInfo values based on TaskType, safely
// returning 0's for jobs with no JobInfo present.
protected int neededTasks(JobInfo info, TaskType taskType) {
if (info == null) return 0;
return taskType == TaskType.MAP ? info.neededMaps : info.neededReduces;
}
protected int runningTasks(JobInfo info, TaskType taskType) {
if (info == null) return 0;
return taskType == TaskType.MAP ? info.runningMaps : info.runningReduces;
}
protected int minTasks(JobInfo info, TaskType type) {
if (info == null) return 0;
return (type == TaskType.MAP) ? info.minMaps : info.minReduces;
}
protected double weight(JobInfo info, TaskType type) {
if (info == null) return 0;
return (type == TaskType.MAP) ? info.mapWeight : info.reduceWeight;
}
protected int neededTasks(JobInProgress job, TaskType taskType) {
JobInfo info = infos.get(job);
return neededTasks(info, taskType);
}
protected int runningTasks(JobInProgress job, TaskType taskType) {
JobInfo info = infos.get(job);
return runningTasks(info, taskType);
}
protected int runnableTasks(JobInfo info, TaskType type) {
return neededTasks(info, type) + runningTasks(info, type);
}
protected int runnableTasks(JobInProgress job, TaskType type) {
JobInfo info = infos.get(job);
return neededTasks(info, type) + runningTasks(info, type);
}
protected int minTasks(JobInProgress job, TaskType type) {
JobInfo info = infos.get(job);
return minTasks(info, type);
}
protected double weight(JobInProgress job, TaskType taskType) {
JobInfo info = infos.get(job);
return weight(info, taskType);
}
protected double deficit(JobInProgress job, TaskType taskType) {
JobInfo info = infos.get(job);
if (info == null) return 0;
return taskType == TaskType.MAP ? info.mapDeficit : info.reduceDeficit;
}
protected static boolean isRunnable(JobInfo info) {
if (info == null) return false;
return info.runnable;
}
protected boolean isRunnable(JobInProgress job) {
JobInfo info = infos.get(job);
return isRunnable(info);
}
@Override
public synchronized Collection<JobInProgress> getJobs(String queueName) {
Pool myJobPool = poolMgr.getPool(queueName);
return myJobPool.getJobs();
}
public int getMapPerHeartBeat() {
return mapPerHeartBeat;
}
public void setMapPerHeartBeat(int mapPerHeartBeat) {
LOG.info("The allowed Mapers per heartbeat has been changed to " +
mapPerHeartBeat);
this.mapPerHeartBeat = mapPerHeartBeat;
}
public int getReducePerHeartBeat() {
return reducePerHeartBeat;
}
public void setReducePerHeartBeat(int reducePerHeartBeat) {
LOG.info("The allowed Reducers per heartbeat has been changed to " +
reducePerHeartBeat);
this.reducePerHeartBeat = reducePerHeartBeat;
}
public void setLocalityDelayRackLocal(long localityDelay) {
this.localityDelayRackLocal = localityDelay;
}
public long getLocalityDelayRackLocal() {
return localityDelayRackLocal;
}
public void setLocalityDelayNodeLocal(long localityDelay) {
this.localityDelayNodeLocal = localityDelay;
}
public long getLocalityDelayNodeLocal() {
return localityDelayNodeLocal;
}
public boolean isPreemptionEnabled() {
return preemptionEnabled;
}
public void setPreemptionEnabled(boolean preemptionEnabled) {
this.preemptionEnabled = preemptionEnabled;
}
/**
* Update the preemption JobInfo fields for all jobs, i.e. the times since
* each job was last at its guaranteed share and at > 1/2 of its fair share
* for each type of task.
*/
private void updatePreemptionVariables() {
long now = clock.getTime();
for (Map.Entry<JobInProgress, JobInfo> entry: infos.entrySet()) {
JobInProgress job = entry.getKey();
JobInfo info = entry.getValue();
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
// Job is still in PREP state and tasks aren't initialized. Count it as
// both at min and fair share since we shouldn't start any timeouts now.
info.lastTimeAtMapMinShare = now;
info.lastTimeAtReduceMinShare = now;
info.lastTimeAtMapHalfFairShare = now;
info.lastTimeAtReduceHalfFairShare = now;
} else {
if (!isStarvedForMinShare(info, TaskType.MAP))
info.lastTimeAtMapMinShare = now;
if (!isStarvedForMinShare(info, TaskType.REDUCE))
info.lastTimeAtReduceMinShare = now;
if (!isStarvedForFairShare(info, TaskType.MAP))
info.lastTimeAtMapHalfFairShare = now;
if (!isStarvedForFairShare(info, TaskType.REDUCE))
info.lastTimeAtReduceHalfFairShare = now;
}
}
}
/**
* Is a job below 90% of its min share for the given task type?
*/
boolean isStarvedForMinShare(JobInfo info, TaskType taskType) {
float starvingThreshold = (float) (minTasks(info, taskType) * 0.9);
return runningTasks(info, taskType) < starvingThreshold;
}
/**
* Is a job being starved for fair share for the given task type?
* This is defined as running fewer tasks than half its fair share, where
* the fair share is capped by the job's runnable tasks.
*/
boolean isStarvedForFairShare(JobInfo info, TaskType type) {
int desiredFairShare = (int) Math.floor(Math.min(
fairTasks(info, type) / 2, runnableTasks(info, type)));
return (runningTasks(info, type) < desiredFairShare);
}
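// Worked example (hypothetical numbers): a job with a map fair share of 8
// and 12 runnable maps has desiredFairShare = floor(min(8 / 2, 12)) = 4,
// so it is considered starved once fewer than 4 of its maps are running.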
/**
* Check for jobs that need tasks preempted, either because they have been
* below their guaranteed share for their pool's preemptionTimeout or they
* have been below half their fair share for the fairSharePreemptionTimeout.
* If such jobs exist, compute how many tasks of each type need to be
* preempted and then select the right ones using selectTasksToPreempt.
*
* This method computes and logs the number of tasks we want to preempt even
* if preemption is disabled, for debugging purposes.
*/
protected void preemptTasksIfNecessary() {
if (!preemptionEnabled || jobComparator == JobComparator.FIFO)
return;
long curTime = clock.getTime();
if (curTime - lastPreemptCheckTime < preemptionInterval)
return;
lastPreemptCheckTime = curTime;
// Acquire locks on both the JobTracker (task tracker manager) and this
// because we might need to call some JobTracker methods (killTask).
synchronized (taskTrackerManager) {
synchronized (this) {
List<JobInProgress> jobs = new ArrayList<JobInProgress>(infos.keySet());
for (TaskType type: MAP_AND_REDUCE) {
int tasksToPreempt = 0;
for (JobInProgress job: jobs) {
tasksToPreempt += tasksToPreempt(job, type, curTime);
}
if (tasksToPreempt > 0) {
// for debugging purposes log the jobs by scheduling priority
// to check whether preemption and scheduling are in sync.
logJobStats(sortedJobsByMapNeed, TaskType.MAP);
logJobStats(sortedJobsByReduceNeed, TaskType.REDUCE);
}
// Actually preempt the tasks. The policy for this is to pick
// tasks from jobs that are above their min share and have very
// negative deficits (meaning they've been over-scheduled).
// However, we also want to minimize the amount of computation
// wasted by preemption, so prefer tasks that started recently.
preemptTasks(jobs, type, tasksToPreempt);
}
}
}
}
  /**
   * Count how many tasks of a given type need to be preempted on the job's
   * behalf, if any. If the job has been below its min share for at least its
   * pool's minSharePreemptionTimeout, enough tasks should be preempted to
   * bring it from its current share up to its min share. If it has been
   * below half its fair share for at least the fairSharePreemptionTimeout,
   * enough tasks should be preempted to bring it up to its full fair share.
   * If both conditions hold, we preempt the max of the two amounts (this
   * shouldn't happen unless someone sets the timeouts to be identical for
   * some reason).
   */
protected int tasksToPreempt(JobInProgress job, TaskType type, long curTime) {
JobInfo info = infos.get(job);
if (info == null || poolMgr.isMaxTasks(info.poolName, type)) return 0;
String pool = info.poolName;
long minShareTimeout = poolMgr.getMinSharePreemptionTimeout(pool);
long fairShareTimeout = poolMgr.getFairSharePreemptionTimeout();
int tasksDueToMinShare = 0;
int tasksDueToFairShare = 0;
boolean poolBelowMinSlots = poolMgr.getRunningTasks(pool, type) <
poolMgr.getAllocation(pool, type);
if (type == TaskType.MAP) {
if (curTime - info.lastTimeAtMapMinShare > minShareTimeout &&
poolBelowMinSlots) {
tasksDueToMinShare = info.minMaps - info.runningMaps;
}
if (curTime - info.lastTimeAtMapHalfFairShare > fairShareTimeout) {
double fairShare = Math.min(info.mapFairShare,
runnableTasks(info, type));
tasksDueToFairShare = (int) (fairShare - info.runningMaps);
}
} else { // type == TaskType.REDUCE
if (curTime - info.lastTimeAtReduceMinShare > minShareTimeout &&
poolBelowMinSlots) {
tasksDueToMinShare = info.minReduces - info.runningReduces;
}
if (curTime - info.lastTimeAtReduceHalfFairShare > fairShareTimeout) {
double fairShare = Math.min(info.reduceFairShare,
runnableTasks(info, type));
tasksDueToFairShare = (int) (fairShare - info.runningReduces);
}
}
int tasksToPreempt = Math.max(tasksDueToMinShare, tasksDueToFairShare);
int neededNonSpeculativeTasks = type == TaskType.MAP ?
info.neededMaps - info.neededSpeculativeMaps :
info.neededReduces - info.neededSpeculativeReduces;
// We do not preempt for speculative execution tasks
tasksToPreempt = Math.min(neededNonSpeculativeTasks, tasksToPreempt);
if (tasksToPreempt > 0) {
String message = "Should preempt " + tasksToPreempt + " "
+ type + " tasks for " + job.getJobID()
+ ": tasksDueToMinShare = " + tasksDueToMinShare
+ ", tasksDueToFairShare = " + tasksDueToFairShare
+ ", runningTasks = " + runningTasks(info, type);
LOG.info(message);
}
return tasksToPreempt < 0 ? 0 : tasksToPreempt;
}
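  // Worked example (illustrative numbers only): a job with minMaps = 20,
  // runningMaps = 5, mapFairShare = 30, and 40 runnable maps that has
  // exceeded both timeouts gets tasksDueToMinShare = 20 - 5 = 15 and
  // tasksDueToFairShare = (int) (min(30, 40) - 5) = 25, so we request
  // max(15, 25) = 25 map preemptions (further capped by the job's
  // non-speculative task need).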
/**
* Can we preempt tasks from this job?
*/
private boolean canBePreempted(JobInProgress job) {
return poolMgr.canBePreempted(infos.get(job).poolName);
}
  /**
   * Preempt up to maxToPreempt tasks of the given type.
   * Selects the most recently launched tasks first, thus minimizing the
   * amount of already-completed work that is wasted.
   */
private void preemptTasks(Collection<JobInProgress> jobs,
TaskType type, int maxToPreempt) {
if (maxToPreempt <= 0) {
return;
}
Set<TaskInProgress> tips = new HashSet<TaskInProgress>();
Map<JobInProgress, Integer> tasksCanBePreempted =
new HashMap<JobInProgress, Integer>();
    // Collect the running tasks that can be preempted
for (JobInProgress job : jobs) {
if (!canBePreempted(job)) {
continue;
}
int runningTasks = runningTasks(job, type);
int minTasks = minTasks(job, type);
int desiredFairShare = (int) Math.floor(Math.min(
fairTasks(job, type), runnableTasks(job, type)));
int tasksToLeave = Math.max(desiredFairShare, minTasks);
int tasksCanBePreemptedCurrent = runningTasks - tasksToLeave;
if (tasksCanBePreemptedCurrent <= 0) {
continue;
}
tasksCanBePreempted.put(job, tasksCanBePreemptedCurrent);
if (type == TaskType.MAP) {
        // Jobs may have both "non-local" maps, whose splits carry no
        // locality info (e.g. the input file is not in HDFS), and maps with
        // locality info, which are stored in the runningMapCache map from
        // location to task list.
tips.addAll(job.nonLocalRunningMaps);
for (Set<TaskInProgress> set: job.runningMapCache.values()) {
tips.addAll(set);
}
}
else {
tips.addAll(job.runningReduces);
}
}
    // Get the active TaskStatus objects for each TaskInProgress (there may
    // be more than one if the task has multiple attempts running due to
    // speculative execution)
List<TaskStatus> statuses = new ArrayList<TaskStatus>();
for (TaskInProgress tip: tips) {
for (TaskAttemptID id: tip.getActiveTasks().keySet()) {
TaskStatus stat = tip.getTaskStatus(id);
// status is null when the task has been scheduled but not yet running
if (stat != null) {
statuses.add(stat);
}
}
}
// Sort the statuses in order of start time, with the latest launched first
Collections.sort(statuses, new Comparator<TaskStatus>() {
public int compare(TaskStatus t1, TaskStatus t2) {
return (int) Math.signum(t2.getStartTime() - t1.getStartTime());
}
});
Map<JobInProgress, Integer> tasksPreempted =
new HashMap<JobInProgress, Integer>();
for (TaskStatus status : statuses) {
if (maxToPreempt <= 0) {
break;
}
JobID jobId = status.getTaskID().getJobID();
JobInProgress job = taskTrackerManager.getJob(jobId);
      Integer remaining = tasksCanBePreempted.get(job);
      // Skip jobs we did not mark as preemptable; checking for null also
      // avoids a NullPointerException from unboxing a missing map entry.
      if (remaining == null || remaining <= 0) {
        continue;
      }
try {
LOG.info("Preempt task: " + status.getTaskID());
taskTrackerManager.killTask(status.getTaskID(), false);
preemptTaskUpdateMetric(type, status.getTaskID());
tasksCanBePreempted.put(job, tasksCanBePreempted.get(job) - 1);
Integer count = tasksPreempted.get(job);
if (count == null) {
count = 0;
}
tasksPreempted.put(job, count + 1);
maxToPreempt--;
} catch (IOException e) {
LOG.error("Failed to kill task " + status.getTaskID(), e);
}
}
for (JobInProgress job : tasksPreempted.keySet()) {
int runningTasks = runningTasks(job, type);
int minTasks = minTasks(job, type);
int desiredFairShare = (int) Math.floor(Math.min(
fairTasks(job, type), runnableTasks(job, type)));
LOG.info("Job " + job.getJobID() + " was preempted for "
+ (type == TaskType.MAP ? "map" : "reduce")
+ ": tasksPreempted = " + tasksPreempted.get(job)
+ ", fairShare = " + desiredFairShare
+ ", minSlots = " + minTasks
+ ", runningTasks = " + runningTasks);
}
}
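  // The per-pool knobs consulted above come from PoolManager's allocation
  // file. A minimal sketch of the relevant elements (the pool name and all
  // values are made up for illustration; timeouts in the allocation file
  // are given in seconds):
  //
  //   <allocations>
  //     <pool name="prod">
  //       <minMaps>20</minMaps>
  //       <minReduces>10</minReduces>
  //       <minSharePreemptionTimeout>60</minSharePreemptionTimeout>
  //     </pool>
  //     <fairSharePreemptionTimeout>600</fairSharePreemptionTimeout>
  //   </allocations>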
  private void preemptTaskUpdateMetric(TaskType type, TaskAttemptID id) {
    if (fairSchedulerMetrics != null) {
      if (type == TaskType.MAP) {
        fairSchedulerMetrics.preemptMap(id);
      } else {
        fairSchedulerMetrics.preemptReduce(id);
      }
    }
  }
protected double fairTasks(JobInfo info, TaskType type) {
if (info == null) return 0;
return (type == TaskType.MAP) ? info.mapFairShare : info.reduceFairShare;
}
protected double fairTasks(JobInProgress job, TaskType type) {
JobInfo info = infos.get(job);
return fairTasks(info, type);
}
  @Override
  public int getMaxSlots(TaskTrackerStatus status, TaskType type) {
    return loadMgr.getMaxSlots(status, type);
  }
@Override
public int getFSMaxSlots(String trackerName, TaskType type) {
return loadMgr.getFSMaxSlots(trackerName, type);
}
@Override
public void setFSMaxSlots(String trackerName, TaskType type, int slots)
throws IOException {
loadMgr.setFSMaxSlots(trackerName, type, slots);
}
@Override
public void resetFSMaxSlots() throws IOException {
loadMgr.resetFSMaxSlots();
}
@Override
public TaskTrackerStatus[] getTaskTrackerStatus() throws IOException {
Collection<TaskTrackerStatus> tts = taskTrackerManager.taskTrackers();
return tts.toArray(new TaskTrackerStatus[tts.size()]);
}
@Override
public synchronized int getRunnableTasks(TaskType type)
throws IOException {
int runnableTasks = 0;
for (JobInfo info : infos.values()) {
runnableTasks += runnableTasks(info, type);
}
return runnableTasks;
}
@Override
public long getProtocolVersion(String arg0, long arg1) throws IOException {
return versionID;
}
@Override
public ProtocolSignature getProtocolSignature(String protocol,
long clientVersion, int clientMethodsHash) throws IOException {
return ProtocolSignature.getProtocolSignature(
this, protocol, clientVersion, clientMethodsHash);
}
@Override
public int[] getPoolRunningTasks(String pool) throws IOException {
    int[] result = new int[2];
result[0] = poolMgr.getRunningTasks(pool, TaskType.MAP);
result[1] = poolMgr.getRunningTasks(pool, TaskType.REDUCE);
return result;
}
@Override
public int[] getPoolMaxTasks(String pool) throws IOException {
    int[] result = new int[2];
result[0] = poolMgr.getMaxSlots(pool, TaskType.MAP);
result[1] = poolMgr.getMaxSlots(pool, TaskType.REDUCE);
return result;
}
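  // The RPC getters above can be reached through Hadoop IPC. A minimal
  // client sketch, assuming FairSchedulerProtocol extends VersionedProtocol
  // and exposes a versionID constant (the JobTracker host/port and pool
  // name below are made-up illustrations, not values from this file):
  //
  //   Configuration conf = new Configuration();
  //   InetSocketAddress addr = NetUtils.createSocketAddr("jt-host:9999");
  //   FairSchedulerProtocol scheduler = (FairSchedulerProtocol) RPC.getProxy(
  //       FairSchedulerProtocol.class, FairSchedulerProtocol.versionID,
  //       addr, conf);
  //   int[] running = scheduler.getPoolRunningTasks("prod"); // {maps, reduces}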
}