/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;

/**
 * A {@link TaskScheduler} that implements the requirements in HADOOP-3421
 * and provides a HOD-less way to share large clusters. This scheduler
 * provides the following features:
 *  * Support for queues, where a job is submitted to a queue.
 *  * Queues are assigned a fraction of the capacity of the grid (their
 *    'capacity') in the sense that a certain capacity of resources
 *    will be at their disposal. All jobs submitted to the queues of an Org
 *    will have access to the capacity of the Org.
 *  * Free resources can be allocated to any queue beyond its
 *    capacity.
 *  * Queues optionally support job priorities (disabled by default).
 *  * Within a queue, jobs with higher priority will have access to the
 *    queue's resources before jobs with lower priority. However, once a job
 *    is running, it will not be preempted for a higher priority job.
 *  * In order to prevent one or more users from monopolizing a queue's
 *    resources, each queue enforces a limit on the percentage of resources
 *    allocated to a user at any given time, if there is competition for them.
 */
class CapacityTaskScheduler extends TaskScheduler {

  /***********************************************************************
   * Keeping track of scheduling information for queues
   *
   * We need to maintain scheduling information relevant to a queue (its
   * name, capacity, etc), along with information specific to
   * each kind of task, Map or Reduce (num of running tasks, pending
   * tasks etc).
   *
   * This scheduling information is used to decide how to allocate
   * tasks, redistribute capacity, etc.
   *
   * A QueueSchedulingInfo (QSI) object represents scheduling information for
   * a queue. A TaskSchedulingInfo (TSI) object represents scheduling
   * information for a particular kind of task (Map or Reduce).
   ***********************************************************************/
  private static class TaskSchedulingInfo {
    /**
     * the actual capacity, which depends on how many slots are available
     * in the cluster at any given time.
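     * (Illustrative example, not taken from any config: a queue whose
     * capacityPercent is 25 on a cluster that currently has 200 map slots
     * gets a map capacity of 50 slots; the value is recomputed in
     * updateQSIObjects() whenever the cluster's slot count changes.)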
     */
    private int capacity = 0;

    // number of running tasks
    int numRunningTasks = 0;
    // number of slots occupied by running tasks
    int numSlotsOccupied = 0;

    // the actual maximum capacity, which depends on how many slots are
    // available in the cluster at any given time.
    private int maxCapacity = -1;

    /**
     * for each user, we need to keep track of number of slots occupied by
     * running tasks
     */
    Map<String, Integer> numSlotsOccupiedByUser =
      new HashMap<String, Integer>();

    /**
     * reset the variables associated with tasks
     */
    void resetTaskVars() {
      numRunningTasks = 0;
      numSlotsOccupied = 0;
      for (String s : numSlotsOccupiedByUser.keySet()) {
        numSlotsOccupiedByUser.put(s, Integer.valueOf(0));
      }
    }

    /**
     * Returns the actual capacity.
     *
     * @return the actual capacity
     */
    int getCapacity() {
      return capacity;
    }

    /**
     * Mutator method for capacity
     *
     * @param capacity
     */
    void setCapacity(int capacity) {
      this.capacity = capacity;
    }

    /**
     * return information about the tasks
     */
    @Override
    public String toString() {
      float occupiedSlotsAsPercent = getCapacity() != 0 ?
        ((float) numSlotsOccupied * 100 / getCapacity()) : 0;
      StringBuffer sb = new StringBuffer();
      sb.append("Capacity: " + capacity + " slots\n");
      if (getMaxCapacity() >= 0) {
        sb.append("Maximum capacity: " + getMaxCapacity() + " slots\n");
      }
      sb.append(String.format("Used capacity: %d (%.1f%% of Capacity)\n",
          Integer.valueOf(numSlotsOccupied), Float
              .valueOf(occupiedSlotsAsPercent)));
      sb.append(String.format("Running tasks: %d\n", Integer
          .valueOf(numRunningTasks)));
      // include info on active users
      if (numSlotsOccupied != 0) {
        sb.append("Active users:\n");
        for (Map.Entry<String, Integer> entry : numSlotsOccupiedByUser
            .entrySet()) {
          if ((entry.getValue() == null) ||
              (entry.getValue().intValue() <= 0)) {
            // user has no tasks running
            continue;
          }
          sb.append("User '" + entry.getKey() + "': ");
          int numSlotsOccupiedByThisUser = entry.getValue().intValue();
          float p =
            (float) numSlotsOccupiedByThisUser * 100 / numSlotsOccupied;
          sb.append(String.format("%d (%.1f%% of used capacity)\n", Long
              .valueOf(numSlotsOccupiedByThisUser), Float.valueOf(p)));
        }
      }
      return sb.toString();
    }

    int getMaxCapacity() {
      return maxCapacity;
    }

    void setMaxCapacity(int maxCapacity) {
      this.maxCapacity = maxCapacity;
    }
  }

  private static class QueueSchedulingInfo {
    String queueName;

    /**
     * capacity(%) is set in the config
     */
    float capacityPercent = 0;

    /**
     * maxCapacityPercent(%) is set in config as
     * mapred.capacity-scheduler.queue.<queue-name>.maximum-capacity
     * maximum-capacity percent defines a limit beyond which a queue
     * cannot expand. Remember this limit is dynamic and changes w.r.t
     * cluster size.
     */
    float maxCapacityPercent = -1;

    /**
     * to handle user limits, we need to know how many users have jobs in
     * the queue.
     */
    Map<String, Integer> numJobsByUser = new HashMap<String, Integer>();

    /**
     * min value of user limit (same for all users)
     */
    int ulMin;

    /**
     * We keep track of the JobQueuesManager only for reporting purposes
     * (in toString()).
*/ private JobQueuesManager jobQueuesManager; /** * We keep a TaskSchedulingInfo object for each kind of task we support */ TaskSchedulingInfo mapTSI; TaskSchedulingInfo reduceTSI; public QueueSchedulingInfo( String queueName, float capacityPercent, float maxCapacityPercent, int ulMin, JobQueuesManager jobQueuesManager ) { this.queueName = new String(queueName); this.capacityPercent = capacityPercent; this.maxCapacityPercent = maxCapacityPercent; this.ulMin = ulMin; this.jobQueuesManager = jobQueuesManager; this.mapTSI = new TaskSchedulingInfo(); this.reduceTSI = new TaskSchedulingInfo(); } /** * return information about the queue * * @return a String representing the information about the queue. */ @Override public String toString(){ // We print out the queue information first, followed by info // on map and reduce tasks and job info StringBuffer sb = new StringBuffer(); sb.append("Queue configuration\n"); sb.append("Capacity Percentage: "); sb.append(capacityPercent); sb.append("%\n"); sb.append(String.format("User Limit: %d%s\n",ulMin, "%")); sb.append(String.format("Priority Supported: %s\n", (jobQueuesManager.doesQueueSupportPriorities(queueName))? "YES":"NO")); sb.append("-------------\n"); sb.append("Map tasks\n"); sb.append(mapTSI.toString()); sb.append("-------------\n"); sb.append("Reduce tasks\n"); sb.append(reduceTSI.toString()); sb.append("-------------\n"); sb.append("Job info\n"); sb.append(String.format("Number of Waiting Jobs: %d\n", jobQueuesManager.getWaitingJobCount(queueName))); sb.append(String.format("Number of users who have submitted jobs: %d\n", numJobsByUser.size())); return sb.toString(); } } /** quick way to get qsi object given a queue name */ private Map<String, QueueSchedulingInfo> queueInfoMap = new HashMap<String, QueueSchedulingInfo>(); /** * This class captures scheduling information we want to display or log. */ private static class SchedulingDisplayInfo { private String queueName; CapacityTaskScheduler scheduler; SchedulingDisplayInfo(String queueName, CapacityTaskScheduler scheduler) { this.queueName = queueName; this.scheduler = scheduler; } @Override public String toString(){ // note that we do not call updateQSIObjects() here for performance // reasons. This means that the data we print out may be slightly // stale. This data is updated whenever assignTasks() is called // If this doesn't happen, the data gets stale. If we see // this often, we may need to detect this situation and call // updateQSIObjects(), or just call it each time. return scheduler.getDisplayInfo(queueName); } } // this class encapsulates the result of a task lookup private static class TaskLookupResult { static enum LookUpStatus { TASK_FOUND, NO_TASK_FOUND, TASK_FAILING_MEMORY_REQUIREMENT, } // constant TaskLookupResult objects. Should not be accessed directly. private static final TaskLookupResult NoTaskLookupResult = new TaskLookupResult(null, TaskLookupResult.LookUpStatus.NO_TASK_FOUND); private static final TaskLookupResult MemFailedLookupResult = new TaskLookupResult(null, TaskLookupResult.LookUpStatus.TASK_FAILING_MEMORY_REQUIREMENT); private LookUpStatus lookUpStatus; private Task task; // should not call this constructor directly. use static factory methods. 
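    // Illustrative usage (sketch only): results are obtained through the
    // static factories below rather than this constructor, e.g.
    //   Task t = obtainNewTask(taskTrackerStatus, job);   // may return null
    //   TaskLookupResult r = (t != null)
    //       ? TaskLookupResult.getTaskFoundResult(t)
    //       : TaskLookupResult.getNoTaskFoundResult();
    //   if (r.getLookUpStatus() == LookUpStatus.TASK_FOUND) {
    //     // hand r.getTask() back to the tasktracker
    //   }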
private TaskLookupResult(Task t, LookUpStatus lUStatus) { this.task = t; this.lookUpStatus = lUStatus; } static TaskLookupResult getTaskFoundResult(Task t) { return new TaskLookupResult(t, LookUpStatus.TASK_FOUND); } static TaskLookupResult getNoTaskFoundResult() { return NoTaskLookupResult; } static TaskLookupResult getMemFailedResult() { return MemFailedLookupResult; } Task getTask() { return task; } LookUpStatus getLookUpStatus() { return lookUpStatus; } } /** * This class handles the scheduling algorithms. * The algos are the same for both Map and Reduce tasks. * There may be slight variations later, in which case we can make this * an abstract base class and have derived classes for Map and Reduce. */ private static abstract class TaskSchedulingMgr { /** our TaskScheduler object */ protected CapacityTaskScheduler scheduler; protected TaskType type = null; abstract Task obtainNewTask(TaskTrackerStatus taskTracker, JobInProgress job) throws IOException; int getSlotsOccupied(JobInProgress job) { return (getNumReservedTaskTrackers(job) + getRunningTasks(job)) * getSlotsPerTask(job); } abstract int getClusterCapacity(); abstract int getSlotsPerTask(JobInProgress job); abstract int getRunningTasks(JobInProgress job); abstract int getPendingTasks(JobInProgress job); abstract TaskSchedulingInfo getTSI(QueueSchedulingInfo qsi); abstract int getNumReservedTaskTrackers(JobInProgress job); /** * To check if job has a speculative task on the particular tracker. * * @param job job to check for speculative tasks. * @param tts task tracker on which speculative task would run. * @return true if there is a speculative task to run on the tracker. */ abstract boolean hasSpeculativeTask(JobInProgress job, TaskTrackerStatus tts); /** * Check if the given job has sufficient reserved tasktrackers for all its * pending tasks. * * @param job job to check for sufficient reserved tasktrackers * @return <code>true</code> if the job has reserved tasktrackers, * else <code>false</code> */ boolean hasSufficientReservedTaskTrackers(JobInProgress job) { return getNumReservedTaskTrackers(job) >= getPendingTasks(job); } /** * List of QSIs for assigning tasks. * Queues are ordered by a ratio of (# of running tasks)/capacity, which * indicates how much 'free space' the queue has, or how much it is over * capacity. This ordered list is iterated over, when assigning tasks. */ private List<QueueSchedulingInfo> qsiForAssigningTasks = new ArrayList<QueueSchedulingInfo>(); /** * Comparator to sort queues. * For maps, we need to sort on QueueSchedulingInfo.mapTSI. For * reducers, we use reduceTSI. So we'll need separate comparators. */ private static abstract class QueueComparator implements Comparator<QueueSchedulingInfo> { abstract TaskSchedulingInfo getTSI(QueueSchedulingInfo qsi); public int compare(QueueSchedulingInfo q1, QueueSchedulingInfo q2) { TaskSchedulingInfo t1 = getTSI(q1); TaskSchedulingInfo t2 = getTSI(q2); // look at how much capacity they've filled. Treat a queue with // capacity=0 equivalent to a queue running at capacity double r1 = (0 == t1.getCapacity())? 1.0f: (double)t1.numSlotsOccupied/(double) t1.getCapacity(); double r2 = (0 == t2.getCapacity())? 
1.0f: (double)t2.numSlotsOccupied/(double) t2.getCapacity(); if (r1<r2) return -1; else if (r1>r2) return 1; else return 0; } } // subclass for map and reduce comparators private static final class MapQueueComparator extends QueueComparator { TaskSchedulingInfo getTSI(QueueSchedulingInfo qsi) { return qsi.mapTSI; } } private static final class ReduceQueueComparator extends QueueComparator { TaskSchedulingInfo getTSI(QueueSchedulingInfo qsi) { return qsi.reduceTSI; } } // these are our comparator instances protected final static MapQueueComparator mapComparator = new MapQueueComparator(); protected final static ReduceQueueComparator reduceComparator = new ReduceQueueComparator(); // and this is the comparator to use protected QueueComparator queueComparator; // Returns queues sorted according to the QueueComparator. // Mainly for testing purposes. String[] getOrderedQueues() { List<String> queues = new ArrayList<String>(qsiForAssigningTasks.size()); for (QueueSchedulingInfo qsi : qsiForAssigningTasks) { queues.add(qsi.queueName); } return queues.toArray(new String[queues.size()]); } TaskSchedulingMgr(CapacityTaskScheduler sched) { scheduler = sched; } // let the scheduling mgr know which queues are in the system void initialize(Map<String, QueueSchedulingInfo> qsiMap) { // add all the qsi objects to our list and sort qsiForAssigningTasks.addAll(qsiMap.values()); Collections.sort(qsiForAssigningTasks, queueComparator); } private synchronized void updateCollectionOfQSIs() { Collections.sort(qsiForAssigningTasks, queueComparator); } private boolean isUserOverLimit(JobInProgress j, QueueSchedulingInfo qsi) { // what is our current capacity? It is equal to the queue-capacity if // we're running below capacity. If we're running over capacity, then its // #running plus slotPerTask of the job (which is the number of extra // slots we're getting). int currentCapacity; TaskSchedulingInfo tsi = getTSI(qsi); if (tsi.numSlotsOccupied < tsi.getCapacity()) { currentCapacity = tsi.getCapacity(); } else { currentCapacity = tsi.numSlotsOccupied + getSlotsPerTask(j); } int limit = Math.max((int)(Math.ceil((double)currentCapacity/ (double)qsi.numJobsByUser.size())), (int)(Math.ceil((double)(qsi.ulMin*currentCapacity)/100.0))); String user = j.getProfile().getUser(); if (tsi.numSlotsOccupiedByUser.get(user) >= limit) { LOG.debug("User " + user + " is over limit, num slots occupied = " + tsi.numSlotsOccupiedByUser.get(user) + ", limit = " + limit); return true; } else { return false; } } /* * This is the central scheduling method. * It tries to get a task from jobs in a single queue. * Always return a TaskLookupResult object. Don't return null. */ private TaskLookupResult getTaskFromQueue(TaskTracker taskTracker, QueueSchedulingInfo qsi) throws IOException { TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus(); // we only look at jobs in the running queues, as these are the ones // who have been potentially initialized for (JobInProgress j : scheduler.jobQueuesManager.getRunningJobQueue(qsi.queueName)) { // only look at jobs that can be run. We ignore jobs that haven't // initialized, or have completed but haven't been removed from the // running queue. if (j.getStatus().getRunState() != JobStatus.RUNNING) { continue; } //Check if queue is over maximum-capacity if(this.areTasksInQueueOverMaxCapacity(qsi,j.getNumSlotsPerTask(type))) { continue; } // check if the job's user is over limit if (isUserOverLimit(j, qsi)) { continue; } //If this job meets memory requirements. 
Ask the JobInProgress for //a task to be scheduled on the task tracker. //if we find a job then we pass it on. if (scheduler.memoryMatcher.matchesMemoryRequirements(j, type, taskTrackerStatus)) { // We found a suitable job. Get task from it. Task t = obtainNewTask(taskTrackerStatus, j); //if there is a task return it immediately. if (t != null) { // we're successful in getting a task return TaskLookupResult.getTaskFoundResult(t); } else { //skip to the next job in the queue. LOG.debug("Job " + j.getJobID().toString() + " returned no tasks of type " + type); continue; } } else { // if memory requirements don't match then we check if the job has // pending tasks and has insufficient number of 'reserved' // tasktrackers to cover all pending tasks. If so we reserve the // current tasktracker for this job so that high memory jobs are not // starved if ((getPendingTasks(j) != 0 && !hasSufficientReservedTaskTrackers(j))) { // Reserve all available slots on this tasktracker LOG.info(j.getJobID() + ": Reserving " + taskTracker.getTrackerName() + " since memory-requirements don't match"); taskTracker.reserveSlots(type, j, taskTracker .getAvailableSlots(type)); // Block return TaskLookupResult.getMemFailedResult(); } }//end of memory check block // if we're here, this job has no task to run. Look at the next job. }//end of for loop // if we're here, we haven't found any task to run among all jobs in // the queue. This could be because there is nothing to run, or that // the user limit for some user is too strict, i.e., there's at least // one user who doesn't have enough tasks to satisfy his limit. If // it's the latter case, re-look at jobs without considering user // limits, and get a task from the first eligible job; however // we do not 'reserve' slots on tasktrackers anymore since the user is // already over the limit // Note: some of the code from above is repeated here. This is on // purpose as it improves overall readability. // Note: we walk through jobs again. Some of these jobs, which weren't // considered in the first pass, shouldn't be considered here again, // but we still check for their viability to keep the code simple. In // some cases, for high mem jobs that have nothing to run, we call // obtainNewTask() unnecessarily. Should this be a problem, we can // create a list of jobs to look at (those whose users were over // limit) in the first pass and walk through that list only. for (JobInProgress j : scheduler.jobQueuesManager.getRunningJobQueue(qsi.queueName)) { if (j.getStatus().getRunState() != JobStatus.RUNNING) { continue; } //Check if queue is over maximum-capacity if (this.areTasksInQueueOverMaxCapacity( qsi, j.getNumSlotsPerTask(type))) { continue; } if (scheduler.memoryMatcher.matchesMemoryRequirements(j, type, taskTrackerStatus)) { // We found a suitable job. Get task from it. Task t = obtainNewTask(taskTrackerStatus, j); //if there is a task return it immediately. if (t != null) { // we're successful in getting a task return TaskLookupResult.getTaskFoundResult(t); } else { //skip to the next job in the queue. continue; } } else { //if memory requirements don't match then we check if the //job has either pending or speculative task. 
          // If the job has pending or speculative tasks, we block (return a
          // memory-failed result) until this job's tasks get scheduled, so
          // that high memory jobs are not starved.
          if (getPendingTasks(j) != 0 ||
              hasSpeculativeTask(j, taskTrackerStatus)) {
            return TaskLookupResult.getMemFailedResult();
          }
        } // end of memory check block
      } // end of for loop

      // found nothing for this queue, look at the next one.
      String msg = "Found no task from the queue " + qsi.queueName;
      LOG.debug(msg);
      return TaskLookupResult.getNoTaskFoundResult();
    }

    // Always return a TaskLookupResult object. Don't return null.
    // The caller is responsible for ensuring that the QSI objects and the
    // collections are up-to-date.
    private TaskLookupResult assignTasks(TaskTracker taskTracker)
        throws IOException {
      TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();
      printQSIs();

      // Check if this tasktracker has been reserved for a job...
      JobInProgress job = taskTracker.getJobForFallowSlot(type);
      if (job != null) {
        int availableSlots = taskTracker.getAvailableSlots(type);
        if (LOG.isDebugEnabled()) {
          LOG.debug(job.getJobID() + ": Checking 'reserved' tasktracker " +
                    taskTracker.getTrackerName() + " with " + availableSlots +
                    " '" + type + "' slots");
        }

        if (availableSlots >= job.getNumSlotsPerTask(type)) {
          // Unreserve
          taskTracker.unreserveSlots(type, job);

          // We found a suitable job. Get task from it.
          Task t = obtainNewTask(taskTrackerStatus, job);
          // if there is a task return it immediately.
          if (t != null) {
            if (LOG.isDebugEnabled()) {
              LOG.info(job.getJobID() + ": Got " + t.getTaskID() +
                       " for reserved tasktracker " +
                       taskTracker.getTrackerName());
            }
            // we're successful in getting a task
            return TaskLookupResult.getTaskFoundResult(t);
          }
        } else {
          // Re-reserve the current tasktracker
          taskTracker.reserveSlots(type, job, availableSlots);
          if (LOG.isDebugEnabled()) {
            LOG.debug(job.getJobID() + ": Re-reserving " +
                      taskTracker.getTrackerName());
          }
          return TaskLookupResult.getMemFailedResult();
        }
      }

      for (QueueSchedulingInfo qsi : qsiForAssigningTasks) {
        // we may have queues with capacity=0. We shouldn't look at jobs from
        // these queues
        if (0 == getTSI(qsi).getCapacity()) {
          continue;
        }

        // This call is an optimization: if we are already over the
        // maximum-capacity we avoid traversing the queues.
        if (this.areTasksInQueueOverMaxCapacity(qsi, 1)) {
          continue;
        }

        TaskLookupResult tlr = getTaskFromQueue(taskTracker, qsi);
        TaskLookupResult.LookUpStatus lookUpStatus = tlr.getLookUpStatus();

        if (lookUpStatus == TaskLookupResult.LookUpStatus.NO_TASK_FOUND) {
          continue; // Look in other queues.
        }

        // if we find a task, return
        if (lookUpStatus == TaskLookupResult.LookUpStatus.TASK_FOUND) {
          return tlr;
        }
        // if there was a memory mismatch, return
        else if (lookUpStatus ==
            TaskLookupResult.LookUpStatus.TASK_FAILING_MEMORY_REQUIREMENT) {
          return tlr;
        }
      }

      // nothing to give
      return TaskLookupResult.getNoTaskFoundResult();
    }

    /**
     * Check if maximum-capacity is set for this queue and, if it is,
     * whether numSlotsOccupied + numSlotsPerTask would exceed it; if so,
     * this queue is over its limit.
     *
     * In case numSlotsOccupied is less than maximum-capacity but
     * numSlotsOccupied + numSlotsPerTask is more than maximum-capacity, we
     * still don't assign the task. This may lead to under-utilization of a
     * very small set of slots, but that is acceptable because we strictly
     * respect the maximum-capacity limit. For example, with a
     * maximum-capacity of 10 slots, 9 slots occupied and a task that needs
     * 2 slots, the task is not assigned even though one slot is free.
     *
     * @param qsi
     * @param numSlotsPerTask
     * @return true if queue is over limit.
     */
    private boolean areTasksInQueueOverMaxCapacity(
        QueueSchedulingInfo qsi, int numSlotsPerTask) {
      TaskSchedulingInfo tsi = getTSI(qsi);
      if (tsi.getMaxCapacity() >= 0) {
        if ((tsi.numSlotsOccupied + numSlotsPerTask) > tsi.getMaxCapacity()) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
              "Queue " + qsi.queueName + " " + "has reached its max " + type +
                "Capacity");
            LOG.debug("Current capacity = " + tsi.getCapacity() +
                ", occupied slots = " + tsi.numSlotsOccupied);
          }
          return true;
        }
      }
      return false;
    }

    // for debugging.
    private void printQSIs() {
      if (LOG.isDebugEnabled()) {
        StringBuffer s = new StringBuffer();
        for (QueueSchedulingInfo qsi : qsiForAssigningTasks) {
          TaskSchedulingInfo tsi = getTSI(qsi);
          Collection<JobInProgress> runJobs =
            scheduler.jobQueuesManager.getRunningJobQueue(qsi.queueName);
          s.append(
            String.format(
              " Queue '%s'(%s): runningTasks=%d, "
                + "occupiedSlots=%d, capacity=%d, runJobs=%d maxCapacity=%d ",
              qsi.queueName, this.type, Integer.valueOf(tsi.numRunningTasks),
              Integer.valueOf(tsi.numSlotsOccupied),
              Integer.valueOf(tsi.getCapacity()),
              Integer.valueOf(runJobs.size()),
              Integer.valueOf(tsi.getMaxCapacity())));
        }
        LOG.debug(s);
      }
    }

    /**
     * Check if any of the given tasks has a speculative attempt to execute
     * on the particular task tracker.
     *
     * @param tips tasks of a job
     * @param progress percentage progress of the job
     * @param tts task tracker status for which we are asking speculative tip
     * @return true if the job has a speculative task to run on the particular TT.
     */
    boolean hasSpeculativeTask(TaskInProgress[] tips, float progress,
        TaskTrackerStatus tts) {
      long currentTime = System.currentTimeMillis();
      for (TaskInProgress tip : tips) {
        if (tip.isRunning()
            && !(tip.hasRunOnMachine(tts.getHost(), tts.getTrackerName()))
            && tip.canBeSpeculated(currentTime)) {
          return true;
        }
      }
      return false;
    }
  }

  /**
   * The scheduling algorithms for map tasks.
   */
  private static class MapSchedulingMgr extends TaskSchedulingMgr {

    MapSchedulingMgr(CapacityTaskScheduler scheduler) {
      super(scheduler);
      type = TaskType.MAP;
      queueComparator = mapComparator;
    }

    @Override
    Task obtainNewTask(TaskTrackerStatus taskTracker, JobInProgress job)
        throws IOException {
      ClusterStatus clusterStatus =
        scheduler.taskTrackerManager.getClusterStatus();
      int numTaskTrackers = clusterStatus.getTaskTrackers();
      return job.obtainNewMapTask(taskTracker, numTaskTrackers,
          scheduler.taskTrackerManager.getNumberOfUniqueHosts());
    }

    @Override
    int getClusterCapacity() {
      return scheduler.taskTrackerManager.getClusterStatus().getMaxMapTasks();
    }

    @Override
    int getRunningTasks(JobInProgress job) {
      return job.runningMaps();
    }

    @Override
    int getPendingTasks(JobInProgress job) {
      return job.pendingMaps();
    }

    @Override
    int getSlotsPerTask(JobInProgress job) {
      return
        job.getJobConf().computeNumSlotsPerMap(scheduler.getMemSizeForMapSlot());
    }

    @Override
    TaskSchedulingInfo getTSI(QueueSchedulingInfo qsi) {
      return qsi.mapTSI;
    }

    int getNumReservedTaskTrackers(JobInProgress job) {
      return job.getNumReservedTaskTrackersForMaps();
    }

    @Override
    boolean hasSpeculativeTask(JobInProgress job, TaskTrackerStatus tts) {
      // Check if the job supports speculative map execution first, then
      // check if the job has speculative maps.
      return (job.getJobConf().getMapSpeculativeExecution()) &&
          (hasSpeculativeTask(job.getTasks(TaskType.MAP),
              job.getStatus().mapProgress(), tts));
    }
  }

  /**
   * The scheduling algorithms for reduce tasks.
*/ private static class ReduceSchedulingMgr extends TaskSchedulingMgr { ReduceSchedulingMgr(CapacityTaskScheduler schedulr) { super(schedulr); type = TaskType.REDUCE; queueComparator = reduceComparator; } @Override Task obtainNewTask(TaskTrackerStatus taskTracker, JobInProgress job) throws IOException { ClusterStatus clusterStatus = scheduler.taskTrackerManager.getClusterStatus(); int numTaskTrackers = clusterStatus.getTaskTrackers(); return job.obtainNewReduceTask(taskTracker, numTaskTrackers, scheduler.taskTrackerManager.getNumberOfUniqueHosts()); } @Override int getClusterCapacity() { return scheduler.taskTrackerManager.getClusterStatus() .getMaxReduceTasks(); } @Override int getRunningTasks(JobInProgress job) { return job.runningReduces(); } @Override int getPendingTasks(JobInProgress job) { return job.pendingReduces(); } @Override int getSlotsPerTask(JobInProgress job) { return job.getJobConf().computeNumSlotsPerReduce(scheduler.getMemSizeForReduceSlot()); } @Override TaskSchedulingInfo getTSI(QueueSchedulingInfo qsi) { return qsi.reduceTSI; } int getNumReservedTaskTrackers(JobInProgress job) { return job.getNumReservedTaskTrackersForReduces(); } @Override boolean hasSpeculativeTask(JobInProgress job, TaskTrackerStatus tts) { //check if the job supports reduce speculative execution first then //check if the job has speculative tasks. return (job.getJobConf().getReduceSpeculativeExecution()) && ( hasSpeculativeTask(job.getTasks(TaskType.REDUCE), job.getStatus().reduceProgress(), tts)); } } /** the scheduling mgrs for Map and Reduce tasks */ protected TaskSchedulingMgr mapScheduler = new MapSchedulingMgr(this); protected TaskSchedulingMgr reduceScheduler = new ReduceSchedulingMgr(this); MemoryMatcher memoryMatcher = new MemoryMatcher(this); /** we keep track of the number of map/reduce slots we saw last */ private int prevMapClusterCapacity = 0; private int prevReduceClusterCapacity = 0; static final Log LOG = LogFactory.getLog(CapacityTaskScheduler.class); protected JobQueuesManager jobQueuesManager; protected CapacitySchedulerConf schedConf; /** whether scheduler has started or not */ private boolean started = false; final static String JOB_SCHEDULING_INFO_FORMAT_STRING = "%s running map tasks using %d map slots. %d additional slots reserved." + " %s running reduce tasks using %d reduce slots." + " %d additional slots reserved."; /** * A clock class - can be mocked out for testing. 
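   * For example (illustrative only), a test in this package could drive the
   * scheduler with a fixed clock:
   * {@code new CapacityTaskScheduler(new Clock() { long getTime() { return 0L; } })}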
*/ static class Clock { long getTime() { return System.currentTimeMillis(); } } private Clock clock; private JobInitializationPoller initializationPoller; private long memSizeForMapSlotOnJT; private long memSizeForReduceSlotOnJT; private long limitMaxMemForMapTasks; private long limitMaxMemForReduceTasks; private boolean assignMultipleTasks = true; public CapacityTaskScheduler() { this(new Clock()); } // for testing public CapacityTaskScheduler(Clock clock) { this.jobQueuesManager = new JobQueuesManager(this); this.clock = clock; } /** mostly for testing purposes */ public void setResourceManagerConf(CapacitySchedulerConf conf) { this.schedConf = conf; } private void initializeMemoryRelatedConf() { //handling @deprecated if (conf.get( CapacitySchedulerConf.DEFAULT_PERCENTAGE_OF_PMEM_IN_VMEM_PROPERTY) != null) { LOG.warn( JobConf.deprecatedString( CapacitySchedulerConf.DEFAULT_PERCENTAGE_OF_PMEM_IN_VMEM_PROPERTY)); } //handling @deprecated if (conf.get(CapacitySchedulerConf.UPPER_LIMIT_ON_TASK_PMEM_PROPERTY) != null) { LOG.warn( JobConf.deprecatedString( CapacitySchedulerConf.UPPER_LIMIT_ON_TASK_PMEM_PROPERTY)); } if (conf.get(JobConf.MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY) != null) { LOG.warn( JobConf.deprecatedString( JobConf.MAPRED_TASK_DEFAULT_MAXVMEM_PROPERTY)); } memSizeForMapSlotOnJT = JobConf.normalizeMemoryConfigValue(conf.getLong( JobTracker.MAPRED_CLUSTER_MAP_MEMORY_MB_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT)); memSizeForReduceSlotOnJT = JobConf.normalizeMemoryConfigValue(conf.getLong( JobTracker.MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT)); //handling @deprecated values if (conf.get(JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY) != null) { LOG.warn( JobConf.deprecatedString( JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY)+ " instead use " +JobTracker.MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY+ " and " + JobTracker.MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY ); limitMaxMemForMapTasks = limitMaxMemForReduceTasks = JobConf.normalizeMemoryConfigValue( conf.getLong( JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT)); if (limitMaxMemForMapTasks != JobConf.DISABLED_MEMORY_LIMIT && limitMaxMemForMapTasks >= 0) { limitMaxMemForMapTasks = limitMaxMemForReduceTasks = limitMaxMemForMapTasks / (1024 * 1024); //Converting old values in bytes to MB } } else { limitMaxMemForMapTasks = JobConf.normalizeMemoryConfigValue( conf.getLong( JobTracker.MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT)); limitMaxMemForReduceTasks = JobConf.normalizeMemoryConfigValue( conf.getLong( JobTracker.MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT)); } LOG.info(String.format("Scheduler configured with " + "(memSizeForMapSlotOnJT, memSizeForReduceSlotOnJT, " + "limitMaxMemForMapTasks, limitMaxMemForReduceTasks)" + " (%d,%d,%d,%d)", Long.valueOf(memSizeForMapSlotOnJT), Long .valueOf(memSizeForReduceSlotOnJT), Long .valueOf(limitMaxMemForMapTasks), Long .valueOf(limitMaxMemForReduceTasks))); } long getMemSizeForMapSlot() { return memSizeForMapSlotOnJT; } long getMemSizeForReduceSlot() { return memSizeForReduceSlotOnJT; } long getLimitMaxMemForMapSlot() { return limitMaxMemForMapTasks; } long getLimitMaxMemForReduceSlot() { return limitMaxMemForReduceTasks; } String[] getOrderedQueues(TaskType type) { if (type == TaskType.MAP) { return mapScheduler.getOrderedQueues(); } else if (type == TaskType.REDUCE) { return reduceScheduler.getOrderedQueues(); } return null; } @Override public synchronized void start() throws 
IOException { if (started) return; super.start(); // initialize our queues from the config settings if (null == schedConf) { schedConf = new CapacitySchedulerConf(); } initializeMemoryRelatedConf(); // read queue info from config file QueueManager queueManager = taskTrackerManager.getQueueManager(); Set<String> queues = queueManager.getQueues(); // Sanity check: there should be at least one queue. if (0 == queues.size()) { throw new IllegalStateException("System has no queue configured"); } Set<String> queuesWithoutConfiguredCapacity = new HashSet<String>(); float totalCapacityPercent = 0.0f; for (String queueName: queues) { float capacityPercent = schedConf.getCapacity(queueName); if (capacityPercent == -1.0) { queuesWithoutConfiguredCapacity.add(queueName); }else { totalCapacityPercent += capacityPercent; } float maxCapacityPercent = schedConf.getMaxCapacity(queueName); int ulMin = schedConf.getMinimumUserLimitPercent(queueName); // create our QSI and add to our hashmap QueueSchedulingInfo qsi = new QueueSchedulingInfo( queueName, capacityPercent, maxCapacityPercent ,ulMin, jobQueuesManager); queueInfoMap.put(queueName, qsi); // create the queues of job objects boolean supportsPrio = schedConf.isPrioritySupported(queueName); jobQueuesManager.createQueue(queueName, supportsPrio); SchedulingDisplayInfo schedulingInfo = new SchedulingDisplayInfo(queueName, this); queueManager.setSchedulerInfo(queueName, schedulingInfo); } float remainingQuantityToAllocate = 100 - totalCapacityPercent; float quantityToAllocate = remainingQuantityToAllocate/queuesWithoutConfiguredCapacity.size(); for(String queue: queuesWithoutConfiguredCapacity) { QueueSchedulingInfo qsi = queueInfoMap.get(queue); qsi.capacityPercent = quantityToAllocate; if(qsi.maxCapacityPercent >= 0) { if(qsi.capacityPercent > qsi.maxCapacityPercent) { throw new IllegalStateException( " Allocated capacity of " + qsi.capacityPercent + " to unconfigured queue " + qsi.queueName + " is greater than maximum Capacity " + qsi.maxCapacityPercent); } } schedConf.setCapacity(queue, quantityToAllocate); } if (totalCapacityPercent > 100.0) { throw new IllegalArgumentException( "Sum of queue capacities over 100% at " + totalCapacityPercent); } // let our mgr objects know about the queues mapScheduler.initialize(queueInfoMap); reduceScheduler.initialize(queueInfoMap); // listen to job changes taskTrackerManager.addJobInProgressListener(jobQueuesManager); //Start thread for initialization if (initializationPoller == null) { this.initializationPoller = new JobInitializationPoller( jobQueuesManager,schedConf,queues, taskTrackerManager); } initializationPoller.init(queueManager.getQueues(), schedConf); initializationPoller.setDaemon(true); initializationPoller.start(); started = true; LOG.info("Capacity scheduler initialized " + queues.size() + " queues"); } /** mostly for testing purposes */ void setInitializationPoller(JobInitializationPoller p) { this.initializationPoller = p; } @Override public synchronized void terminate() throws IOException { if (!started) return; if (jobQueuesManager != null) { taskTrackerManager.removeJobInProgressListener( jobQueuesManager); } started = false; initializationPoller.terminate(); super.terminate(); } @Override public synchronized void setConf(Configuration conf) { super.setConf(conf); } /** * provided for the test classes * lets you update the QSI objects and sorted collections */ void updateQSIInfoForTests() { ClusterStatus c = taskTrackerManager.getClusterStatus(); int mapClusterCapacity = c.getMaxMapTasks(); int 
reduceClusterCapacity = c.getMaxReduceTasks(); // update the QSI objects updateQSIObjects(mapClusterCapacity, reduceClusterCapacity); mapScheduler.updateCollectionOfQSIs(); reduceScheduler.updateCollectionOfQSIs(); } /** * Update individual QSI objects. * We don't need exact information for all variables, just enough for us * to make scheduling decisions. For example, we don't need an exact count * of numRunningTasks. Once we count upto the grid capacity, any * number beyond that will make no difference. */ private synchronized void updateQSIObjects( int mapClusterCapacity, int reduceClusterCapacity) { // if # of slots have changed since last time, update. // First, compute whether the total number of TT slots have changed for (QueueSchedulingInfo qsi: queueInfoMap.values()) { // compute new capacities, if TT slots have changed if (mapClusterCapacity != prevMapClusterCapacity) { qsi.mapTSI.setCapacity( (int) (qsi.capacityPercent*mapClusterCapacity/100)); //compute new max map capacities if(qsi.maxCapacityPercent > 0) { qsi.mapTSI.setMaxCapacity( (int) (qsi.maxCapacityPercent * mapClusterCapacity / 100)); } } if (reduceClusterCapacity != prevReduceClusterCapacity) { qsi.reduceTSI.setCapacity( (int) (qsi.capacityPercent*reduceClusterCapacity/100)); //compute new max reduce capacities if (qsi.maxCapacityPercent > 0) { qsi.reduceTSI.setMaxCapacity( (int) (qsi.maxCapacityPercent * reduceClusterCapacity / 100)); } } // reset running/pending tasks, tasks per user qsi.mapTSI.resetTaskVars(); qsi.reduceTSI.resetTaskVars(); // update stats on running jobs for (JobInProgress j: jobQueuesManager.getRunningJobQueue(qsi.queueName)) { if (j.getStatus().getRunState() != JobStatus.RUNNING) { continue; } int numMapsRunningForThisJob = mapScheduler.getRunningTasks(j); int numReducesRunningForThisJob = reduceScheduler.getRunningTasks(j); int numRunningMapSlots = numMapsRunningForThisJob * mapScheduler.getSlotsPerTask(j); int numRunningReduceSlots = numReducesRunningForThisJob * reduceScheduler.getSlotsPerTask(j); int numMapSlotsForThisJob = mapScheduler.getSlotsOccupied(j); int numReduceSlotsForThisJob = reduceScheduler.getSlotsOccupied(j); int numReservedMapSlotsForThisJob = (mapScheduler.getNumReservedTaskTrackers(j) * mapScheduler.getSlotsPerTask(j)); int numReservedReduceSlotsForThisJob = (reduceScheduler.getNumReservedTaskTrackers(j) * reduceScheduler.getSlotsPerTask(j)); j.setSchedulingInfo( String.format(JOB_SCHEDULING_INFO_FORMAT_STRING, Integer.valueOf(numMapsRunningForThisJob), Integer.valueOf(numRunningMapSlots), Integer.valueOf(numReservedMapSlotsForThisJob), Integer.valueOf(numReducesRunningForThisJob), Integer.valueOf(numRunningReduceSlots), Integer.valueOf(numReservedReduceSlotsForThisJob))); qsi.mapTSI.numRunningTasks += numMapsRunningForThisJob; qsi.reduceTSI.numRunningTasks += numReducesRunningForThisJob; qsi.mapTSI.numSlotsOccupied += numMapSlotsForThisJob; qsi.reduceTSI.numSlotsOccupied += numReduceSlotsForThisJob; Integer i = qsi.mapTSI.numSlotsOccupiedByUser.get(j.getProfile().getUser()); qsi.mapTSI.numSlotsOccupiedByUser.put(j.getProfile().getUser(), Integer.valueOf(i.intValue() + numMapSlotsForThisJob)); i = qsi.reduceTSI.numSlotsOccupiedByUser.get(j.getProfile().getUser()); qsi.reduceTSI.numSlotsOccupiedByUser.put(j.getProfile().getUser(), Integer.valueOf(i.intValue() + numReduceSlotsForThisJob)); if (LOG.isDebugEnabled()) { LOG.debug(String.format("updateQSI: job %s: run(m)=%d, " + "occupied(m)=%d, run(r)=%d, occupied(r)=%d, finished(m)=%d," + " finished(r)=%d, failed(m)=%d, 
failed(r)=%d, " + "spec(m)=%d, spec(r)=%d, total(m)=%d, total(r)=%d", j .getJobID().toString(), Integer .valueOf(numMapsRunningForThisJob), Integer .valueOf(numMapSlotsForThisJob), Integer .valueOf(numReducesRunningForThisJob), Integer .valueOf(numReduceSlotsForThisJob), Integer.valueOf(j .finishedMaps()), Integer.valueOf(j.finishedReduces()), Integer .valueOf(j.failedMapTasks), Integer.valueOf(j.failedReduceTasks), Integer .valueOf(j.speculativeMapTasks), Integer .valueOf(j.speculativeReduceTasks), Integer .valueOf(j.numMapTasks), Integer.valueOf(j.numReduceTasks))); } /* * it's fine walking down the entire list of running jobs - there * probably will not be many, plus, we may need to go through the * list to compute numSlotsOccupiedByUser. If this is expensive, we * can keep a list of running jobs per user. Then we only need to * consider the first few jobs per user. */ } } prevMapClusterCapacity = mapClusterCapacity; prevReduceClusterCapacity = reduceClusterCapacity; } /** * Sets whether the scheduler can assign multiple tasks in a heartbeat * or not. * * This method is used only for testing purposes. * * @param assignMultipleTasks true, to assign multiple tasks per heartbeat */ void setAssignMultipleTasks(boolean assignMultipleTasks) { this.assignMultipleTasks = assignMultipleTasks; } /* * The grand plan for assigning a task. * * If multiple task assignment is enabled, it tries to get one map and * one reduce slot depending on free slots on the TT. * * Otherwise, we decide whether a Map or Reduce task should be given to a TT * (if the TT can accept either). * Either way, we first pick a queue. We only look at queues that need * a slot. Among these, we first look at queues whose * (# of running tasks)/capacity is the least. * Next, pick a job in a queue. we pick the job at the front of the queue * unless its user is over the user limit. * Finally, given a job, pick a task from the job. * */ @Override public synchronized List<Task> assignTasks(TaskTracker taskTracker) throws IOException { TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus(); ClusterStatus c = taskTrackerManager.getClusterStatus(); int mapClusterCapacity = c.getMaxMapTasks(); int reduceClusterCapacity = c.getMaxReduceTasks(); int maxMapSlots = taskTrackerStatus.getMaxMapSlots(); int currentMapSlots = taskTrackerStatus.countOccupiedMapSlots(); int maxReduceSlots = taskTrackerStatus.getMaxReduceSlots(); int currentReduceSlots = taskTrackerStatus.countOccupiedReduceSlots(); LOG.debug("TT asking for task, max maps=" + taskTrackerStatus.getMaxMapSlots() + ", run maps=" + taskTrackerStatus.countMapTasks() + ", max reds=" + taskTrackerStatus.getMaxReduceSlots() + ", run reds=" + taskTrackerStatus.countReduceTasks() + ", map cap=" + mapClusterCapacity + ", red cap = " + reduceClusterCapacity); /* * update all our QSI objects. * This involves updating each qsi structure. This operation depends * on the number of running jobs in a queue, and some waiting jobs. If it * becomes expensive, do it once every few heartbeats only. */ updateQSIObjects(mapClusterCapacity, reduceClusterCapacity); List<Task> result = new ArrayList<Task>(); if (assignMultipleTasks) { addReduceTask(taskTracker, result, maxReduceSlots, currentReduceSlots); addMapTask(taskTracker, result, maxMapSlots, currentMapSlots); } else { /* * If TT has Map and Reduce slot free, we need to figure out whether to * give it a Map or Reduce task. * Number of ways to do this. 
For now, base decision on how much is needed * versus how much is used (default to Map, if equal). */ if ((maxReduceSlots - currentReduceSlots) > (maxMapSlots - currentMapSlots)) { addReduceTask(taskTracker, result, maxReduceSlots, currentReduceSlots); if (result.size() == 0) { addMapTask(taskTracker, result, maxMapSlots, currentMapSlots); } } else { addMapTask(taskTracker, result, maxMapSlots, currentMapSlots); if (result.size() == 0) { addReduceTask(taskTracker, result, maxReduceSlots, currentReduceSlots); } } if (result.size() == 0) { return null; } } return result; } // Pick a reduce task and add to the list of tasks, if there's space // on the TT to run one. private void addReduceTask(TaskTracker taskTracker, List<Task> tasks, int maxReduceSlots, int currentReduceSlots) throws IOException { if (maxReduceSlots > currentReduceSlots) { reduceScheduler.updateCollectionOfQSIs(); TaskLookupResult tlr = reduceScheduler.assignTasks(taskTracker); if (TaskLookupResult.LookUpStatus.TASK_FOUND == tlr.getLookUpStatus()) { tasks.add(tlr.getTask()); } } } // Pick a map task and add to the list of tasks, if there's space // on the TT to run one. private void addMapTask(TaskTracker taskTracker, List<Task> tasks, int maxMapSlots, int currentMapSlots) throws IOException { if (maxMapSlots > currentMapSlots) { mapScheduler.updateCollectionOfQSIs(); TaskLookupResult tlr = mapScheduler.assignTasks(taskTracker); if (TaskLookupResult.LookUpStatus.TASK_FOUND == tlr.getLookUpStatus()) { tasks.add(tlr.getTask()); } } } // called when a job is added synchronized void jobAdded(JobInProgress job) throws IOException { QueueSchedulingInfo qsi = queueInfoMap.get(job.getProfile().getQueueName()); // qsi shouldn't be null // update user-specific info Integer i = qsi.numJobsByUser.get(job.getProfile().getUser()); if (null == i) { i = 1; // set the count for running tasks to 0 qsi.mapTSI.numSlotsOccupiedByUser.put(job.getProfile().getUser(), Integer.valueOf(0)); qsi.reduceTSI.numSlotsOccupiedByUser.put(job.getProfile().getUser(), Integer.valueOf(0)); } else { i++; } qsi.numJobsByUser.put(job.getProfile().getUser(), i); // setup scheduler specific job information preInitializeJob(job); LOG.debug("Job " + job.getJobID().toString() + " is added under user " + job.getProfile().getUser() + ", user now has " + i + " jobs"); } /** * Setup {@link CapacityTaskScheduler} specific information prior to * job initialization. 
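   *
   * If memory-based scheduling is enabled, the number of slots a single map
   * or reduce task occupies is derived from the job's memory requirement via
   * JobConf.computeNumSlotsPerMap()/computeNumSlotsPerReduce(); illustratively,
   * a task asking for roughly twice the configured per-slot memory ends up
   * occupying two slots. Otherwise each task occupies one slot.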
   */
  void preInitializeJob(JobInProgress job) {
    JobConf jobConf = job.getJobConf();
    // Compute number of slots required to run a single map/reduce task
    int slotsPerMap = 1;
    int slotsPerReduce = 1;
    if (memoryMatcher.isSchedulingBasedOnMemEnabled()) {
      slotsPerMap = jobConf.computeNumSlotsPerMap(getMemSizeForMapSlot());
      slotsPerReduce =
        jobConf.computeNumSlotsPerReduce(getMemSizeForReduceSlot());
    }
    job.setNumSlotsPerMap(slotsPerMap);
    job.setNumSlotsPerReduce(slotsPerReduce);
  }

  // called when a job completes
  synchronized void jobCompleted(JobInProgress job) {
    QueueSchedulingInfo qsi =
      queueInfoMap.get(job.getProfile().getQueueName());
    // qsi shouldn't be null
    // update numJobsByUser
    LOG.debug("Job to be removed for user " + job.getProfile().getUser());
    Integer i = qsi.numJobsByUser.get(job.getProfile().getUser());
    i--;
    if (0 == i.intValue()) {
      qsi.numJobsByUser.remove(job.getProfile().getUser());
      // remove job footprint from our TSIs
      qsi.mapTSI.numSlotsOccupiedByUser.remove(job.getProfile().getUser());
      qsi.reduceTSI.numSlotsOccupiedByUser.remove(job.getProfile().getUser());
      LOG.debug("No more jobs for user, number of users = "
          + qsi.numJobsByUser.size());
    } else {
      qsi.numJobsByUser.put(job.getProfile().getUser(), i);
      LOG.debug("User still has " + i + " jobs, number of users = "
          + qsi.numJobsByUser.size());
    }
  }

  @Override
  public synchronized Collection<JobInProgress> getJobs(String queueName) {
    Collection<JobInProgress> jobCollection = new ArrayList<JobInProgress>();
    Collection<JobInProgress> runningJobs =
      jobQueuesManager.getRunningJobQueue(queueName);
    if (runningJobs != null) {
      jobCollection.addAll(runningJobs);
    }
    Collection<JobInProgress> waitingJobs =
      jobQueuesManager.getWaitingJobs(queueName);
    Collection<JobInProgress> tempCollection = new ArrayList<JobInProgress>();
    if (waitingJobs != null) {
      tempCollection.addAll(waitingJobs);
    }
    // guard against a null running-job queue before removing duplicates
    if (runningJobs != null) {
      tempCollection.removeAll(runningJobs);
    }
    if (!tempCollection.isEmpty()) {
      jobCollection.addAll(tempCollection);
    }
    return jobCollection;
  }

  JobInitializationPoller getInitializationPoller() {
    return initializationPoller;
  }

  synchronized String getDisplayInfo(String queueName) {
    QueueSchedulingInfo qsi = queueInfoMap.get(queueName);
    if (null == qsi) {
      return null;
    }
    return qsi.toString();
  }
}