CapacityTaskScheduler.java example

Explorer
HadoopEKG-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.AbstractQueue.AbstractQueueComparator;
import org.apache.hadoop.mapred.JobTracker.IllegalStateException;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;
import org.apache.hadoop.util.StringUtils;

/**
 * A {@link TaskScheduler} that implements the requirements in HADOOP-3421
 * and provides a HOD-less way to share large clusters. This scheduler 
 * provides the following features: 
 *  * support for queues, where a job is submitted to a queue. 
 *  * Queues are assigned a fraction of the capacity of the grid (their
 *  'capacity') in the sense that a certain capacity of resources 
 *  will be at their disposal. All jobs submitted to the queues of an Org 
 *  will have access to the capacity to the Org.
 *  * Free resources can be allocated to any queue beyond its 
 *  capacity.
 *  * Queues optionally support job priorities (disabled by default). 
 *  * Within a queue, jobs with higher priority will have access to the 
 *  queue's resources before jobs with lower priority. However, once a job 
 *  is running, it will not be preempted for a higher priority job.
 *  * In order to prevent one or more users from monopolizing its resources, 
 *  each queue enforces a limit on the percentage of resources allocated to a 
 *  user at any given time, if there is competition for them.
 *  
 */
class CapacityTaskScheduler extends TaskScheduler {

  /** quick way to get qsc object given a queue name */
  private Map<String, QueueSchedulingContext> queueInfoMap =
    new HashMap<String, QueueSchedulingContext>();

  //Root level queue . It has all the
  //cluster capacity at its disposal.
  //Queues declared by users would
  //be children of this queue.
  //CS would have handle to root.
  private AbstractQueue root = null;

  /**
   * This class captures scheduling information we want to display or log.
   */
  private static class SchedulingDisplayInfo {
    private String queueName;
    CapacityTaskScheduler scheduler;
    
    SchedulingDisplayInfo(String queueName, CapacityTaskScheduler scheduler) { 
      this.queueName = queueName;
      this.scheduler = scheduler;
    }
    
    @Override
    public String toString(){
      // note that we do not call updateContextObjects() here for performance
      // reasons. This means that the data we print out may be slightly
      // stale. This data is updated whenever assignTasks() is called
      // If this doesn't happen, the data gets stale. If we see
      // this often, we may need to detect this situation and call 
      // updateContextObjects(), or just call it each time.
      return scheduler.getDisplayInfo(queueName);
    }
  }


  // this class encapsulates the result of a task lookup
  private static class TaskLookupResult {

    static enum LookUpStatus {
      TASK_FOUND,
      NO_TASK_FOUND,
      TASK_FAILING_MEMORY_REQUIREMENT,
    }
    // constant TaskLookupResult objects. Should not be accessed directly.
    private static final TaskLookupResult NoTaskLookupResult = 
      new TaskLookupResult(null, TaskLookupResult.LookUpStatus.NO_TASK_FOUND);
    private static final TaskLookupResult MemFailedLookupResult = 
      new TaskLookupResult(null, 
          TaskLookupResult.LookUpStatus.TASK_FAILING_MEMORY_REQUIREMENT);

    private LookUpStatus lookUpStatus;
    private Task task;

    // should not call this constructor directly. use static factory methods.
    private TaskLookupResult(Task t, LookUpStatus lUStatus) {
      this.task = t;
      this.lookUpStatus = lUStatus;
    }
    
    static TaskLookupResult getTaskFoundResult(Task t) {
      LOG.debug("Returning task " + t);
      return new TaskLookupResult(t, LookUpStatus.TASK_FOUND);
    }
    static TaskLookupResult getNoTaskFoundResult() {
      return NoTaskLookupResult;
    }
    static TaskLookupResult getMemFailedResult() {
      return MemFailedLookupResult;
    }
    

    Task getTask() {
      return task;
    }

    LookUpStatus getLookUpStatus() {
      return lookUpStatus;
    }
  }

  /**
   * This class handles the scheduling algorithms.
   * The algos are the same for both Map and Reduce tasks.
   * There may be slight variations later, in which case we can make this
   * an abstract base class and have derived classes for Map and Reduce.
   */
  private static abstract class TaskSchedulingMgr {

    /** our TaskScheduler object */
    protected CapacityTaskScheduler scheduler;
    protected TaskType type = null;

    abstract Task obtainNewTask(TaskTrackerStatus taskTracker,
        JobInProgress job) throws IOException;

    abstract int getClusterCapacity();
    abstract TaskSchedulingContext getTSC(
      QueueSchedulingContext qsc);
    /**
     * To check if job has a speculative task on the particular tracker.
     *
     * @param job job to check for speculative tasks.
     * @param tts task tracker on which speculative task would run.
     * @return true if there is a speculative task to run on the tracker.
     */
    abstract boolean hasSpeculativeTask(JobInProgress job,
        TaskTrackerStatus tts);

    /**
     * Comparator to sort queues.
     * For maps, we need to sort on QueueSchedulingContext.mapTSC. For
     * reducers, we use reduceTSC. So we'll need separate comparators.
     */
    private static abstract class QueueComparator
      implements Comparator<AbstractQueue> {
      abstract TaskSchedulingContext getTSC(
        QueueSchedulingContext qsi);
      public int compare(AbstractQueue q1, AbstractQueue q2) {
        TaskSchedulingContext t1 = getTSC(q1.getQueueSchedulingContext());
        TaskSchedulingContext t2 = getTSC(q2.getQueueSchedulingContext());
        // look at how much capacity they've filled. Treat a queue with
        // capacity=0 equivalent to a queue running at capacity
        double r1 = (0 == t1.getCapacity())? 1.0f:
          (double) t1.getNumSlotsOccupied() /(double) t1.getCapacity();
        double r2 = (0 == t2.getCapacity())? 1.0f:
          (double) t2.getNumSlotsOccupied() /(double) t2.getCapacity();
        if (r1<r2) return -1;
        else if (r1>r2) return 1;
        else return 0;
      }
    }
    // subclass for map and reduce comparators
    private static final class MapQueueComparator extends QueueComparator {
      TaskSchedulingContext getTSC(QueueSchedulingContext qsi) {
        return qsi.getMapTSC();
      }
    }
    private static final class ReduceQueueComparator extends QueueComparator {
      TaskSchedulingContext getTSC(QueueSchedulingContext qsi) {
        return qsi.getReduceTSC();
      }
    }

    // these are our comparator instances
    protected final static MapQueueComparator mapComparator =
      new MapQueueComparator();
    protected final static ReduceQueueComparator reduceComparator =
      new ReduceQueueComparator();
    // and this is the comparator to use
    protected QueueComparator queueComparator;

    // Returns queues sorted according to the QueueComparator.
    // Mainly for testing purposes.
    String[] getOrderedQueues() {
      List<AbstractQueue> queueList = getOrderedJobQueues();
      List<String> queues = new ArrayList<String>(queueList.size());
      for (AbstractQueue q : queueList) {
        queues.add(q.getName());
      }
      return queues.toArray(new String[queues.size()]);
    }

    /**
     * Return an ordered list of {@link JobQueue}s wrapped as
     * {@link AbstractQueue}s. Ordering is according to {@link QueueComparator}.
     * To reflect the true ordering of the JobQueues, the complete hierarchy is
     * sorted such that {@link AbstractQueue}s are ordered according to their
     * needs at each level in the hierarchy, after which only the leaf level
     * {@link JobQueue}s are returned.
     * 
     * @return a list of {@link JobQueue}s wrapped as {@link AbstractQueue}s
     *         sorted by their needs.
     */
    List<AbstractQueue> getOrderedJobQueues() {
      scheduler.root.sort(queueComparator);
      return scheduler.root.getDescendentJobQueues();
    }

    TaskSchedulingMgr(CapacityTaskScheduler sched) {
      scheduler = sched;
    }

    private boolean isUserOverLimit(JobInProgress j,
                                    QueueSchedulingContext qsc) {
      // what is our current capacity? It is equal to the queue-capacity if
      // we're running below capacity. If we're running over capacity, then its
      // #running plus slotPerTask of the job (which is the number of extra
      // slots we're getting).
      int currentCapacity;
      TaskSchedulingContext tsi = getTSC(qsc);
      if (tsi.getNumSlotsOccupied() < tsi.getCapacity()) {
        currentCapacity = tsi.getCapacity();
      }
      else {
        currentCapacity =
          tsi.getNumSlotsOccupied() +
            TaskDataView.getTaskDataView(type).getSlotsPerTask(j);
      }
      int limit = Math.max((int)(Math.ceil((double)currentCapacity/
          (double) qsc.getNumJobsByUser().size())),
          (int)(Math.ceil((double)(qsc.getUlMin() *currentCapacity)/100.0)));
      String user = j.getProfile().getUser();
      if (tsi.getNumSlotsOccupiedByUser().get(user) >= limit) {
        LOG.debug("User " + user + " is over limit, num slots occupied = " +
            tsi.getNumSlotsOccupiedByUser().get(user) + ", limit = " + limit);
        return true;
      }
      else {
        return false;
      }
    }

    /*
     * This is the central scheduling method.
     * It tries to get a task from jobs in a single queue.
     * Always return a TaskLookupResult object. Don't return null.
     */
    private TaskLookupResult getTaskFromQueue(TaskTracker taskTracker,
                                              QueueSchedulingContext qsi)
    throws IOException {
      TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();
      // we only look at jobs in the running queues, as these are the ones
      // who have been potentially initialized

      for (JobInProgress j :
        scheduler.jobQueuesManager.getJobQueue(qsi.getQueueName())
          .getRunningJobs()) {
        // only look at jobs that can be run. We ignore jobs that haven't
        // initialized, or have completed but haven't been removed from the
        // running queue.

        //Check queue for maximum capacity .
        if(areTasksInQueueOverMaxCapacity(qsi,j.getNumSlotsPerTask(type))) {
          continue;
        }
        
        if (j.getStatus().getRunState() != JobStatus.RUNNING) {
          continue;
        }
        // check if the job's user is over limit
        if (isUserOverLimit(j, qsi)) {
          continue;
        }
        //If this job meets memory requirements. Ask the JobInProgress for
        //a task to be scheduled on the task tracker.
        //if we find a job then we pass it on.
        if (scheduler.memoryMatcher.matchesMemoryRequirements(j, type,
                                                              taskTrackerStatus)) {
          // We found a suitable job. Get task from it.
          Task t = obtainNewTask(taskTrackerStatus, j);
          //if there is a task return it immediately.
          if (t != null) {
            // we're successful in getting a task
            return TaskLookupResult.getTaskFoundResult(t);
          } else {
            //skip to the next job in the queue.
            LOG.debug("Job " + j.getJobID().toString()
                + " returned no tasks of type " + type);
          }
        } else {
          // if memory requirements don't match then we check if the job has
          // pending tasks and has insufficient number of 'reserved'
          // tasktrackers to cover all pending tasks. If so we reserve the
          // current tasktracker for this job so that high memory jobs are not
          // starved
          TaskDataView view = TaskDataView.getTaskDataView(type);
          if ((view.getPendingTasks(j) != 0 && 
                !view.hasSufficientReservedTaskTrackers(j))) {
            // Reserve all available slots on this tasktracker
            LOG.info(j.getJobID() + ": Reserving "
                + taskTracker.getTrackerName()
                + " since memory-requirements don't match");
            taskTracker.reserveSlots(type, j, taskTracker
                .getAvailableSlots(type));

            // Block
            return TaskLookupResult.getMemFailedResult();
          }
        }//end of memory check block
        // if we're here, this job has no task to run. Look at the next job.
      }//end of for loop

      // if we're here, we haven't found any task to run among all jobs in
      // the queue. This could be because there is nothing to run, or that
      // the user limit for some user is too strict, i.e., there's at least
      // one user who doesn't have enough tasks to satisfy his limit. If
      // it's the latter case, re-look at jobs without considering user
      // limits, and get a task from the first eligible job; however
      // we do not 'reserve' slots on tasktrackers anymore since the user is
      // already over the limit
      // Note: some of the code from above is repeated here. This is on
      // purpose as it improves overall readability.
      // Note: we walk through jobs again. Some of these jobs, which weren't
      // considered in the first pass, shouldn't be considered here again,
      // but we still check for their viability to keep the code simple. In
      // some cases, for high mem jobs that have nothing to run, we call
      // obtainNewTask() unnecessarily. Should this be a problem, we can
      // create a list of jobs to look at (those whose users were over
      // limit) in the first pass and walk through that list only.
      for (JobInProgress j :
        scheduler.jobQueuesManager.getJobQueue(qsi.getQueueName())
          .getRunningJobs()) {
        if (j.getStatus().getRunState() != JobStatus.RUNNING) {
          continue;
        }

        //Check for the maximum-capacity.
        if(areTasksInQueueOverMaxCapacity(qsi,j.getNumSlotsPerTask(type))) {
          continue;
        }


        if (scheduler.memoryMatcher.matchesMemoryRequirements(j, type,
            taskTrackerStatus)) {
          // We found a suitable job. Get task from it.
          Task t = obtainNewTask(taskTrackerStatus, j);
          //if there is a task return it immediately.
          if (t != null) {
            // we're successful in getting a task
            return TaskLookupResult.getTaskFoundResult(t);
          } else {
          }
        } else {
          //if memory requirements don't match then we check if the
          //job has either pending or speculative task. If the job
          //has pending or speculative task we block till this job
          //tasks get scheduled, so that high memory jobs are not
          //starved
          if (TaskDataView.getTaskDataView(type).getPendingTasks(j) != 0 ||
            hasSpeculativeTask(j, taskTrackerStatus)) {
            return TaskLookupResult.getMemFailedResult();
          }
        }//end of memory check block
      }//end of for loop

      // found nothing for this queue, look at the next one.
      String msg = "Found no task from the queue " + qsi.getQueueName();
      LOG.debug(msg);
      return TaskLookupResult.getNoTaskFoundResult();
    }

    // Always return a TaskLookupResult object. Don't return null.
    // The caller is responsible for ensuring that the QSC objects and the
    // collections are up-to-date.
    private TaskLookupResult assignTasks(TaskTracker taskTracker)
    throws IOException {
      TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();

      printQSCs();

      // Check if this tasktracker has been reserved for a job...
      JobInProgress job = taskTracker.getJobForFallowSlot(type);
      if (job != null) {
        int availableSlots = taskTracker.getAvailableSlots(type);
        if (LOG.isDebugEnabled()) {
          LOG.debug(job.getJobID() + ": Checking 'reserved' tasktracker " +
                    taskTracker.getTrackerName() + " with " + availableSlots +
                    " '" + type + "' slots");
        }

        if (availableSlots >= job.getNumSlotsPerTask(type)) {
          // Unreserve
          taskTracker.unreserveSlots(type, job);

          // We found a suitable job. Get task from it.
          Task t = obtainNewTask(taskTrackerStatus, job);
          //if there is a task return it immediately.
          if (t != null) {
            if (LOG.isDebugEnabled()) {
              LOG.info(job.getJobID() + ": Got " + t.getTaskID() +
                       " for reserved tasktracker " +
                       taskTracker.getTrackerName());
            }
            // we're successful in getting a task
            return TaskLookupResult.getTaskFoundResult(t);
          }
        } else {
          // Re-reserve the current tasktracker
          taskTracker.reserveSlots(type, job, availableSlots);

          if (LOG.isDebugEnabled()) {
            LOG.debug(job.getJobID() + ": Re-reserving " +
                      taskTracker.getTrackerName());
          }

          return TaskLookupResult.getMemFailedResult();
        }
      }

      for (AbstractQueue q : getOrderedJobQueues()) {
        QueueSchedulingContext qsc = q.getQueueSchedulingContext();
        // we may have queues with capacity=0. We shouldn't look at jobs from
        // these queues
        if (0 == getTSC(qsc).getCapacity()) {
          continue;
        }

        //This call is important for optimization purposes , if we
        //have reached the limit already no need for traversing the queue.
        if(this.areTasksInQueueOverMaxCapacity(qsc,1)) {
          continue;
        }
        
        TaskLookupResult tlr = getTaskFromQueue(taskTracker, qsc);
        TaskLookupResult.LookUpStatus lookUpStatus = tlr.getLookUpStatus();

        if (lookUpStatus == TaskLookupResult.LookUpStatus.NO_TASK_FOUND) {
          continue; // Look in other queues.
        }

        // if we find a task, return
        if (lookUpStatus == TaskLookupResult.LookUpStatus.TASK_FOUND) {
          return tlr;
        }
        // if there was a memory mismatch, return
        else if (lookUpStatus ==
          TaskLookupResult.LookUpStatus.TASK_FAILING_MEMORY_REQUIREMENT) {
            return tlr;
        }
      }

      // nothing to give
      return TaskLookupResult.getNoTaskFoundResult();
    }


    /**
     * Check if maximum-capacity is set  for this queue.
     * If set and greater than 0 ,
     * check if numofslotsoccupied+numSlotsPerTask is greater than
     * maximum-Capacity ,if yes , implies this queue is over limit.
     *
     * Incase noOfSlotsOccupied is less than maximum-capacity ,but ,
     * numOfSlotsOccupied+noSlotsPerTask is more than maximum-capacity we still
     * dont assign the task . This may lead to under utilization of very small
     * set of slots. But this is ok ,as we strictly respect the maximum-capacity
     * @param qsc
     * @param noOfSlotsPerTask
     * @return true if queue is over maximum-capacity
     */
    private boolean areTasksInQueueOverMaxCapacity(
      QueueSchedulingContext qsc,int noOfSlotsPerTask) {
      TaskSchedulingContext tsi = getTSC(qsc);
      //check for maximum-capacity
      if(tsi.getMaxCapacity() >= 0) {
        if ((tsi.getNumSlotsOccupied() + noOfSlotsPerTask) >
          tsi.getMaxCapacity()) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
              "Queue " + qsc.getQueueName() + " " + "has reached its  max " +
                type + "Capacity");
            LOG.debug("Current running tasks " + tsi.getCapacity());

          }
          return true;
        }
      }
      return false;
    }


    // for debugging.
    private void printQSCs() {
      if (LOG.isDebugEnabled()) {
        StringBuffer s = new StringBuffer();
        for (AbstractQueue aq: getOrderedJobQueues()) {
          QueueSchedulingContext qsi = aq.getQueueSchedulingContext();
          TaskSchedulingContext tsi = getTSC(qsi);
          Collection<JobInProgress> runJobs =
            scheduler.jobQueuesManager.getJobQueue(qsi.getQueueName())
              .getRunningJobs();
          s.append(
            String.format(
              " Queue '%s'(%s): runningTasks=%d, "
                + "occupiedSlots=%d, capacity=%d, runJobs=%d  maximumCapacity=%d ",
              qsi.getQueueName(),
              this.type, tsi.getNumRunningTasks(),
              tsi.getNumSlotsOccupied(), tsi.getCapacity(), (runJobs.size()),
              tsi.getMaxCapacity()));
        }
        LOG.debug(s);
      }
    }

    /**
     * Check if one of the tasks have a speculative task to execute on the
     * particular task tracker.
     *
     * @param tips tasks of a job
     * @param tts task tracker status for which we are asking speculative tip
     * @return true if job has a speculative task to run on particular TT.
     */
    boolean hasSpeculativeTask(
      TaskInProgress[] tips,
      TaskTrackerStatus tts) {
      long currentTime = System.currentTimeMillis();
      for(TaskInProgress tip : tips)  {
        if(tip.isRunning()
            && !(tip.hasRunOnMachine(tts.getHost(), tts.getTrackerName()))
            && tip.canBeSpeculated(currentTime)) {
          return true;
        }
      }
      return false;
    }
  }

  /**
   * The scheduling algorithms for map tasks.
   */
  private static class MapSchedulingMgr extends TaskSchedulingMgr {

    MapSchedulingMgr(CapacityTaskScheduler schedulr) {
      super(schedulr);
      type = TaskType.MAP;
      queueComparator = mapComparator;
    }

    @Override
    Task obtainNewTask(TaskTrackerStatus taskTracker, JobInProgress job)
    throws IOException {
      synchronized (scheduler) {
        ClusterStatus clusterStatus = scheduler.taskTrackerManager
            .getClusterStatus();
        int numTaskTrackers = clusterStatus.getTaskTrackers();
        return job.obtainNewMapTask(taskTracker, numTaskTrackers,
            scheduler.taskTrackerManager.getNumberOfUniqueHosts());
      }
    }

    @Override
    int getClusterCapacity() {
      synchronized (scheduler) {
        return scheduler.taskTrackerManager.getClusterStatus().getMaxMapTasks();
      }
    }

    @Override
    TaskSchedulingContext getTSC(QueueSchedulingContext qsi) {
      return qsi.getMapTSC();
    }


    @Override
    boolean hasSpeculativeTask(JobInProgress job, TaskTrackerStatus tts) {
      //Check if job supports speculative map execution first then
      //check if job has speculative maps.
      return (job.getJobConf().getMapSpeculativeExecution())&& (
          hasSpeculativeTask(job.getTasks(TaskType.MAP),
                             tts));
    }

  }

  /**
   * The scheduling algorithms for reduce tasks.
   */
  private static class ReduceSchedulingMgr extends TaskSchedulingMgr {

    ReduceSchedulingMgr(CapacityTaskScheduler schedulr) {
      super(schedulr);
      type = TaskType.REDUCE;
      queueComparator = reduceComparator;
    }

    @Override
    Task obtainNewTask(TaskTrackerStatus taskTracker, JobInProgress job)
    throws IOException {
      synchronized (scheduler) {
        ClusterStatus clusterStatus = scheduler.taskTrackerManager
            .getClusterStatus();
        int numTaskTrackers = clusterStatus.getTaskTrackers();
        return job.obtainNewReduceTask(taskTracker, numTaskTrackers,
            scheduler.taskTrackerManager.getNumberOfUniqueHosts());
      }
    }

    @Override
    int getClusterCapacity() {
      synchronized (scheduler) {
        return scheduler.taskTrackerManager.getClusterStatus()
            .getMaxReduceTasks();
      }
    }

    @Override
    TaskSchedulingContext getTSC(QueueSchedulingContext qsi) {
      return qsi.getReduceTSC();
    }

    @Override
    boolean hasSpeculativeTask(JobInProgress job, TaskTrackerStatus tts) {
      //check if the job supports reduce speculative execution first then
      //check if the job has speculative tasks.
      return (job.getJobConf().getReduceSpeculativeExecution()) && (
          hasSpeculativeTask(job.getTasks(TaskType.REDUCE),
                             tts));
    }

  }

  /** the scheduling mgrs for Map and Reduce tasks */
  protected TaskSchedulingMgr mapScheduler = new MapSchedulingMgr(this);
  protected TaskSchedulingMgr reduceScheduler = new ReduceSchedulingMgr(this);

  MemoryMatcher memoryMatcher = new MemoryMatcher();

  static final Log LOG = LogFactory.getLog(CapacityTaskScheduler.class);
  protected JobQueuesManager jobQueuesManager;

  /** whether scheduler has started or not */
  private boolean started = false;

  /**
   * A clock class - can be mocked out for testing.
   */
  static class Clock {
    long getTime() {
      return System.currentTimeMillis();
    }
  }

  private Clock clock;
  private JobInitializationPoller initializationPoller;

  class CapacitySchedulerQueueRefresher extends QueueRefresher {
    @Override
    void refreshQueues(List<JobQueueInfo> newRootQueues)
        throws Throwable {
      if (!started) {
        String msg =
            "Capacity Scheduler is not in the 'started' state."
                + " Cannot refresh queues.";
        LOG.error(msg);
        throw new IOException(msg);
      }
      CapacitySchedulerConf schedConf = new CapacitySchedulerConf();
      initializeQueues(newRootQueues, schedConf, true);
      initializationPoller.refreshQueueInfo(schedConf);
    }
  }

  public CapacityTaskScheduler() {
    this(new Clock());
  }
  
  // for testing
  public CapacityTaskScheduler(Clock clock) {
    this.jobQueuesManager = new JobQueuesManager();
    this.clock = clock;
  }

  @Override
  QueueRefresher getQueueRefresher() {
    return new CapacitySchedulerQueueRefresher();
  }

  /**
   * Only for testing.
   * @param type
   * @return
   */
  String[] getOrderedQueues(TaskType type) {
    if (type == TaskType.MAP) {
      return mapScheduler.getOrderedQueues();
    } else if (type == TaskType.REDUCE) {
      return reduceScheduler.getOrderedQueues();
    }
    return null;
  }

  @Override
  public synchronized void start() throws IOException {
    if (started) return;
    super.start();

    // Initialize MemoryMatcher
    MemoryMatcher.initializeMemoryRelatedConf(conf);

    // read queue info from config file
    QueueManager queueManager = taskTrackerManager.getQueueManager();

    // initialize our queues from the config settings
    CapacitySchedulerConf schedConf = new CapacitySchedulerConf();
    try {
      initializeQueues(queueManager.getRoot().getJobQueueInfo().getChildren(),
          schedConf, false);
    } catch (Throwable e) {
      LOG.error("Couldn't initialize queues because of the excecption : "
          + StringUtils.stringifyException(e));
      throw new IOException(e);
    }

    // Queues are ready. Now register jobQueuesManager with the JobTracker so as
    // to listen to job changes
    taskTrackerManager.addJobInProgressListener(jobQueuesManager);

    //Start thread for initialization
    if (initializationPoller == null) {
      this.initializationPoller = new JobInitializationPoller(
          jobQueuesManager, taskTrackerManager);
    }
    initializationPoller.init(jobQueuesManager.getJobQueueNames(), schedConf);
    initializationPoller.setDaemon(true);
    initializationPoller.start();

    started = true;

    LOG.info("Capacity scheduler started successfully");  
  }

  /**
   * Read the configuration and initialize the queues. This operation should be
   * done only when either the scheduler is starting or a request is received
   * from {@link QueueManager} to refresh the queue configuration.
   * 
   * <p>
   * 
   * Even in case of refresh, we do not explicitly destroy AbstractQueue items,
   * or the info maps, they will be automatically garbage-collected.
   * 
   * <p>
   * 
   * We don't explicitly lock the scheduler completely. This method is called at
   * two times. 1) When the scheduler is starting. During this time, the lock
   * sequence is JT->scheduler and so we don't need any more locking here. 2)
   * When refresh is issued to {@link QueueManager}. When this happens, parallel
   * refreshes are guarded by {@link QueueManager} itself by taking its lock.
   * 
   * @param newRootQueues
   * @param schedConf
   * @param refreshingQueues
   * @throws Throwable
   */
  private void initializeQueues(List<JobQueueInfo> newRootQueues,
      CapacitySchedulerConf schedConf, boolean refreshingQueues)
      throws Throwable {

    if (newRootQueues == null) {
      throw new IOException(
          "Cannot initialize the queues with null root-queues!");
    }

    // Sanity check: there should be at least one queue.
    if (0 == newRootQueues.size()) {
      throw new IllegalStateException("System has no queue configured!");
    }

    // Create a new queue-hierarchy builder and try loading the complete
    // hierarchy of queues.
    AbstractQueue newRootAbstractQueue;
    try {
      newRootAbstractQueue =
          new QueueHierarchyBuilder().createHierarchy(newRootQueues, schedConf);

    } catch (Throwable e) {
      LOG.error("Exception while tryign to (re)initializing queues : "
          + StringUtils.stringifyException(e));
      LOG.info("(Re)initializing the queues with the new configuration "
          + "failed, so keeping the old configuration.");
      throw e;
    }

    // New configuration is successfully validated and applied, set the new
    // configuration to the current queue-hierarchy.

    if (refreshingQueues) {
      // Scheduler is being refreshed.

      // Going to commit the changes to the hierarchy. Lock the scheduler.
      synchronized (this) {
        AbstractQueueComparator comparator = new AbstractQueueComparator();
        this.root.sort(comparator);
        newRootAbstractQueue.sort(comparator);
        root.validateAndCopyQueueContexts(newRootAbstractQueue);
      }
    } else {
      // Scheduler is just starting.

      this.root = newRootAbstractQueue;

      // JobQueue objects are created. Inform the JobQueuesManager so that it
      // can track the running/waiting jobs. JobQueuesManager is still not added
      // as a listener to JobTracker, so no locking needed.
      addJobQueuesToJobQueuesManager();
    }

    List<AbstractQueue> allQueues = new ArrayList<AbstractQueue>();
    allQueues.addAll(getRoot().getDescendantContainerQueues());
    allQueues.addAll(getRoot().getDescendentJobQueues());
    for (AbstractQueue queue : allQueues) {
      if (!refreshingQueues) {
        // Scheduler is just starting, create the display info also
        createDisplayInfo(taskTrackerManager.getQueueManager(), queue.getName());
      }

      // QueueSchedulingContext objects are created/have changed. Put them
      // (back) in the queue-info so as to be consumed by the UI.
      addToQueueInfoMap(queue.getQueueSchedulingContext());
    }
  }

  /**
   * Inform the {@link JobQueuesManager} about the newly constructed
   * {@link JobQueue}s.
   */
  private void addJobQueuesToJobQueuesManager() {
    List<AbstractQueue> allJobQueues = getRoot().getDescendentJobQueues();
    for (AbstractQueue jobQ : allJobQueues) {
      jobQueuesManager.addQueue((JobQueue)jobQ);
    }
  }

  /** mostly for testing purposes */
  synchronized void setInitializationPoller(JobInitializationPoller p) {
    this.initializationPoller = p;
  }
  
  @Override
  public synchronized void terminate() throws IOException {
    if (!started) return;
    if (jobQueuesManager != null) {
      taskTrackerManager.removeJobInProgressListener(
          jobQueuesManager);
    }
    started = false;
    initializationPoller.terminate();
    super.terminate();
  }
  
  @Override
  public synchronized void setConf(Configuration conf) {
    super.setConf(conf);
  }

  /**
   * provided for the test classes
   * lets you update the QSI objects and sorted collections
   */ 
  synchronized void updateContextInfoForTests() {
    ClusterStatus c = taskTrackerManager.getClusterStatus();
    int mapClusterCapacity = c.getMaxMapTasks();
    int reduceClusterCapacity = c.getMaxReduceTasks();
    // update the QSI objects
    updateContextObjects(mapClusterCapacity, reduceClusterCapacity);
    mapScheduler.scheduler.root.sort(mapScheduler.queueComparator);
    reduceScheduler.scheduler.root.sort(reduceScheduler.queueComparator);
  }

  /**
   * Update individual QSC objects.
   * We don't need exact information for all variables, just enough for us
   * to make scheduling decisions. For example, we don't need an exact count
   * of numRunningTasks. Once we count upto the grid capacity, any
   * number beyond that will make no difference.
   *
   **/
  private synchronized void updateContextObjects(int mapClusterCapacity,
      int reduceClusterCapacity) {
    root.update(mapClusterCapacity,reduceClusterCapacity);

  }

  /*
   * The grand plan for assigning a task. 
   * Always assigns 1 reduce and 1 map , if sufficient slots are
   * available for each of types.
   * If not , then which ever type of slots are available , that type of task is
   * assigned.
   * Next, pick a queue. We only look at queues that need a slot. Among these,
   * we first look at queues whose (# of running tasks)/capacity is the least.
   * Next, pick a job in a queue. we pick the job at the front of the queue
   * unless its user is over the user limit. 
   * Finally, given a job, pick a task from the job. 
   *  
   */
  @Override
  public synchronized List<Task> assignTasks(TaskTracker taskTracker)
  throws IOException {
    
    TaskLookupResult tlr;
    TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();
    List<Task> result = new ArrayList<Task>();
    
    /* 
     * If TT has Map and Reduce slot free, we assign 1 map and 1 reduce
     * We  base decision on how much is needed
     * versus how much is used
     */
    ClusterStatus c = taskTrackerManager.getClusterStatus();
    int mapClusterCapacity = c.getMaxMapTasks();
    int reduceClusterCapacity = c.getMaxReduceTasks();
    int maxMapSlots = taskTrackerStatus.getMaxMapSlots();
    int currentMapSlots = taskTrackerStatus.countOccupiedMapSlots();
    int maxReduceSlots = taskTrackerStatus.getMaxReduceSlots();
    int currentReduceSlots = taskTrackerStatus.countOccupiedReduceSlots();
    LOG.debug("TT asking for task, max maps="
      + taskTrackerStatus.getMaxMapSlots() + 
        ", run maps=" + taskTrackerStatus.countMapTasks() + ", max reds=" + 
        taskTrackerStatus.getMaxReduceSlots() + ", run reds=" + 
        taskTrackerStatus.countReduceTasks() + ", map cap=" + 
        mapClusterCapacity + ", red cap = " + 
        reduceClusterCapacity);

    /* 
     * update all our QSC objects.
     * This involves updating each qsC structure. This operation depends
     * on the number of running jobs in a queue, and some waiting jobs. If it
     * becomes expensive, do it once every few heartbeats only.
     */ 
    updateContextObjects(mapClusterCapacity, reduceClusterCapacity);
    // make sure we get our map or reduce scheduling object to update its 
    // collection of QSC objects too.

    if (maxReduceSlots > currentReduceSlots) {
      //reduce slot available , try to get a
      //reduce task
      tlr = reduceScheduler.assignTasks(taskTracker);
      if (TaskLookupResult.LookUpStatus.TASK_FOUND == 
        tlr.getLookUpStatus()) {
        result.add(tlr.getTask());
      }
    }

    if(maxMapSlots > currentMapSlots) {
      //map slot available , try to get a map task
      tlr = mapScheduler.assignTasks(taskTracker);
      if (TaskLookupResult.LookUpStatus.TASK_FOUND == 
        tlr.getLookUpStatus()) {
        result.add(tlr.getTask());
      }
    }
    
    return (result.isEmpty()) ? null : result;
  }

  
  @Override
  public synchronized Collection<JobInProgress> getJobs(String queueName) {
    Collection<JobInProgress> jobCollection = new ArrayList<JobInProgress>();
    JobQueue jobQueue = jobQueuesManager.getJobQueue(queueName);
    if (jobQueue == null) {
      return jobCollection;
    }
    Collection<JobInProgress> runningJobs =
      jobQueue.getRunningJobs();
    if (runningJobs != null) {
      jobCollection.addAll(runningJobs);
    }
    Collection<JobInProgress> waitingJobs = 
      jobQueue.getWaitingJobs();
    Collection<JobInProgress> tempCollection = new ArrayList<JobInProgress>();
    if(waitingJobs != null) {
      tempCollection.addAll(waitingJobs);
    }
    tempCollection.removeAll(runningJobs);
    if(!tempCollection.isEmpty()) {
      jobCollection.addAll(tempCollection);
    }
    return jobCollection;
  }
  
  synchronized JobInitializationPoller getInitializationPoller() {
    return initializationPoller;
  }

  private synchronized String getDisplayInfo(String queueName) {
    QueueSchedulingContext qsi = queueInfoMap.get(queueName);
    if (null == qsi) {
      return null;
    }
    return qsi.toString();
  }

  private synchronized void addToQueueInfoMap(QueueSchedulingContext qsc) {
    queueInfoMap.put(qsc.getQueueName(), qsc);
  }

  /**
   * Create the scheduler information and set it in the {@link QueueManager}.
   * this should be only called when the scheduler is starting.
   * 
   * @param queueManager
   * @param queueName
   */
  private void createDisplayInfo(QueueManager queueManager, String queueName) {
    if (queueManager != null) {
      SchedulingDisplayInfo schedulingInfo =
        new SchedulingDisplayInfo(queueName, this);
      queueManager.setSchedulerInfo(queueName, schedulingInfo);
    }
  }


  /**
   * Use for testing purposes.
   * returns the root
   * @return
   */
  AbstractQueue getRoot() {
    return this.root;
  }


  /**
   * This is used for testing purpose only
   * Dont use this method.
   * @param rt
   */
  void setRoot(AbstractQueue rt) {
    this.root = rt;
  }

}