/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.querymaster;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.util.RackResolver;
import org.apache.tajo.TaskAttemptId;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.engine.planner.global.ExecutionBlock;
import org.apache.tajo.engine.planner.global.MasterPlan;
import org.apache.tajo.engine.query.TaskRequest;
import org.apache.tajo.engine.query.TaskRequestImpl;
import org.apache.tajo.exception.TajoInternalError;
import org.apache.tajo.ipc.QueryCoordinatorProtocol;
import org.apache.tajo.ipc.QueryCoordinatorProtocol.QueryCoordinatorProtocolService;
import org.apache.tajo.ipc.TajoWorkerProtocol;
import org.apache.tajo.master.cluster.WorkerConnectionInfo;
import org.apache.tajo.master.event.*;
import org.apache.tajo.master.event.TaskAttemptToSchedulerEvent.TaskAttemptScheduleContext;
import org.apache.tajo.master.event.TaskSchedulerEvent.EventType;
import org.apache.tajo.plan.serder.LogicalNodeSerializer;
import org.apache.tajo.resource.NodeResource;
import org.apache.tajo.resource.NodeResources;
import org.apache.tajo.rpc.AsyncRpcClient;
import org.apache.tajo.rpc.CallFuture;
import org.apache.tajo.rpc.NettyClientBase;
import org.apache.tajo.rpc.RpcClientManager;
import org.apache.tajo.service.ServiceTracker;
import org.apache.tajo.storage.DataLocation;
import org.apache.tajo.storage.fragment.Fragment;
import org.apache.tajo.util.NetUtils;
import org.apache.tajo.util.RpcParameterFactory;
import org.apache.tajo.util.TUtil;

import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import static org.apache.tajo.ResourceProtos.*;

public class DefaultTaskScheduler extends AbstractTaskScheduler {
  private static final Log LOG = LogFactory.getLog(DefaultTaskScheduler.class);

  private final TaskSchedulerContext context;
  private Stage stage;
  private TajoConf tajoConf;
  private Properties rpcParams;

  private Thread schedulingThread;
  private volatile boolean isStopped;
  private AtomicBoolean needWakeup = new AtomicBoolean();

  private ScheduledRequests scheduledRequests;

  private int minTaskMemory;
  private int nextTaskId = 0;
  private int scheduledObjectNum = 0;
  private boolean isLeaf;
  private int schedulerDelay;
  private int maximumRequestContainer;
  // candidate workers used to preserve locality for high-priority tasks
  private Set<Integer> candidateWorkers = Sets.newHashSet();

  public DefaultTaskScheduler(TaskSchedulerContext context, Stage stage) {
    super(DefaultTaskScheduler.class.getName());
    this.context = context;
    this.stage = stage;
  }

  @Override
  public void init(Configuration conf) {
    tajoConf = TUtil.checkTypeAndGet(conf, TajoConf.class);
    rpcParams = RpcParameterFactory.get(tajoConf);
    scheduledRequests = new ScheduledRequests();
    minTaskMemory = tajoConf.getIntVar(TajoConf.ConfVars.TASK_RESOURCE_MINIMUM_MEMORY);
    schedulerDelay = tajoConf.getIntVar(TajoConf.ConfVars.QUERYMASTER_TASK_SCHEDULER_DELAY);
    isLeaf = stage.getMasterPlan().isLeaf(stage.getBlock());

    this.schedulingThread = new Thread() {
      public void run() {
        while (!isStopped && !Thread.currentThread().isInterrupted()) {
          try {
            schedule();
          } catch (InterruptedException e) {
            if (isStopped) {
              break;
            } else {
              LOG.fatal(e.getMessage(), e);
              stage.abort(StageState.ERROR, e);
            }
          } catch (Throwable e) {
            LOG.fatal(e.getMessage(), e);
            stage.abort(StageState.ERROR, e);
            break;
          }
        }
        info(LOG, "TaskScheduler schedulingThread stopped");
      }
    };

    super.init(conf);
  }

  @Override
  public void start() {
    info(LOG, "Start TaskScheduler");
    maximumRequestContainer = Math.min(
        tajoConf.getIntVar(TajoConf.ConfVars.QUERYMASTER_TASK_SCHEDULER_REQUEST_MAX_NUM),
        stage.getContext().getWorkerMap().size());

    if (isLeaf) {
      candidateWorkers.addAll(getWorkerIds(getLeafTaskHosts()));
    } else {
      // find the workers assigned to the child execution blocks, for non-leaf locality
      List<ExecutionBlock> executionBlockList = stage.getMasterPlan().getChilds(stage.getBlock());
      for (ExecutionBlock executionBlock : executionBlockList) {
        Stage childStage = stage.getContext().getStage(executionBlock.getId());
        candidateWorkers.addAll(childStage.getAssignedWorkerMap().keySet());
      }
    }
    this.schedulingThread.start();
    super.start();
  }

  @Override
  public void stop() {
    isStopped = true;

    if (schedulingThread != null) {
      synchronized (schedulingThread) {
        schedulingThread.interrupt();
      }
    }
    candidateWorkers.clear();
    scheduledRequests.clear();
    info(LOG, "Task Scheduler stopped");
    super.stop();
  }

  protected void info(Log log, String message) {
    log.info(String.format("[%s] %s", stage.getId(), message));
  }

  protected void warn(Log log, String message) {
    log.warn(String.format("[%s] %s", stage.getId(), message));
  }

  private Fragment[] fragmentsForNonLeafTask;
  private Fragment[] broadcastFragmentsForNonLeafTask;

  public void schedule() throws Exception {
    try {
      final int incompleteTaskNum = scheduledRequests.leafTaskNum() + scheduledRequests.nonLeafTaskNum();
      if (incompleteTaskNum == 0) {
        needWakeup.set(true); // all tasks are done, or no tasks have been scheduled yet
        synchronized (schedulingThread) {
          schedulingThread.wait(1000);
        }
      } else {
        LinkedList<TaskRequestEvent> taskRequests = createTaskRequest(incompleteTaskNum);
        if (taskRequests.size() == 0) {
          synchronized (schedulingThread) {
            schedulingThread.wait(schedulerDelay);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Got " + taskRequests.size() + " taskRequestEvents");
          }
          if (isLeaf) {
            scheduledRequests.assignToLeafTasks(taskRequests);
          } else {
            scheduledRequests.assignToNonLeafTasks(taskRequests);
          }
        }
      }
    } catch (TimeoutException e) {
      LOG.error(e.getMessage());
    }
  }
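  /**
   * Receives scheduler events from the stage: fragment schedule events for leaf tasks,
   * fetch schedule events for intermediate (non-leaf) tasks, task-attempt schedule events,
   * and cancellation events for attempts that are still unassigned.
   */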
  @Override
  public void handle(TaskSchedulerEvent event) {
    if (event.getType() == EventType.T_SCHEDULE) {
      if (event instanceof FragmentScheduleEvent) {
        FragmentScheduleEvent castEvent = (FragmentScheduleEvent) event;
        if (context.isLeafQuery()) {
          TaskAttemptScheduleContext taskContext = new TaskAttemptScheduleContext();
          Task task = Stage.newEmptyTask(context, taskContext, stage, nextTaskId++);
          task.addFragment(castEvent.getLeftFragment(), true);
          scheduledObjectNum++;
          if (castEvent.hasRightFragments()) {
            task.addFragments(castEvent.getRightFragments());
          }
          stage.getEventHandler().handle(new TaskEvent(task.getId(), TaskEventType.T_SCHEDULE));
        } else {
          fragmentsForNonLeafTask = new Fragment[2];
          fragmentsForNonLeafTask[0] = castEvent.getLeftFragment();
          if (castEvent.hasRightFragments()) {
            Collection<Fragment> var = castEvent.getRightFragments();
            Fragment[] rightFragments = var.toArray(new Fragment[var.size()]);
            fragmentsForNonLeafTask[1] = rightFragments[0];
            if (rightFragments.length > 1) {
              broadcastFragmentsForNonLeafTask = new Fragment[rightFragments.length - 1];
              System.arraycopy(rightFragments, 1, broadcastFragmentsForNonLeafTask, 0,
                  broadcastFragmentsForNonLeafTask.length);
            } else {
              broadcastFragmentsForNonLeafTask = null;
            }
          }
        }
      } else if (event instanceof FetchScheduleEvent) {
        FetchScheduleEvent castEvent = (FetchScheduleEvent) event;
        Map<String, List<FetchProto>> fetches = castEvent.getFetches();
        TaskAttemptScheduleContext taskScheduleContext = new TaskAttemptScheduleContext();
        Task task = Stage.newEmptyTask(context, taskScheduleContext, stage, nextTaskId++);
        scheduledObjectNum++;
        for (Entry<String, List<FetchProto>> eachFetch : fetches.entrySet()) {
          task.addFetches(eachFetch.getKey(), eachFetch.getValue());
          task.addFragment(fragmentsForNonLeafTask[0], true);
          if (fragmentsForNonLeafTask[1] != null) {
            task.addFragment(fragmentsForNonLeafTask[1], true);
          }
        }
        if (broadcastFragmentsForNonLeafTask != null && broadcastFragmentsForNonLeafTask.length > 0) {
          task.addFragments(Arrays.asList(broadcastFragmentsForNonLeafTask));
        }
        stage.getEventHandler().handle(new TaskEvent(task.getId(), TaskEventType.T_SCHEDULE));
      } else if (event instanceof TaskAttemptToSchedulerEvent) {
        TaskAttemptToSchedulerEvent castEvent = (TaskAttemptToSchedulerEvent) event;
        if (context.isLeafQuery()) {
          scheduledRequests.addLeafTask(castEvent);
        } else {
          scheduledRequests.addNonLeafTask(castEvent);
        }

        if (needWakeup.getAndSet(false)) {
          // wake up the scheduler thread once new attempts have been scheduled
          synchronized (schedulingThread) {
            schedulingThread.notifyAll();
          }
        }
      }
    } else if (event.getType() == EventType.T_SCHEDULE_CANCEL) {
      // When a stage is killed, unassigned task attempts are removed from the scheduler.
      // This event is triggered by a TaskAttempt.
      TaskAttemptToSchedulerEvent castedEvent = (TaskAttemptToSchedulerEvent) event;
      scheduledRequests.leafTasks.remove(castedEvent.getTaskAttempt().getId());
      LOG.info(castedEvent.getTaskAttempt().getId() + " is canceled from " + this.getClass().getSimpleName());
      ((TaskAttemptToSchedulerEvent) event).getTaskAttempt().handle(
          new TaskAttemptEvent(castedEvent.getTaskAttempt().getId(), TaskAttemptEventType.TA_SCHEDULE_CANCELED));
    }
  }

  private Set<Integer> getWorkerIds(Collection<String> hosts) {
    Set<Integer> workerIds = Sets.newHashSet();
    if (hosts.isEmpty()) return workerIds;

    for (WorkerConnectionInfo worker : stage.getContext().getWorkerMap().values()) {
      if (hosts.contains(worker.getHost())) {
        workerIds.add(worker.getId());
      }
    }
    return workerIds;
  }
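  /**
   * Asks the query coordinator to reserve node resources for up to
   * {@code maximumRequestContainer} containers and converts each granted allocation
   * into a {@link TaskRequestEvent}.
   */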
  protected LinkedList<TaskRequestEvent> createTaskRequest(final int incompleteTaskNum) throws Exception {
    LinkedList<TaskRequestEvent> taskRequestEvents = new LinkedList<>();

    // If the scheduled tasks are long-running, requesting too many containers at once can
    // result in a badly unbalanced cluster load.
    // This part throttles the maximum number of containers requested per request.
    int requestContainerNum = Math.min(incompleteTaskNum, maximumRequestContainer);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Try to schedule task resources: " + requestContainerNum);
    }

    ServiceTracker serviceTracker =
        context.getMasterContext().getQueryMasterContext().getWorkerContext().getServiceTracker();
    NettyClientBase tmClient = RpcClientManager.getInstance().
        getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true, rpcParams);
    QueryCoordinatorProtocolService masterClientService = tmClient.getStub();

    CallFuture<NodeResourceResponse> callBack = new CallFuture<>();
    NodeResourceRequest.Builder request = NodeResourceRequest.newBuilder();
    request.setCapacity(NodeResources.createResource(minTaskMemory).getProto())
        .setNumContainers(requestContainerNum)
        .setPriority(stage.getPriority())
        .setQueryId(context.getMasterContext().getQueryId().getProto())
        .setType(isLeaf ? ResourceType.LEAF : ResourceType.INTERMEDIATE)
        .setUserId(context.getMasterContext().getQueryContext().getUser())
        .setRunningTasks(stage.getTotalScheduledObjectsCount() - stage.getCompletedTaskCount())
        .addAllCandidateNodes(candidateWorkers)
        .setQueue(context.getMasterContext().getQueryContext().get("queue", "default")); //TODO set queue

    masterClientService.reserveNodeResources(callBack.getController(), request.build(), callBack);
    NodeResourceResponse response = callBack.get();

    for (AllocationResourceProto resource : response.getResourceList()) {
      taskRequestEvents.add(new TaskRequestEvent(resource.getWorkerId(), resource, context.getBlockId()));
    }

    return taskRequestEvents;
  }

  @Override
  public int remainingScheduledObjectNum() {
    return scheduledObjectNum;
  }

  public void releaseTaskAttempt(TaskAttempt taskAttempt) {
    if (taskAttempt != null && taskAttempt.isLeafTask() && taskAttempt.getWorkerConnectionInfo() != null) {

      HostVolumeMapping mapping =
          scheduledRequests.leafTaskHostMapping.get(taskAttempt.getWorkerConnectionInfo().getHost());
      if (mapping != null && mapping.lastAssignedVolumeId.containsKey(taskAttempt.getId())) {
        mapping.decreaseConcurrency(mapping.lastAssignedVolumeId.remove(taskAttempt.getId()));
      }
    }
  }

  /**
   * One worker can have multiple running task runners. <code>HostVolumeMapping</code>
   * describes scheduling information for one worker, including:
   * <ul>
   *  <li>host name</li>
   *  <li>rack name</li>
   *  <li>unassigned tasks for each disk volume</li>
   *  <li>last assigned volume id - it can be used for assigning tasks in a round-robin manner</li>
   *  <li>the number of running tasks for each volume</li>
   * </ul>
   *
   * Here, we identify a task runner by {@link ContainerId}, and we use volume ids to identify
   * the disks of a node. A volume id is only used to distinguish disks from each other; we cannot
   * tell which physical disk a given volume id refers to. For details, see the section below.
   *
   * <h3>Volume id</h3>
   * A volume id is an integer, and each volume id identifies a disk volume.
   *
   * This volume id can be obtained from {@link org.apache.hadoop.fs.BlockStorageLocation#getVolumeIds()}.
   *
   * HDFS may not provide a volume id, either for unknown reasons or because the configuration
   * 'dfs.client.file-block-locations.enabled' is disabled. In this case, the volume id will be
   * -1 or another negative integer.
   *
   * <h3>See Also</h3>
   * <ul>
   *  <li>HDFS-3672 (https://issues.apache.org/jira/browse/HDFS-3672).</li>
   * </ul>
   */
  public class HostVolumeMapping {
    private final String host;
    private final String rack;
    /** A key is a disk volume id, and a value is the list of tasks to be scheduled on that volume. */
    private Map<Integer, LinkedHashSet<TaskAttempt>> unassignedTaskForEachVolume =
        Collections.synchronizedMap(new HashMap<>());
    /** A value is the last assigned volume id for each task attempt */
    private HashMap<TaskAttemptId, Integer> lastAssignedVolumeId = Maps.newHashMap();
    /**
     * A key is a disk volume id, and a value is the load of that volume,
     * measured by the number of tasks currently running on it.
     *
     * The volumes are kept in ascending order of volume id, so the head entries
     * are likely to be -1, meaning no volume id was given.
     */
    private SortedMap<Integer, Integer> diskVolumeLoads = new TreeMap<>();
    /** The total number of remaining tasks on this host */
    private AtomicInteger remainTasksNum = new AtomicInteger(0);

    public HostVolumeMapping(String host, String rack){
      this.host = host;
      this.rack = rack;
    }

    public synchronized void addTaskAttempt(int volumeId, TaskAttempt attemptId){
      synchronized (unassignedTaskForEachVolume){
        LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
        if (list == null) {
          list = new LinkedHashSet<>();
          unassignedTaskForEachVolume.put(volumeId, list);
        }
        list.add(attemptId);
      }

      remainTasksNum.incrementAndGet();

      if (!diskVolumeLoads.containsKey(volumeId)) diskVolumeLoads.put(volumeId, 0);
    }

    /**
     * Priorities:
     * 1. a task in a volume of this host
     * 2. an unknown block or a non-splittable task on this host
     * 3. remote tasks. unassignedTaskForEachVolume only contains local tasks, so this may return null.
     */
    public synchronized TaskAttemptId getLocalTask() {
      int volumeId = getLowestVolumeId();
      TaskAttemptId taskAttemptId = null;

      if (unassignedTaskForEachVolume.size() > 0) {
        int retry = diskVolumeLoads.size();
        do {
          // clean up and get a remaining local task
          taskAttemptId = getAndRemove(volumeId);
          if (taskAttemptId == null) {
            // try the next volume
            volumeId = getLowestVolumeId();
            retry--;
          } else {
            lastAssignedVolumeId.put(taskAttemptId, volumeId);
            break;
          }
        } while (retry > 0);
      } else {
        this.remainTasksNum.set(0);
      }

      return taskAttemptId;
    }

    public synchronized TaskAttemptId getTaskAttemptIdByRack(String rack) {
      TaskAttemptId taskAttemptId = null;

      if (unassignedTaskForEachVolume.size() > 0 && this.rack.equals(rack)) {
        int retry = unassignedTaskForEachVolume.size();
        do {
          // clean up and get a remaining task
          int volumeId = getLowestVolumeId();
          taskAttemptId = getAndRemove(volumeId);
          if (taskAttemptId == null) {
            retry--;
          } else {
            break;
          }
        } while (retry > 0);
      }
      return taskAttemptId;
    }
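    /**
     * Removes and returns an unassigned attempt bound to the given volume, or null if the volume
     * has no remaining attempts. The returned attempt is also removed from the mapping of every
     * other host that holds a replica, and the concurrency counter of the volume is increased.
     */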
    private synchronized TaskAttemptId getAndRemove(int volumeId){
      TaskAttemptId taskAttemptId = null;
      if (!unassignedTaskForEachVolume.containsKey(volumeId)) {
        if (volumeId > DataLocation.REMOTE_VOLUME_ID) {
          diskVolumeLoads.remove(volumeId);
        }
        return taskAttemptId;
      }

      LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
      if (list != null && !list.isEmpty()) {
        TaskAttempt taskAttempt;
        synchronized (unassignedTaskForEachVolume) {
          Iterator<TaskAttempt> iterator = list.iterator();
          taskAttempt = iterator.next();
          iterator.remove();
          remainTasksNum.decrementAndGet();
        }

        taskAttemptId = taskAttempt.getId();
        for (DataLocation location : taskAttempt.getTask().getDataLocations()) {
          HostVolumeMapping volumeMapping = scheduledRequests.leafTaskHostMapping.get(location.getHost());
          if (volumeMapping != null) {
            volumeMapping.removeTaskAttempt(location.getVolumeId(), taskAttempt);
          }
        }

        increaseConcurrency(volumeId);
      } else {
        unassignedTaskForEachVolume.remove(volumeId);
      }

      return taskAttemptId;
    }

    private synchronized void removeTaskAttempt(int volumeId, TaskAttempt taskAttempt){
      if (!unassignedTaskForEachVolume.containsKey(volumeId)) return;

      LinkedHashSet<TaskAttempt> tasks = unassignedTaskForEachVolume.get(volumeId);
      if (tasks.remove(taskAttempt)) {
        remainTasksNum.getAndDecrement();
      }

      if (tasks.isEmpty()) {
        unassignedTaskForEachVolume.remove(volumeId);
        if (volumeId > DataLocation.REMOTE_VOLUME_ID) {
          diskVolumeLoads.remove(volumeId);
        }
      }
    }

    /**
     * Increase the number of running tasks, i.e., the disk load, for a certain volume.
     *
     * @param volumeId Volume identifier
     * @return the volume load (i.e., how many running tasks use this volume)
     */
    private synchronized int increaseConcurrency(int volumeId) {

      int concurrency = 1;
      if (diskVolumeLoads.containsKey(volumeId)) {
        concurrency = diskVolumeLoads.get(volumeId) + 1;
      }

      if (volumeId > DataLocation.UNKNOWN_VOLUME_ID) {
        info(LOG, "Assigned host : " + host + ", Volume : " + volumeId + ", Concurrency : " + concurrency);
      } else if (volumeId == DataLocation.UNKNOWN_VOLUME_ID) {
        // this case covers disabled namenode block metadata, compressed text files, or Amazon S3
        info(LOG, "Assigned host : " + host + ", Unknown Volume : " + volumeId + ", Concurrency : " + concurrency);
      } else if (volumeId == DataLocation.REMOTE_VOLUME_ID) {
        // in this case, all blocks on this host have been processed, and the task will be assigned remotely
        info(LOG, "Assigned host : " + host + ", Remaining local tasks : " + getRemainingLocalTaskSize()
            + ", Remote Concurrency : " + concurrency
            + ", Unassigned volumes: " + unassignedTaskForEachVolume.size());
      }
      diskVolumeLoads.put(volumeId, concurrency);
      return concurrency;
    }

    /**
     * Decrease the number of running tasks for a certain volume.
     */
    private synchronized void decreaseConcurrency(int volumeId){
      if (diskVolumeLoads.containsKey(volumeId)) {
        int concurrency = diskVolumeLoads.get(volumeId);
        if (concurrency > 0) {
          diskVolumeLoads.put(volumeId, concurrency - 1);
        }
      }
    }

    /**
     * Volume id ranges:
     *  volume of a host : 0 ~ n
     *  compressed task, Amazon S3, unknown volume : -1
     *  remote task : -2
     */
    public int getLowestVolumeId(){
      Map.Entry<Integer, Integer> volumeEntry = null;

      for (Map.Entry<Integer, Integer> entry : diskVolumeLoads.entrySet()) {
        if (volumeEntry == null) volumeEntry = entry;

        if (entry.getKey() != DataLocation.REMOTE_VOLUME_ID && volumeEntry.getValue() >= entry.getValue()) {
          volumeEntry = entry;
        }
      }

      if (volumeEntry != null) {
        return volumeEntry.getKey();
      } else {
        return DataLocation.REMOTE_VOLUME_ID;
      }
    }

    public int getRemoteConcurrency(){
      return getVolumeConcurrency(DataLocation.REMOTE_VOLUME_ID);
    }

    public int getVolumeConcurrency(int volumeId){
      Integer size = diskVolumeLoads.get(volumeId);
      if (size == null) return 0;
      else return size;
    }

    public int getRemainingLocalTaskSize(){
      return remainTasksNum.get();
    }

    public String getHost() {
      return host;
    }

    public String getRack() {
      return rack;
    }
  }
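  /**
   * Returns the given task attempt to the scheduler queues (and, for leaf tasks, releases its
   * volume slot), then fires a TA_ASSIGN_CANCEL event for the attempt.
   */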
  protected void cancel(TaskAttempt taskAttempt) {

    TaskAttemptToSchedulerEvent schedulerEvent = new TaskAttemptToSchedulerEvent(
        EventType.T_SCHEDULE, taskAttempt.getTask().getId().getExecutionBlockId(), null, taskAttempt);

    if (taskAttempt.isLeafTask()) {
      releaseTaskAttempt(taskAttempt);
      scheduledRequests.addLeafTask(schedulerEvent);
    } else {
      scheduledRequests.addNonLeafTask(schedulerEvent);
    }

    context.getMasterContext().getEventHandler().handle(
        new TaskAttemptEvent(taskAttempt.getId(), TaskAttemptEventType.TA_ASSIGN_CANCEL));
  }

  protected int cancel(List<TaskAllocationProto> tasks) {
    int canceled = 0;
    for (TaskAllocationProto proto : tasks) {
      TaskAttemptId attemptId = new TaskAttemptId(proto.getTaskRequest().getId());
      cancel(stage.getTask(attemptId.getTaskId()).getAttempt(attemptId));
      canceled++;
    }
    return canceled;
  }

  private class ScheduledRequests {
    // The two sets, leafTasks and nonLeafTasks, keep all tasks that remain to be scheduled.
    // Even if a task is present in leafTaskHostMapping or leafTasksRackMapping, it will not be
    // sent to a task runner unless it is also contained in leafTasks or nonLeafTasks.
    private final Set<TaskAttemptId> leafTasks = Collections.synchronizedSet(new HashSet<>());
    private final Set<TaskAttemptId> nonLeafTasks = Collections.synchronizedSet(new HashSet<>());
    private Map<String, HostVolumeMapping> leafTaskHostMapping = Maps.newConcurrentMap();
    private final Map<String, HashSet<TaskAttemptId>> leafTasksRackMapping = Maps.newConcurrentMap();

    protected void clear() {
      leafTasks.clear();
      nonLeafTasks.clear();
      leafTaskHostMapping.clear();
      leafTasksRackMapping.clear();
    }

    private void addLeafTask(TaskAttemptToSchedulerEvent event) {
      TaskAttempt taskAttempt = event.getTaskAttempt();
      List<DataLocation> locations = taskAttempt.getTask().getDataLocations();

      for (DataLocation location : locations) {
        String host = location.getHost();
        leafTaskHosts.add(host);

        HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);
        if (hostVolumeMapping == null) {
          String rack = RackResolver.resolve(host).getNetworkLocation();
          hostVolumeMapping = new HostVolumeMapping(host, rack);
          leafTaskHostMapping.put(host, hostVolumeMapping);
        }
        hostVolumeMapping.addTaskAttempt(location.getVolumeId(), taskAttempt);

        if (LOG.isDebugEnabled()) {
          LOG.debug("Added attempt req to host " + host);
        }

        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(hostVolumeMapping.getRack());
        if (list == null) {
          list = new HashSet<>();
          leafTasksRackMapping.put(hostVolumeMapping.getRack(), list);
        }

        list.add(taskAttempt.getId());

        if (LOG.isDebugEnabled()) {
          LOG.debug("Added attempt req to rack " + hostVolumeMapping.getRack());
        }
      }

      leafTasks.add(taskAttempt.getId());
    }

    private void addNonLeafTask(TaskAttemptToSchedulerEvent event) {
      nonLeafTasks.add(event.getTaskAttempt().getId());
    }

    public int leafTaskNum() {
      return leafTasks.size();
    }

    public int nonLeafTaskNum() {
      return nonLeafTasks.size();
    }

    private TaskAttemptId allocateLocalTask(String host){
      HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

      if (hostVolumeMapping != null) { // the tajo worker is co-located with a hadoop datanode
        for (int i = 0; i < hostVolumeMapping.getRemainingLocalTaskSize(); i++) {
          TaskAttemptId attemptId = hostVolumeMapping.getLocalTask();

          if (attemptId == null) break; // no more local tasks remain

          if (leafTasks.contains(attemptId)) {
            leafTasks.remove(attemptId);
            return attemptId;
          }
        }
      }
      return null;
    }
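    /**
     * Allocates a rack-local task for the given host: it first takes tasks from the hosts in the
     * same rack that have the most remaining work, and if that fails it falls back to the
     * rack-level task mapping.
     */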
    private TaskAttemptId allocateRackTask(String host) {

      List<HostVolumeMapping> remainingTasks = Lists.newArrayList(leafTaskHostMapping.values());
      String rack = RackResolver.resolve(host).getNetworkLocation();
      TaskAttemptId attemptId = null;

      if (remainingTasks.size() > 0) {
        synchronized (scheduledRequests) {
          // visit the hosts in this rack with the most remaining tasks first
          Collections.sort(remainingTasks, new Comparator<HostVolumeMapping>() {
            @Override
            public int compare(HostVolumeMapping v1, HostVolumeMapping v2) {
              // descending order of remaining tasks
              if (v2.remainTasksNum.get() > v1.remainTasksNum.get()) {
                return 1;
              } else if (v2.remainTasksNum.get() == v1.remainTasksNum.get()) {
                return 0;
              } else {
                return -1;
              }
            }
          });
        }

        for (HostVolumeMapping tasks : remainingTasks) {
          for (int i = 0; i < tasks.getRemainingLocalTaskSize(); i++) {
            TaskAttemptId tId = tasks.getTaskAttemptIdByRack(rack);

            if (tId == null) break;

            if (leafTasks.contains(tId)) {
              leafTasks.remove(tId);
              attemptId = tId;
              break;
            }
          }
          if (attemptId != null) break;
        }
      }

      // find a task in the rack-level mapping
      if (attemptId == null) {
        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(rack);
        if (list != null) {
          synchronized (list) {
            Iterator<TaskAttemptId> iterator = list.iterator();
            while (iterator.hasNext()) {
              TaskAttemptId tId = iterator.next();
              iterator.remove();
              if (leafTasks.contains(tId)) {
                leafTasks.remove(tId);
                attemptId = tId;
                break;
              }
            }
          }
        }
      }

      return attemptId;
    }

    public void assignToLeafTasks(LinkedList<TaskRequestEvent> taskRequests) throws InterruptedException {
      Collections.shuffle(taskRequests);
      LinkedList<TaskRequestEvent> remoteTaskRequests = new LinkedList<>();
      String queryMasterHostAndPort = context.getMasterContext().getQueryMasterContext().getWorkerContext().
          getConnectionInfo().getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (leafTasks.size() > 0 && (!taskRequests.isEmpty() || !remoteTaskRequests.isEmpty())) {
        int localAssign = 0;
        int rackAssign = 0;

        taskRequest = taskRequests.pollFirst();
        if (taskRequest == null) { // if there are only remote task requests
          taskRequest = remoteTaskRequests.pollFirst();
        }

        // Check whether this container is still alive. If not, ignore the task request
        // and stop the task runner.
        WorkerConnectionInfo connectionInfo =
            context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());
        if (connectionInfo == null) continue;

        // get the hostname of the requesting node
        String host = connectionInfo.getHost();

        // if no worker in the leaf-task host mapping matches this hostname
        if (!leafTaskHostMapping.containsKey(host) && !taskRequests.isEmpty()) {
          String normalizedHost = NetUtils.normalizeHost(host);

          if (!leafTaskHostMapping.containsKey(normalizedHost)) {
            // This means one of the following cases:
            // * no blocks reside on this node, or
            // * all blocks residing on this node have already been consumed, and this task runner
            //   requests a remote task.
            // In either case, we move the task request to the remote request list and skip the
            // rest of this iteration.
            remoteTaskRequests.add(taskRequest);
            continue;
          } else {
            host = normalizedHost;
          }
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("assignToLeafTasks: " + taskRequest.getExecutionBlockId() + "," +
              "worker=" + connectionInfo.getHostAndPeerRpcPort());
        }

        //////////////////////////////////////////////////////////////////////
        // disk or host-local allocation
        //////////////////////////////////////////////////////////////////////
        TaskAttemptId attemptId = allocateLocalTask(host);

        int assignedVolume = DataLocation.REMOTE_VOLUME_ID;
        HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);
        if (attemptId == null) { // if a local task cannot be found
          if (!taskRequests.isEmpty()) {
            // if other requests remain, move this one to the remote list for better locality
            remoteTaskRequests.add(taskRequest);
            candidateWorkers.remove(connectionInfo.getId());
            continue;

          } else {
            if (hostVolumeMapping != null) {
              int nodes = context.getMasterContext().getWorkerMap().size();

              // this part controls the assignment of tail tasks and balances remote tasks per node
              int tailLimit = 1;
              if (remainingScheduledObjectNum() > 0 && nodes > 0) {
                tailLimit = Math.max(remainingScheduledObjectNum() / nodes, 1);
              }

              // remote task throttling per node
              if (nodes > 1 && hostVolumeMapping.getRemoteConcurrency() >= tailLimit) {
                continue;
              } else {
                // assign to remote volume
                hostVolumeMapping.increaseConcurrency(assignedVolume);
              }
            }
          }

          //////////////////////////////////////////////////////////////////////
          // rack-local allocation
          //////////////////////////////////////////////////////////////////////
          attemptId = allocateRackTask(host);

          //////////////////////////////////////////////////////////////////////
          // random node allocation
          //////////////////////////////////////////////////////////////////////
          if (attemptId == null && leafTaskNum() > 0) {
            synchronized (leafTasks){
              attemptId = leafTasks.iterator().next();
              leafTasks.remove(attemptId);
            }
          }

          if (attemptId != null && hostVolumeMapping != null) {
            hostVolumeMapping.lastAssignedVolumeId.put(attemptId, assignedVolume);
          }
          rackAssign++;
        } else {
          if (hostVolumeMapping != null) {
            // set to the real volume id
            assignedVolume = hostVolumeMapping.lastAssignedVolumeId.get(attemptId);
          }
          localAssign++;
        }

        if (attemptId != null) {
          Task task = stage.getTask(attemptId.getTaskId());
          TaskRequest taskAssign = new TaskRequestImpl(
              attemptId,
              new ArrayList<>(task.getAllFragments()),
              "",
              false,
              LogicalNodeSerializer.serialize(task.getLogicalPlan()),
              context.getMasterContext().getQueryContext(),
              stage.getDataChannel(),
              stage.getBlock().getEnforcer(),
              queryMasterHostAndPort);

          NodeResource resource = new NodeResource(taskRequest.getResponseProto().getResource());
          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }

          //TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(TaskAllocationProto.newBuilder()
              .setResource(resource.getProto()).setVolumeId(assignedVolume)
              .setTaskRequest(taskAssign.getProto()).build());
          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());

          context.getMasterContext().getEventHandler().handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null) addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc = null;
          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();
          totalAttempts++;
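          // Dispatch the allocation request to the worker over the TajoWorkerProtocol RPC.
          // If the worker rejects the allocation or the connection fails, the request is
          // canceled and the attempt is returned to the scheduler.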
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true, rpcParams);
            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto = callFuture.get();

            if (responseProto.getCancellationTaskCount() > 0) {
              cancellation += cancel(responseProto.getCancellationTaskList());
              info(LOG, "Canceled requests: " + responseProto.getCancellationTaskCount() + " from " + addr);
              continue;
            }
          } catch (ExecutionException | ConnectException e) {
            cancellation += cancel(requestProto.getTaskRequestList());
            warn(LOG, "Canceled requests: " + requestProto.getTaskRequestCount()
                + " by " + ExceptionUtils.getFullStackTrace(e));
            continue;
          } catch (InterruptedException e) {
            throw e;
          } catch (Exception e) {
            throw new TajoInternalError(e);
          }

          scheduledObjectNum--;
          totalAssigned++;
          hostLocalAssigned += localAssign;
          rackLocalAssigned += rackAssign;

          if (rackAssign > 0) {
            info(LOG, String.format("Assigned Local/Rack/Total: (%d/%d/%d), " +
                    "Attempted Cancel/Assign/Total: (%d/%d/%d), " +
                    "Locality: %.2f%%, Rack host: %s",
                hostLocalAssigned, rackLocalAssigned, totalAssigned,
                cancellation, totalAssigned, totalAttempts,
                ((double) hostLocalAssigned / (double) totalAssigned) * 100, host));
          }
        } else {
          throw new RuntimeException("Illegal state: a task attempt should have been assigned at this point");
        }
      }
    }

    private boolean checkIfInterQuery(MasterPlan masterPlan, ExecutionBlock block) {
      if (masterPlan.isRoot(block)) {
        return false;
      }

      ExecutionBlock parent = masterPlan.getParent(block);
      if (masterPlan.isRoot(parent) && parent.isUnionOnly()) {
        return false;
      }

      return true;
    }
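    /**
     * Assigns intermediate (non-leaf) tasks to the requesting workers. Non-leaf tasks read
     * shuffled data via fetches rather than local blocks, so they are handed out in arbitrary
     * order without any locality consideration.
     */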
    public void assignToNonLeafTasks(LinkedList<TaskRequestEvent> taskRequests) throws InterruptedException {
      Collections.shuffle(taskRequests);
      String queryMasterHostAndPort = context.getMasterContext().getQueryMasterContext().getWorkerContext().
          getConnectionInfo().getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (!taskRequests.isEmpty()) {
        taskRequest = taskRequests.pollFirst();
        LOG.debug("assignToNonLeafTasks: " + taskRequest.getExecutionBlockId());

        TaskAttemptId attemptId;
        // random allocation
        if (nonLeafTasks.size() > 0) {
          synchronized (nonLeafTasks){
            attemptId = nonLeafTasks.iterator().next();
            nonLeafTasks.remove(attemptId);
          }
          LOG.debug("Assigned based on * match");

          Task task;
          task = stage.getTask(attemptId.getTaskId());

          TaskRequest taskAssign = new TaskRequestImpl(
              attemptId,
              Lists.newArrayList(task.getAllFragments()),
              "",
              false,
              LogicalNodeSerializer.serialize(task.getLogicalPlan()),
              context.getMasterContext().getQueryContext(),
              stage.getDataChannel(),
              stage.getBlock().getEnforcer(),
              queryMasterHostAndPort);

          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }
          for (Map.Entry<String, Set<FetchProto>> entry : task.getFetchMap().entrySet()) {
            Collection<FetchProto> fetches = entry.getValue();
            if (fetches != null) {
              for (FetchProto fetch : fetches) {
                taskAssign.addFetch(fetch);
              }
            }
          }

          WorkerConnectionInfo connectionInfo =
              context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());

          //TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(TaskAllocationProto.newBuilder()
              .setResource(taskRequest.getResponseProto().getResource())
              .setTaskRequest(taskAssign.getProto()).build());
          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());

          context.getMasterContext().getEventHandler().handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null) addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc;
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true, rpcParams);
            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto = callFuture.get();

            if (responseProto.getCancellationTaskCount() > 0) {
              cancellation += cancel(responseProto.getCancellationTaskList());
              info(LOG, "Canceled requests: " + responseProto.getCancellationTaskCount() + " from " + addr);
              continue;
            }
          } catch (ExecutionException | ConnectException e) {
            cancellation += cancel(requestProto.getTaskRequestList());
            warn(LOG, "Canceled requests: " + requestProto.getTaskRequestCount()
                + " by " + ExceptionUtils.getFullStackTrace(e));
            continue;
          } catch (InterruptedException e) {
            throw e;
          } catch (Exception e) {
            throw new TajoInternalError(e);
          }

          totalAssigned++;
          scheduledObjectNum--;
        }
      }
    }
  }
}