/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.querymaster;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.util.RackResolver;
import org.apache.tajo.TaskAttemptId;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.engine.planner.global.ExecutionBlock;
import org.apache.tajo.engine.planner.global.MasterPlan;
import org.apache.tajo.engine.query.TaskRequest;
import org.apache.tajo.engine.query.TaskRequestImpl;
import org.apache.tajo.exception.TajoInternalError;
import org.apache.tajo.ipc.QueryCoordinatorProtocol;
import org.apache.tajo.ipc.QueryCoordinatorProtocol.QueryCoordinatorProtocolService;
import org.apache.tajo.ipc.TajoWorkerProtocol;
import org.apache.tajo.master.cluster.WorkerConnectionInfo;
import org.apache.tajo.master.event.*;
import org.apache.tajo.master.event.TaskAttemptToSchedulerEvent.TaskAttemptScheduleContext;
import org.apache.tajo.master.event.TaskSchedulerEvent.EventType;
import org.apache.tajo.plan.serder.LogicalNodeSerializer;
import org.apache.tajo.resource.NodeResource;
import org.apache.tajo.resource.NodeResources;
import org.apache.tajo.rpc.AsyncRpcClient;
import org.apache.tajo.rpc.CallFuture;
import org.apache.tajo.rpc.NettyClientBase;
import org.apache.tajo.rpc.RpcClientManager;
import org.apache.tajo.service.ServiceTracker;
import org.apache.tajo.storage.DataLocation;
import org.apache.tajo.storage.fragment.Fragment;
import org.apache.tajo.util.NetUtils;
import org.apache.tajo.util.RpcParameterFactory;
import org.apache.tajo.util.TUtil;

import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import static org.apache.tajo.ResourceProtos.*;

public class DefaultTaskScheduler extends AbstractTaskScheduler {
  private static final Log LOG = LogFactory.getLog(DefaultTaskScheduler.class);

  private final TaskSchedulerContext context;
  private Stage stage;
  private TajoConf tajoConf;
  private Properties rpcParams;

  private Thread schedulingThread;
  private volatile boolean isStopped;
  private AtomicBoolean needWakeup = new AtomicBoolean();

  private ScheduledRequests scheduledRequests;

  private int minTaskMemory;
  private int nextTaskId = 0;
  private int scheduledObjectNum = 0;
  private boolean isLeaf;
  private int schedulerDelay;
  private int maximumRequestContainer;
  // candidate workers used to preserve locality for high-priority tasks
  private Set<Integer> candidateWorkers = Sets.newHashSet();

  public DefaultTaskScheduler(TaskSchedulerContext context, Stage stage) {
    super(DefaultTaskScheduler.class.getName());
    this.context = context;
    this.stage = stage;
  }

  @Override
  public void init(Configuration conf) {
    tajoConf = TUtil.checkTypeAndGet(conf, TajoConf.class);
    rpcParams = RpcParameterFactory.get(tajoConf);
    scheduledRequests = new ScheduledRequests();
    minTaskMemory = tajoConf.getIntVar(TajoConf.ConfVars.TASK_RESOURCE_MINIMUM_MEMORY);
    schedulerDelay = tajoConf.getIntVar(TajoConf.ConfVars.QUERYMASTER_TASK_SCHEDULER_DELAY);
    isLeaf = stage.getMasterPlan().isLeaf(stage.getBlock());

    this.schedulingThread = new Thread() {
      public void run() {
        while (!isStopped && !Thread.currentThread().isInterrupted()) {
          try {
            schedule();
          } catch (InterruptedException e) {
            if (isStopped) {
              break;
            } else {
              LOG.fatal(e.getMessage(), e);
              stage.abort(StageState.ERROR, e);
            }
          } catch (Throwable e) {
            LOG.fatal(e.getMessage(), e);
            stage.abort(StageState.ERROR, e);
            break;
          }
        }
        info(LOG, "TaskScheduler schedulingThread stopped");
      }
    };

    super.init(conf);
  }

  @Override
  public void start() {
    info(LOG, "Start TaskScheduler");
    maximumRequestContainer = Math.min(
        tajoConf.getIntVar(TajoConf.ConfVars.QUERYMASTER_TASK_SCHEDULER_REQUEST_MAX_NUM),
        stage.getContext().getWorkerMap().size());

    if (isLeaf) {
      candidateWorkers.addAll(getWorkerIds(getLeafTaskHosts()));
    } else {
      // find the workers assigned to the child execution blocks, for non-leaf locality
      List<ExecutionBlock> executionBlockList = stage.getMasterPlan().getChilds(stage.getBlock());
      for (ExecutionBlock executionBlock : executionBlockList) {
        Stage childStage = stage.getContext().getStage(executionBlock.getId());
        candidateWorkers.addAll(childStage.getAssignedWorkerMap().keySet());
      }
    }
    this.schedulingThread.start();
    super.start();
  }

  @Override
  public void stop() {
    isStopped = true;

    if (schedulingThread != null) {
      synchronized (schedulingThread) {
        schedulingThread.interrupt();
      }
    }
    candidateWorkers.clear();
    scheduledRequests.clear();
    info(LOG, "Task Scheduler stopped");
    super.stop();
  }

  protected void info(Log log, String message) {
    log.info(String.format("[%s] %s", stage.getId(), message));
  }

  protected void warn(Log log, String message) {
    log.warn(String.format("[%s] %s", stage.getId(), message));
  }

  private Fragment[] fragmentsForNonLeafTask;
  private Fragment[] broadcastFragmentsForNonLeafTask;

  public void schedule() throws Exception {
    try {
      final int incompleteTaskNum = scheduledRequests.leafTaskNum() + scheduledRequests.nonLeafTaskNum();
      if (incompleteTaskNum == 0) {
        needWakeup.set(true); // all tasks are done, or no tasks have been scheduled yet
        synchronized (schedulingThread) {
          schedulingThread.wait(1000);
        }
      } else {
        LinkedList<TaskRequestEvent> taskRequests = createTaskRequest(incompleteTaskNum);
        if (taskRequests.size() == 0) {
          synchronized (schedulingThread) {
            schedulingThread.wait(schedulerDelay);
          }
        } else {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Got " + taskRequests.size() + " taskRequestEvents");
          }
          if (isLeaf) {
            scheduledRequests.assignToLeafTasks(taskRequests);
          } else {
            scheduledRequests.assignToNonLeafTasks(taskRequests);
          }
        }
      }
    } catch (TimeoutException e) {
      LOG.error(e.getMessage());
    }
  }
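  /**
   * Receives scheduler events from the stage: fragment schedule events for leaf tasks,
   * fetch schedule events for intermediate (non-leaf) tasks, task-attempt schedule events,
   * and cancellation events for attempts that are still unassigned.
   */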
  @Override
  public void handle(TaskSchedulerEvent event) {
    if (event.getType() == EventType.T_SCHEDULE) {
      if (event instanceof FragmentScheduleEvent) {
        FragmentScheduleEvent castEvent = (FragmentScheduleEvent) event;
        if (context.isLeafQuery()) {
          TaskAttemptScheduleContext taskContext = new TaskAttemptScheduleContext();
          Task task = Stage.newEmptyTask(context, taskContext, stage, nextTaskId++);
          task.addFragment(castEvent.getLeftFragment(), true);
          scheduledObjectNum++;
          if (castEvent.hasRightFragments()) {
            task.addFragments(castEvent.getRightFragments());
          }
          stage.getEventHandler().handle(new TaskEvent(task.getId(), TaskEventType.T_SCHEDULE));
        } else {
          fragmentsForNonLeafTask = new Fragment[2];
          fragmentsForNonLeafTask[0] = castEvent.getLeftFragment();
          if (castEvent.hasRightFragments()) {
            Collection<Fragment> var = castEvent.getRightFragments();
            Fragment[] rightFragments = var.toArray(new Fragment[var.size()]);
            fragmentsForNonLeafTask[1] = rightFragments[0];
            if (rightFragments.length > 1) {
              broadcastFragmentsForNonLeafTask = new Fragment[rightFragments.length - 1];
              System.arraycopy(rightFragments, 1, broadcastFragmentsForNonLeafTask, 0,
                  broadcastFragmentsForNonLeafTask.length);
            } else {
              broadcastFragmentsForNonLeafTask = null;
            }
          }
        }
      } else if (event instanceof FetchScheduleEvent) {
        FetchScheduleEvent castEvent = (FetchScheduleEvent) event;
        Map<String, List<FetchProto>> fetches = castEvent.getFetches();
        TaskAttemptScheduleContext taskScheduleContext = new TaskAttemptScheduleContext();
        Task task = Stage.newEmptyTask(context, taskScheduleContext, stage, nextTaskId++);
        scheduledObjectNum++;
        for (Entry<String, List<FetchProto>> eachFetch : fetches.entrySet()) {
          task.addFetches(eachFetch.getKey(), eachFetch.getValue());
          task.addFragment(fragmentsForNonLeafTask[0], true);
          if (fragmentsForNonLeafTask[1] != null) {
            task.addFragment(fragmentsForNonLeafTask[1], true);
          }
        }
        if (broadcastFragmentsForNonLeafTask != null && broadcastFragmentsForNonLeafTask.length > 0) {
          task.addFragments(Arrays.asList(broadcastFragmentsForNonLeafTask));
        }
        stage.getEventHandler().handle(new TaskEvent(task.getId(), TaskEventType.T_SCHEDULE));
      } else if (event instanceof TaskAttemptToSchedulerEvent) {
        TaskAttemptToSchedulerEvent castEvent = (TaskAttemptToSchedulerEvent) event;
        if (context.isLeafQuery()) {
          scheduledRequests.addLeafTask(castEvent);
        } else {
          scheduledRequests.addNonLeafTask(castEvent);
        }

        if (needWakeup.getAndSet(false)) {
          // wake up the scheduler thread once new attempts have been scheduled
          synchronized (schedulingThread) {
            schedulingThread.notifyAll();
          }
        }
      }
    } else if (event.getType() == EventType.T_SCHEDULE_CANCEL) {
      // When a stage is killed, unassigned task attempts are removed from the scheduler.
      // This event is triggered by a TaskAttempt.
      TaskAttemptToSchedulerEvent castedEvent = (TaskAttemptToSchedulerEvent) event;
      scheduledRequests.leafTasks.remove(castedEvent.getTaskAttempt().getId());
      LOG.info(castedEvent.getTaskAttempt().getId() + " is canceled from " + this.getClass().getSimpleName());
      ((TaskAttemptToSchedulerEvent) event).getTaskAttempt().handle(
          new TaskAttemptEvent(castedEvent.getTaskAttempt().getId(), TaskAttemptEventType.TA_SCHEDULE_CANCELED));
    }
  }

  private Set<Integer> getWorkerIds(Collection<String> hosts) {
    Set<Integer> workerIds = Sets.newHashSet();
    if (hosts.isEmpty()) return workerIds;

    for (WorkerConnectionInfo worker : stage.getContext().getWorkerMap().values()) {
      if (hosts.contains(worker.getHost())) {
        workerIds.add(worker.getId());
      }
    }
    return workerIds;
  }
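  /**
   * Asks the query coordinator to reserve node resources for up to
   * {@code maximumRequestContainer} containers and converts each granted allocation
   * into a {@link TaskRequestEvent}.
   */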
  protected LinkedList<TaskRequestEvent> createTaskRequest(final int incompleteTaskNum) throws Exception {
    LinkedList<TaskRequestEvent> taskRequestEvents = new LinkedList<>();

    // If the scheduled tasks are long-running, requesting too many containers at once can
    // result in a badly unbalanced cluster load.
    // This part throttles the maximum number of containers requested per request.
    int requestContainerNum = Math.min(incompleteTaskNum, maximumRequestContainer);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Try to schedule task resources: " + requestContainerNum);
    }

    ServiceTracker serviceTracker =
        context.getMasterContext().getQueryMasterContext().getWorkerContext().getServiceTracker();
    NettyClientBase tmClient = RpcClientManager.getInstance().
        getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true, rpcParams);
    QueryCoordinatorProtocolService masterClientService = tmClient.getStub();

    CallFuture<NodeResourceResponse> callBack = new CallFuture<>();
    NodeResourceRequest.Builder request = NodeResourceRequest.newBuilder();
    request.setCapacity(NodeResources.createResource(minTaskMemory).getProto())
        .setNumContainers(requestContainerNum)
        .setPriority(stage.getPriority())
        .setQueryId(context.getMasterContext().getQueryId().getProto())
        .setType(isLeaf ? ResourceType.LEAF : ResourceType.INTERMEDIATE)
        .setUserId(context.getMasterContext().getQueryContext().getUser())
        .setRunningTasks(stage.getTotalScheduledObjectsCount() - stage.getCompletedTaskCount())
        .addAllCandidateNodes(candidateWorkers)
        .setQueue(context.getMasterContext().getQueryContext().get("queue", "default")); //TODO set queue

    masterClientService.reserveNodeResources(callBack.getController(), request.build(), callBack);
    NodeResourceResponse response = callBack.get();

    for (AllocationResourceProto resource : response.getResourceList()) {
      taskRequestEvents.add(new TaskRequestEvent(resource.getWorkerId(), resource, context.getBlockId()));
    }

    return taskRequestEvents;
  }

  @Override
  public int remainingScheduledObjectNum() {
    return scheduledObjectNum;
  }

  public void releaseTaskAttempt(TaskAttempt taskAttempt) {
    if (taskAttempt != null && taskAttempt.isLeafTask() && taskAttempt.getWorkerConnectionInfo() != null) {

      HostVolumeMapping mapping =
          scheduledRequests.leafTaskHostMapping.get(taskAttempt.getWorkerConnectionInfo().getHost());
      if (mapping != null && mapping.lastAssignedVolumeId.containsKey(taskAttempt.getId())) {
        mapping.decreaseConcurrency(mapping.lastAssignedVolumeId.remove(taskAttempt.getId()));
      }
    }
  }

  /**
   * One worker can have multiple running task runners. <code>HostVolumeMapping</code>
   * describes scheduling information for one worker, including:
   * <ul>
   *  <li>host name</li>
   *  <li>rack name</li>
   *  <li>unassigned tasks for each disk volume</li>
   *  <li>last assigned volume id - it can be used for assigning tasks in a round-robin manner</li>
   *  <li>the number of running tasks for each volume</li>
   * </ul>
   *
   * Here, we identify a task runner by {@link ContainerId}, and we use volume ids to identify
   * the disks of a node. A volume id is only used to distinguish disks from each other; we cannot
   * tell which physical disk a given volume id refers to. For details, see the section below.
   *
   * <h3>Volume id</h3>
   * A volume id is an integer, and each volume id identifies a disk volume.
   *
   * This volume id can be obtained from {@link org.apache.hadoop.fs.BlockStorageLocation#getVolumeIds()}.
   *
   * HDFS may not provide a volume id, either for unknown reasons or because the configuration
   * 'dfs.client.file-block-locations.enabled' is disabled. In this case, the volume id will be
   * -1 or another negative integer.
   *
   * <h3>See Also</h3>
   * <ul>
   *  <li>HDFS-3672 (https://issues.apache.org/jira/browse/HDFS-3672).</li>
   * </ul>
   */
  public class HostVolumeMapping {
    private final String host;
    private final String rack;
    /** A key is a disk volume id, and a value is the list of tasks to be scheduled on that volume. */
    private Map<Integer, LinkedHashSet<TaskAttempt>> unassignedTaskForEachVolume =
        Collections.synchronizedMap(new HashMap<>());
    /** A value is the last assigned volume id for each task attempt */
    private HashMap<TaskAttemptId, Integer> lastAssignedVolumeId = Maps.newHashMap();
    /**
     * A key is a disk volume id, and a value is the load of that volume,
     * measured by the number of tasks currently running on it.
     *
     * The volumes are kept in ascending order of volume id, so the head entries
     * are likely to be -1, meaning no volume id was given.
     */
    private SortedMap<Integer, Integer> diskVolumeLoads = new TreeMap<>();
    /** The total number of remaining tasks on this host */
    private AtomicInteger remainTasksNum = new AtomicInteger(0);

    public HostVolumeMapping(String host, String rack){
      this.host = host;
      this.rack = rack;
    }

    public synchronized void addTaskAttempt(int volumeId, TaskAttempt attemptId){
      synchronized (unassignedTaskForEachVolume){
        LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
        if (list == null) {
          list = new LinkedHashSet<>();
          unassignedTaskForEachVolume.put(volumeId, list);
        }
        list.add(attemptId);
      }

      remainTasksNum.incrementAndGet();

      if (!diskVolumeLoads.containsKey(volumeId)) diskVolumeLoads.put(volumeId, 0);
    }

    /**
     * Priorities:
     * 1. a task in a volume of this host
     * 2. an unknown block or a non-splittable task on this host
     * 3. remote tasks. unassignedTaskForEachVolume only contains local tasks, so this may return null.
     */
    public synchronized TaskAttemptId getLocalTask() {
      int volumeId = getLowestVolumeId();
      TaskAttemptId taskAttemptId = null;

      if (unassignedTaskForEachVolume.size() > 0) {
        int retry = diskVolumeLoads.size();
        do {
          // clean up and get a remaining local task
          taskAttemptId = getAndRemove(volumeId);
          if (taskAttemptId == null) {
            // try the next volume
            volumeId = getLowestVolumeId();
            retry--;
          } else {
            lastAssignedVolumeId.put(taskAttemptId, volumeId);
            break;
          }
        } while (retry > 0);
      } else {
        this.remainTasksNum.set(0);
      }

      return taskAttemptId;
    }

    public synchronized TaskAttemptId getTaskAttemptIdByRack(String rack) {
      TaskAttemptId taskAttemptId = null;

      if (unassignedTaskForEachVolume.size() > 0 && this.rack.equals(rack)) {
        int retry = unassignedTaskForEachVolume.size();
        do {
          // clean up and get a remaining task
          int volumeId = getLowestVolumeId();
          taskAttemptId = getAndRemove(volumeId);
          if (taskAttemptId == null) {
            retry--;
          } else {
            break;
          }
        } while (retry > 0);
      }
      return taskAttemptId;
    }
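    /**
     * Removes and returns an unassigned attempt bound to the given volume, or null if the volume
     * has no remaining attempts. The returned attempt is also removed from the mapping of every
     * other host that holds a replica, and the concurrency counter of the volume is increased.
     */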
    private synchronized TaskAttemptId getAndRemove(int volumeId){
      TaskAttemptId taskAttemptId = null;
      if (!unassignedTaskForEachVolume.containsKey(volumeId)) {
        if (volumeId > DataLocation.REMOTE_VOLUME_ID) {
          diskVolumeLoads.remove(volumeId);
        }
        return taskAttemptId;
      }

      LinkedHashSet<TaskAttempt> list = unassignedTaskForEachVolume.get(volumeId);
      if (list != null && !list.isEmpty()) {
        TaskAttempt taskAttempt;
        synchronized (unassignedTaskForEachVolume) {
          Iterator<TaskAttempt> iterator = list.iterator();
          taskAttempt = iterator.next();
          iterator.remove();
          remainTasksNum.decrementAndGet();
        }

        taskAttemptId = taskAttempt.getId();
        for (DataLocation location : taskAttempt.getTask().getDataLocations()) {
          HostVolumeMapping volumeMapping = scheduledRequests.leafTaskHostMapping.get(location.getHost());
          if (volumeMapping != null) {
            volumeMapping.removeTaskAttempt(location.getVolumeId(), taskAttempt);
          }
        }

        increaseConcurrency(volumeId);
      } else {
        unassignedTaskForEachVolume.remove(volumeId);
      }

      return taskAttemptId;
    }

    private synchronized void removeTaskAttempt(int volumeId, TaskAttempt taskAttempt){
      if (!unassignedTaskForEachVolume.containsKey(volumeId)) return;

      LinkedHashSet<TaskAttempt> tasks = unassignedTaskForEachVolume.get(volumeId);
      if (tasks.remove(taskAttempt)) {
        remainTasksNum.getAndDecrement();
      }

      if (tasks.isEmpty()) {
        unassignedTaskForEachVolume.remove(volumeId);
        if (volumeId > DataLocation.REMOTE_VOLUME_ID) {
          diskVolumeLoads.remove(volumeId);
        }
      }
    }

    /**
     * Increase the number of running tasks, i.e., the disk load, for a certain volume.
     *
     * @param volumeId Volume identifier
     * @return the volume load (i.e., how many running tasks use this volume)
     */
    private synchronized int increaseConcurrency(int volumeId) {

      int concurrency = 1;
      if (diskVolumeLoads.containsKey(volumeId)) {
        concurrency = diskVolumeLoads.get(volumeId) + 1;
      }

      if (volumeId > DataLocation.UNKNOWN_VOLUME_ID) {
        info(LOG, "Assigned host : " + host + ", Volume : " + volumeId + ", Concurrency : " + concurrency);
      } else if (volumeId == DataLocation.UNKNOWN_VOLUME_ID) {
        // this case covers disabled namenode block metadata, compressed text files, or Amazon S3
        info(LOG, "Assigned host : " + host + ", Unknown Volume : " + volumeId + ", Concurrency : " + concurrency);
      } else if (volumeId == DataLocation.REMOTE_VOLUME_ID) {
        // in this case, all blocks on this host have been processed, and the task will be assigned remotely
        info(LOG, "Assigned host : " + host + ", Remaining local tasks : " + getRemainingLocalTaskSize()
            + ", Remote Concurrency : " + concurrency
            + ", Unassigned volumes: " + unassignedTaskForEachVolume.size());
      }
      diskVolumeLoads.put(volumeId, concurrency);
      return concurrency;
    }

    /**
     * Decrease the number of running tasks for a certain volume.
     */
    private synchronized void decreaseConcurrency(int volumeId){
      if (diskVolumeLoads.containsKey(volumeId)) {
        int concurrency = diskVolumeLoads.get(volumeId);
        if (concurrency > 0) {
          diskVolumeLoads.put(volumeId, concurrency - 1);
        }
      }
    }

    /**
     * Volume id ranges:
     *  volume of a host : 0 ~ n
     *  compressed task, Amazon S3, unknown volume : -1
     *  remote task : -2
     */
    public int getLowestVolumeId(){
      Map.Entry<Integer, Integer> volumeEntry = null;

      for (Map.Entry<Integer, Integer> entry : diskVolumeLoads.entrySet()) {
        if (volumeEntry == null) volumeEntry = entry;

        if (entry.getKey() != DataLocation.REMOTE_VOLUME_ID && volumeEntry.getValue() >= entry.getValue()) {
          volumeEntry = entry;
        }
      }

      if (volumeEntry != null) {
        return volumeEntry.getKey();
      } else {
        return DataLocation.REMOTE_VOLUME_ID;
      }
    }

    public int getRemoteConcurrency(){
      return getVolumeConcurrency(DataLocation.REMOTE_VOLUME_ID);
    }

    public int getVolumeConcurrency(int volumeId){
      Integer size = diskVolumeLoads.get(volumeId);
      if (size == null) return 0;
      else return size;
    }

    public int getRemainingLocalTaskSize(){
      return remainTasksNum.get();
    }

    public String getHost() {
      return host;
    }

    public String getRack() {
      return rack;
    }
  }
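  /**
   * Returns the given task attempt to the scheduler queues (and, for leaf tasks, releases its
   * volume slot), then fires a TA_ASSIGN_CANCEL event for the attempt.
   */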
  protected void cancel(TaskAttempt taskAttempt) {

    TaskAttemptToSchedulerEvent schedulerEvent = new TaskAttemptToSchedulerEvent(
        EventType.T_SCHEDULE, taskAttempt.getTask().getId().getExecutionBlockId(), null, taskAttempt);

    if (taskAttempt.isLeafTask()) {
      releaseTaskAttempt(taskAttempt);
      scheduledRequests.addLeafTask(schedulerEvent);
    } else {
      scheduledRequests.addNonLeafTask(schedulerEvent);
    }

    context.getMasterContext().getEventHandler().handle(
        new TaskAttemptEvent(taskAttempt.getId(), TaskAttemptEventType.TA_ASSIGN_CANCEL));
  }

  protected int cancel(List<TaskAllocationProto> tasks) {
    int canceled = 0;
    for (TaskAllocationProto proto : tasks) {
      TaskAttemptId attemptId = new TaskAttemptId(proto.getTaskRequest().getId());
      cancel(stage.getTask(attemptId.getTaskId()).getAttempt(attemptId));
      canceled++;
    }
    return canceled;
  }

  private class ScheduledRequests {
    // The two sets, leafTasks and nonLeafTasks, keep all tasks that remain to be scheduled.
    // Even if a task is present in leafTaskHostMapping or leafTasksRackMapping, it will not be
    // sent to a task runner unless it is also contained in leafTasks or nonLeafTasks.
    private final Set<TaskAttemptId> leafTasks = Collections.synchronizedSet(new HashSet<>());
    private final Set<TaskAttemptId> nonLeafTasks = Collections.synchronizedSet(new HashSet<>());
    private Map<String, HostVolumeMapping> leafTaskHostMapping = Maps.newConcurrentMap();
    private final Map<String, HashSet<TaskAttemptId>> leafTasksRackMapping = Maps.newConcurrentMap();

    protected void clear() {
      leafTasks.clear();
      nonLeafTasks.clear();
      leafTaskHostMapping.clear();
      leafTasksRackMapping.clear();
    }

    private void addLeafTask(TaskAttemptToSchedulerEvent event) {
      TaskAttempt taskAttempt = event.getTaskAttempt();
      List<DataLocation> locations = taskAttempt.getTask().getDataLocations();

      for (DataLocation location : locations) {
        String host = location.getHost();
        leafTaskHosts.add(host);

        HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);
        if (hostVolumeMapping == null) {
          String rack = RackResolver.resolve(host).getNetworkLocation();
          hostVolumeMapping = new HostVolumeMapping(host, rack);
          leafTaskHostMapping.put(host, hostVolumeMapping);
        }
        hostVolumeMapping.addTaskAttempt(location.getVolumeId(), taskAttempt);

        if (LOG.isDebugEnabled()) {
          LOG.debug("Added attempt req to host " + host);
        }

        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(hostVolumeMapping.getRack());
        if (list == null) {
          list = new HashSet<>();
          leafTasksRackMapping.put(hostVolumeMapping.getRack(), list);
        }

        list.add(taskAttempt.getId());

        if (LOG.isDebugEnabled()) {
          LOG.debug("Added attempt req to rack " + hostVolumeMapping.getRack());
        }
      }

      leafTasks.add(taskAttempt.getId());
    }

    private void addNonLeafTask(TaskAttemptToSchedulerEvent event) {
      nonLeafTasks.add(event.getTaskAttempt().getId());
    }

    public int leafTaskNum() {
      return leafTasks.size();
    }

    public int nonLeafTaskNum() {
      return nonLeafTasks.size();
    }

    private TaskAttemptId allocateLocalTask(String host){
      HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);

      if (hostVolumeMapping != null) { // the tajo worker is co-located with a hadoop datanode
        for (int i = 0; i < hostVolumeMapping.getRemainingLocalTaskSize(); i++) {
          TaskAttemptId attemptId = hostVolumeMapping.getLocalTask();

          if (attemptId == null) break; // no more local tasks remain

          if (leafTasks.contains(attemptId)) {
            leafTasks.remove(attemptId);
            return attemptId;
          }
        }
      }
      return null;
    }
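    /**
     * Allocates a rack-local task for the given host: it first takes tasks from the hosts in the
     * same rack that have the most remaining work, and if that fails it falls back to the
     * rack-level task mapping.
     */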
    private TaskAttemptId allocateRackTask(String host) {

      List<HostVolumeMapping> remainingTasks = Lists.newArrayList(leafTaskHostMapping.values());
      String rack = RackResolver.resolve(host).getNetworkLocation();
      TaskAttemptId attemptId = null;

      if (remainingTasks.size() > 0) {
        synchronized (scheduledRequests) {
          // visit the hosts in this rack with the most remaining tasks first
          Collections.sort(remainingTasks, new Comparator<HostVolumeMapping>() {
            @Override
            public int compare(HostVolumeMapping v1, HostVolumeMapping v2) {
              // descending order of remaining tasks
              if (v2.remainTasksNum.get() > v1.remainTasksNum.get()) {
                return 1;
              } else if (v2.remainTasksNum.get() == v1.remainTasksNum.get()) {
                return 0;
              } else {
                return -1;
              }
            }
          });
        }

        for (HostVolumeMapping tasks : remainingTasks) {
          for (int i = 0; i < tasks.getRemainingLocalTaskSize(); i++) {
            TaskAttemptId tId = tasks.getTaskAttemptIdByRack(rack);

            if (tId == null) break;

            if (leafTasks.contains(tId)) {
              leafTasks.remove(tId);
              attemptId = tId;
              break;
            }
          }
          if (attemptId != null) break;
        }
      }

      // find a task in the rack-level mapping
      if (attemptId == null) {
        HashSet<TaskAttemptId> list = leafTasksRackMapping.get(rack);
        if (list != null) {
          synchronized (list) {
            Iterator<TaskAttemptId> iterator = list.iterator();
            while (iterator.hasNext()) {
              TaskAttemptId tId = iterator.next();
              iterator.remove();
              if (leafTasks.contains(tId)) {
                leafTasks.remove(tId);
                attemptId = tId;
                break;
              }
            }
          }
        }
      }

      return attemptId;
    }

    public void assignToLeafTasks(LinkedList<TaskRequestEvent> taskRequests) throws InterruptedException {
      Collections.shuffle(taskRequests);
      LinkedList<TaskRequestEvent> remoteTaskRequests = new LinkedList<>();
      String queryMasterHostAndPort = context.getMasterContext().getQueryMasterContext().getWorkerContext().
          getConnectionInfo().getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (leafTasks.size() > 0 && (!taskRequests.isEmpty() || !remoteTaskRequests.isEmpty())) {
        int localAssign = 0;
        int rackAssign = 0;

        taskRequest = taskRequests.pollFirst();
        if (taskRequest == null) { // if there are only remote task requests
          taskRequest = remoteTaskRequests.pollFirst();
        }

        // Check whether this container is still alive. If not, ignore the task request
        // and stop the task runner.
        WorkerConnectionInfo connectionInfo =
            context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());
        if (connectionInfo == null) continue;

        // get the hostname of the requesting node
        String host = connectionInfo.getHost();

        // if no worker in the leaf-task host mapping matches this hostname
        if (!leafTaskHostMapping.containsKey(host) && !taskRequests.isEmpty()) {
          String normalizedHost = NetUtils.normalizeHost(host);

          if (!leafTaskHostMapping.containsKey(normalizedHost)) {
            // This means one of the following cases:
            // * no blocks reside on this node, or
            // * all blocks residing on this node have already been consumed, and this task runner
            //   requests a remote task.
            // In either case, we move the task request to the remote request list and skip the
            // rest of this iteration.
            remoteTaskRequests.add(taskRequest);
            continue;
          } else {
            host = normalizedHost;
          }
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("assignToLeafTasks: " + taskRequest.getExecutionBlockId() + "," +
              "worker=" + connectionInfo.getHostAndPeerRpcPort());
        }

        //////////////////////////////////////////////////////////////////////
        // disk or host-local allocation
        //////////////////////////////////////////////////////////////////////
        TaskAttemptId attemptId = allocateLocalTask(host);

        int assignedVolume = DataLocation.REMOTE_VOLUME_ID;
        HostVolumeMapping hostVolumeMapping = leafTaskHostMapping.get(host);
        if (attemptId == null) { // if a local task cannot be found
          if (!taskRequests.isEmpty()) {
            // if other requests remain, move this one to the remote list for better locality
            remoteTaskRequests.add(taskRequest);
            candidateWorkers.remove(connectionInfo.getId());
            continue;

          } else {
            if (hostVolumeMapping != null) {
              int nodes = context.getMasterContext().getWorkerMap().size();

              // this part controls the assignment of tail tasks and balances remote tasks per node
              int tailLimit = 1;
              if (remainingScheduledObjectNum() > 0 && nodes > 0) {
                tailLimit = Math.max(remainingScheduledObjectNum() / nodes, 1);
              }

              // remote task throttling per node
              if (nodes > 1 && hostVolumeMapping.getRemoteConcurrency() >= tailLimit) {
                continue;
              } else {
                // assign to remote volume
                hostVolumeMapping.increaseConcurrency(assignedVolume);
              }
            }
          }

          //////////////////////////////////////////////////////////////////////
          // rack-local allocation
          //////////////////////////////////////////////////////////////////////
          attemptId = allocateRackTask(host);

          //////////////////////////////////////////////////////////////////////
          // random node allocation
          //////////////////////////////////////////////////////////////////////
          if (attemptId == null && leafTaskNum() > 0) {
            synchronized (leafTasks){
              attemptId = leafTasks.iterator().next();
              leafTasks.remove(attemptId);
            }
          }

          if (attemptId != null && hostVolumeMapping != null) {
            hostVolumeMapping.lastAssignedVolumeId.put(attemptId, assignedVolume);
          }
          rackAssign++;
        } else {
          if (hostVolumeMapping != null) {
            // set to the real volume id
            assignedVolume = hostVolumeMapping.lastAssignedVolumeId.get(attemptId);
          }
          localAssign++;
        }

        if (attemptId != null) {
          Task task = stage.getTask(attemptId.getTaskId());
          TaskRequest taskAssign = new TaskRequestImpl(
              attemptId,
              new ArrayList<>(task.getAllFragments()),
              "",
              false,
              LogicalNodeSerializer.serialize(task.getLogicalPlan()),
              context.getMasterContext().getQueryContext(),
              stage.getDataChannel(),
              stage.getBlock().getEnforcer(),
              queryMasterHostAndPort);

          NodeResource resource = new NodeResource(taskRequest.getResponseProto().getResource());
          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }

          //TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(TaskAllocationProto.newBuilder()
              .setResource(resource.getProto()).setVolumeId(assignedVolume)
              .setTaskRequest(taskAssign.getProto()).build());
          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());

          context.getMasterContext().getEventHandler().handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null) addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc = null;
          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();
          totalAttempts++;
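          // Dispatch the allocation request to the worker over the TajoWorkerProtocol RPC.
          // If the worker rejects the allocation or the connection fails, the request is
          // canceled and the attempt is returned to the scheduler.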
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true, rpcParams);
            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto = callFuture.get();

            if (responseProto.getCancellationTaskCount() > 0) {
              cancellation += cancel(responseProto.getCancellationTaskList());
              info(LOG, "Canceled requests: " + responseProto.getCancellationTaskCount() + " from " + addr);
              continue;
            }
          } catch (ExecutionException | ConnectException e) {
            cancellation += cancel(requestProto.getTaskRequestList());
            warn(LOG, "Canceled requests: " + requestProto.getTaskRequestCount()
                + " by " + ExceptionUtils.getFullStackTrace(e));
            continue;
          } catch (InterruptedException e) {
            throw e;
          } catch (Exception e) {
            throw new TajoInternalError(e);
          }

          scheduledObjectNum--;
          totalAssigned++;
          hostLocalAssigned += localAssign;
          rackLocalAssigned += rackAssign;

          if (rackAssign > 0) {
            info(LOG, String.format("Assigned Local/Rack/Total: (%d/%d/%d), " +
                    "Attempted Cancel/Assign/Total: (%d/%d/%d), " +
                    "Locality: %.2f%%, Rack host: %s",
                hostLocalAssigned, rackLocalAssigned, totalAssigned,
                cancellation, totalAssigned, totalAttempts,
                ((double) hostLocalAssigned / (double) totalAssigned) * 100, host));
          }
        } else {
          throw new RuntimeException("Illegal state: a task attempt should have been assigned at this point");
        }
      }
    }

    private boolean checkIfInterQuery(MasterPlan masterPlan, ExecutionBlock block) {
      if (masterPlan.isRoot(block)) {
        return false;
      }

      ExecutionBlock parent = masterPlan.getParent(block);
      if (masterPlan.isRoot(parent) && parent.isUnionOnly()) {
        return false;
      }

      return true;
    }
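    /**
     * Assigns intermediate (non-leaf) tasks to the requesting workers. Non-leaf tasks read
     * shuffled data via fetches rather than local blocks, so they are handed out in arbitrary
     * order without any locality consideration.
     */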
    public void assignToNonLeafTasks(LinkedList<TaskRequestEvent> taskRequests) throws InterruptedException {
      Collections.shuffle(taskRequests);
      String queryMasterHostAndPort = context.getMasterContext().getQueryMasterContext().getWorkerContext().
          getConnectionInfo().getHostAndQMPort();

      TaskRequestEvent taskRequest;
      while (!taskRequests.isEmpty()) {
        taskRequest = taskRequests.pollFirst();
        LOG.debug("assignToNonLeafTasks: " + taskRequest.getExecutionBlockId());

        TaskAttemptId attemptId;
        // random allocation
        if (nonLeafTasks.size() > 0) {
          synchronized (nonLeafTasks){
            attemptId = nonLeafTasks.iterator().next();
            nonLeafTasks.remove(attemptId);
          }
          LOG.debug("Assigned based on * match");

          Task task;
          task = stage.getTask(attemptId.getTaskId());

          TaskRequest taskAssign = new TaskRequestImpl(
              attemptId,
              Lists.newArrayList(task.getAllFragments()),
              "",
              false,
              LogicalNodeSerializer.serialize(task.getLogicalPlan()),
              context.getMasterContext().getQueryContext(),
              stage.getDataChannel(),
              stage.getBlock().getEnforcer(),
              queryMasterHostAndPort);

          if (checkIfInterQuery(stage.getMasterPlan(), stage.getBlock())) {
            taskAssign.setInterQuery();
          }
          for (Map.Entry<String, Set<FetchProto>> entry : task.getFetchMap().entrySet()) {
            Collection<FetchProto> fetches = entry.getValue();
            if (fetches != null) {
              for (FetchProto fetch : fetches) {
                taskAssign.addFetch(fetch);
              }
            }
          }

          WorkerConnectionInfo connectionInfo =
              context.getMasterContext().getWorkerMap().get(taskRequest.getWorkerId());

          //TODO send batch request
          BatchAllocationRequest.Builder requestProto = BatchAllocationRequest.newBuilder();
          requestProto.addTaskRequest(TaskAllocationProto.newBuilder()
              .setResource(taskRequest.getResponseProto().getResource())
              .setTaskRequest(taskAssign.getProto()).build());
          requestProto.setExecutionBlockId(attemptId.getTaskId().getExecutionBlockId().getProto());

          context.getMasterContext().getEventHandler().handle(new TaskAttemptAssignedEvent(attemptId, connectionInfo));

          CallFuture<BatchAllocationResponse> callFuture = new CallFuture<>();

          InetSocketAddress addr = stage.getAssignedWorkerMap().get(connectionInfo.getId());
          if (addr == null) addr = new InetSocketAddress(connectionInfo.getHost(), connectionInfo.getPeerRpcPort());

          AsyncRpcClient tajoWorkerRpc;
          try {
            tajoWorkerRpc =
                RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true, rpcParams);
            TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub();
            tajoWorkerRpcClient.allocateTasks(callFuture.getController(), requestProto.build(), callFuture);

            BatchAllocationResponse responseProto = callFuture.get();

            if (responseProto.getCancellationTaskCount() > 0) {
              cancellation += cancel(responseProto.getCancellationTaskList());
              info(LOG, "Canceled requests: " + responseProto.getCancellationTaskCount() + " from " + addr);
              continue;
            }
          } catch (ExecutionException | ConnectException e) {
            cancellation += cancel(requestProto.getTaskRequestList());
            warn(LOG, "Canceled requests: " + requestProto.getTaskRequestCount()
                + " by " + ExceptionUtils.getFullStackTrace(e));
            continue;
          } catch (InterruptedException e) {
            throw e;
          } catch (Exception e) {
            throw new TajoInternalError(e);
          }

          totalAssigned++;
          scheduledObjectNum--;
        }
      }
    }
  }
}