/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** MODIFIED FOR GPGPU Usage! **/
package org.apache.hadoop.mapred;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;
/**
* A {@link TaskScheduler} that keeps jobs in a queue in priority order (FIFO
* by default). This GPGPU-modified version additionally splits map-task
* assignment between a tracker's CPU map slots and GPU map slots.
*/
class JobQueueTaskScheduler extends TaskScheduler {
private static final int MIN_CLUSTER_SIZE_FOR_PADDING = 3;
public static final Log LOG = LogFactory.getLog(JobQueueTaskScheduler.class);
protected JobQueueJobInProgressListener jobQueueJobInProgressListener;
protected EagerTaskInitializationListener eagerTaskInitializationListener;
private float padFraction;
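// isOptionalScheduling: when true, CPU map slots are skipped whenever the
// pending map load is small enough for the GPU slots alone; see assignTasks().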
private boolean isOptionalScheduling = false;
public JobQueueTaskScheduler() {
this.jobQueueJobInProgressListener = new JobQueueJobInProgressListener();
}
@Override
public synchronized void start() throws IOException {
super.start();
taskTrackerManager.addJobInProgressListener(jobQueueJobInProgressListener);
eagerTaskInitializationListener.setTaskTrackerManager(taskTrackerManager);
eagerTaskInitializationListener.start();
taskTrackerManager.addJobInProgressListener(
eagerTaskInitializationListener);
}
@Override
public synchronized void terminate() throws IOException {
if (jobQueueJobInProgressListener != null) {
taskTrackerManager.removeJobInProgressListener(
jobQueueJobInProgressListener);
}
if (eagerTaskInitializationListener != null) {
taskTrackerManager.removeJobInProgressListener(
eagerTaskInitializationListener);
eagerTaskInitializationListener.terminate();
}
super.terminate();
}
@Override
public synchronized void setConf(Configuration conf) {
super.setConf(conf);
isOptionalScheduling = conf.getBoolean("mapred.jobtracker.map.optionalscheduling", false);
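// A hedged example of enabling the optional CPU/GPU scheduling heuristic,
// typically in the JobTracker's mapred-site.xml (the property name is taken
// from the getBoolean call above; the value shown is illustrative, the
// default is false):
//
//   <property>
//     <name>mapred.jobtracker.map.optionalscheduling</name>
//     <value>true</value>
//   </property>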
padFraction = conf.getFloat("mapred.jobtracker.taskalloc.capacitypad",
0.01f);
this.eagerTaskInitializationListener =
new EagerTaskInitializationListener(conf);
}
@Override
public synchronized List<Task> assignTasks(TaskTracker taskTracker)
throws IOException {
TaskTrackerStatus taskTrackerStatus = taskTracker.getStatus();
ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
final int numTaskTrackers = clusterStatus.getTaskTrackers();
final int clusterMapCapacity = clusterStatus.getMaxMapTasks();
final int clusterReduceCapacity = clusterStatus.getMaxReduceTasks();
Collection<JobInProgress> jobQueue =
jobQueueJobInProgressListener.getJobQueue();
//
// Get map + reduce counts for the current tracker.
//
final int trackerMapCapacity = taskTrackerStatus.getMaxMapSlots();
final int trackerCPUMapCapacity = taskTrackerStatus.getMaxCPUMapSlots();
final int trackerGPUMapCapacity = taskTrackerStatus.getMaxGPUMapSlots();
final int trackerReduceCapacity = taskTrackerStatus.getMaxReduceSlots();
//final int trackerRunningMaps = taskTracker.countMapTasks();
final int trackerRunningCPUMaps = taskTrackerStatus.countCPUMapTasks();
final int trackerRunningGPUMaps = taskTrackerStatus.countGPUMapTasks();
//final int trackerRunningMaps = trackerRunningCPUMaps + trackerRunningGPUMaps;
final int trackerRunningMaps = taskTrackerStatus.countMapTasks();
final int trackerRunningReduces = taskTrackerStatus.countReduceTasks();
LOG.info("DEBUG ************* assignTasks started!!!");
LOG.info("DEBUG trackerMapCapacity : " + trackerMapCapacity);
LOG.info("DEBUG trackerCPUMapCapacity : " + trackerCPUMapCapacity);
LOG.info("DEBUG trackerGPUMapCapacity : " + trackerGPUMapCapacity);
LOG.info("DEBUG trackerRunningCPUMaps : " + trackerRunningCPUMaps);
LOG.info("DEBUG trackerRunningGPUMaps : " + trackerRunningGPUMaps);
LOG.info("DEBUG trackerRunningMaps : " + trackerRunningMaps);
// Assigned tasks
List<Task> assignedTasks = new ArrayList<Task>();
//
// Compute (running + pending) map and reduce task numbers across pool
//
int remainingReduceLoad = 0;
int remainingMapLoad = 0;
int pendingMapLoad = 0;
int finishedCPUMapTasks = 0;
int finishedGPUMapTasks = 0;
int cpuMapTaskMeanTime = 0;
int gpuMapTaskMeanTime = 0;
int totalMapTasks = 0;
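// Walk the job queue once: accumulate the remaining and pending map work and
// the remaining reduce work, and pick up the CPU/GPU mean map-task times that
// feed the acceleration factor computed below.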
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() == JobStatus.RUNNING) {
remainingMapLoad += (job.desiredMaps() - job.finishedMaps());
pendingMapLoad += (job.desiredMaps() - job.finishedMaps() - job.runningMaps());
totalMapTasks += job.desiredMaps();
finishedCPUMapTasks += job.finishedCPUMaps();
finishedGPUMapTasks += job.finishedGPUMaps();
cpuMapTaskMeanTime = job.getCPUMapTaskMeanTime();
gpuMapTaskMeanTime = job.getGPUMapTaskMeanTime();
LOG.info("DEBUG: job.desiredMaps : " + job.desiredMaps());
LOG.info("DEBUG: job.finishedMaps : " + job.finishedMaps());
LOG.info("DEBUG: job.maptaskmeantime : " + job.getMapTaskMeanTime());
LOG.info("DEBUG: job.CPUmaptaskmeantime : " + job.getCPUMapTaskMeanTime());
LOG.info("DEBUG: job.GPUmaptaskmeantime : " + job.getGPUMapTaskMeanTime());
int i = 0;
Iterator<Long> it = job.getCPUMapTaskTimes().iterator();
while(it.hasNext()) {
LOG.info("DEBUG: CPU : " + i + " : " + it.next().longValue());
i++;
}
i = 0;
Iterator<Long> it_gpu = job.getGPUMapTaskTimes().iterator();
while(it_gpu.hasNext()) {
LOG.info("DEBUG: GPU : " + i + " : " + it_gpu.next().longValue());
i++;
}
if (job.scheduleReduces()) {
remainingReduceLoad +=
(job.desiredReduces() - job.finishedReduces());
}
}
}
}
LOG.info("DEBUG: finishedCPUMaps : " + finishedCPUMapTasks);
LOG.info("DEBUG: finishedGPUMaps : " + finishedGPUMapTasks);
LOG.info("DEBUG: reminingMapLoad : " + remainingMapLoad);
LOG.info("DEBUG: pendingMapLoad : " + pendingMapLoad);
double accelerationFactor =
(cpuMapTaskMeanTime == 0 || gpuMapTaskMeanTime == 0) ? 0.0
: (double)cpuMapTaskMeanTime / (double)gpuMapTaskMeanTime;
LOG.info("DEBUG: accelerationFactor : " + accelerationFactor);
//apply scheduling algorithm to MapTasks
/*
if (accelerationFactor != 0.0) {
double fcpu = Math.ceil((double)pendingMapLoad / trackerCPUMapCapacity)
* accelerationFactor;
double fgpu = Math.ceil((double)pendingMapLoad / trackerGPUMapCapacity);
double fmin = fgpu;
int xmin = 0, ymin = pendingMapLoad;
for (int x = 1; x < pendingMapLoad; x++) {
int y = pendingMapLoad - x;
double f = Math.max(Math.ceil((double)x / trackerCPUMapCapacity)
* accelerationFactor, Math.ceil((double)y
/ trackerGPUMapCapacity));
if (f < fmin) {
fmin = f;
xmin = x;
ymin = y;
}
}
LOG.info("[fcpu_only, x, y, x/ncpu, y/ngpu] :" + " [" + fcpu
* gpuMapTaskMeanTime + ", " + pendingMapLoad + ", " + 0
+ ", " + pendingMapLoad / trackerCPUMapCapacity + ", " + 0
+ "]");
LOG.info("[fgpu_only, x, y, x/ncpu, y/ngpu] :" + " [" + fgpu
* gpuMapTaskMeanTime + ", " + 0 + ", " + pendingMapLoad
+ ", " + 0 + ", " + pendingMapLoad / trackerGPUMapCapacity
+ "]");
// fgreedy is used just for testing the scheduling effect
double z = Math.ceil((double)pendingMapLoad
/ (trackerCPUMapCapacity + trackerGPUMapCapacity));
double fgreedy = Math.max(z * accelerationFactor, z);
LOG.info("[fgreedy, x, y, x/ncpu, y/ngpu] :" + " [" + fgreedy
* gpuMapTaskMeanTime + ", " + -1 + ", " + -1 + ", " + z
+ ", " + z + "]");
LOG.info("[f, x, y, x/ncpu, y/ngpu] :" + " [" + fmin
* gpuMapTaskMeanTime + ", " + xmin + ", " + ymin + ", "
+ xmin / trackerCPUMapCapacity + ", " + ymin
/ trackerGPUMapCapacity + "]");
}
*/
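// A minimal, hedged sketch of the split search that the commented-out block
// above logs: pick the number of map tasks x to run on CPU slots so that the
// makespan max(ceil(x/ncpu) * a, ceil((load - x)/ngpu)) is minimized, where a
// is the acceleration factor. The helper name is illustrative only and is not
// part of this class; it is kept commented out, like the block above.
//
//   static int splitPendingMaps(int load, int ncpu, int ngpu, double a) {
//     int xmin = 0;                                   // all-GPU baseline
//     double fmin = Math.ceil((double) load / ngpu);
//     for (int x = 1; x < load; x++) {
//       int y = load - x;
//       double f = Math.max(Math.ceil((double) x / ncpu) * a,
//                           Math.ceil((double) y / ngpu));
//       if (f < fmin) {
//         fmin = f;
//         xmin = x;
//       }
//     }
//     return xmin;                                    // maps for CPU slots
//   }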
// Compute the 'load factor' for maps and reduces
double mapLoadFactor = 0.0;
if (clusterMapCapacity > 0) {
mapLoadFactor = (double)remainingMapLoad / clusterMapCapacity;
}
double reduceLoadFactor = 0.0;
if (clusterReduceCapacity > 0) {
reduceLoadFactor = (double)remainingReduceLoad / clusterReduceCapacity;
}
//
// In the below steps, we allocate first map tasks (if appropriate),
// and then reduce tasks if appropriate. We go through all jobs
// in order of job arrival; jobs only get serviced if their
// predecessors are serviced, too.
//
//
// We assign tasks to the current taskTracker if the given machine
// has a workload that's less than the maximum load of that kind of
// task.
// However, if the cluster is close to getting loaded i.e. we don't
// have enough _padding_ for speculative executions etc., we only
// schedule the "highest priority" task i.e. the task from the job
// with the highest priority.
//
// final int trackerCurrentMapCapacity =
// Math.min((int)Math.ceil(mapLoadFactor * trackerMapCapacity),
// trackerMapCapacity);
// int availableMapSlots = trackerCurrentMapCapacity - trackerRunningMaps;
/** FIXTHIS: the load-factor-capped map capacity (commented out above) is bypassed; all free map slots are offered. **/
int availableMapSlots = trackerMapCapacity - trackerRunningMaps;
int availableCPUMapSlots = trackerCPUMapCapacity - trackerRunningCPUMaps;
int availableGPUMapSlots = trackerGPUMapCapacity - trackerRunningGPUMaps;
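// Per-device availability reported by the tracker: availableGPUDevices[j] is
// true when GPU device j is currently free to take a task.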
boolean[] availableGPUDevices = taskTrackerStatus.availableGPUDevices();
LOG.info("DEBUG: GPUDevices: " + availableGPUDevices.length);
boolean exceededMapPadding = false;
assert availableCPUMapSlots >= 0;
// if (availableMapSlots > 0) {
// exceededMapPadding =
// exceededPadding(true, clusterStatus, trackerMapCapacity);
// }
int numLocalMaps = 0;
int numNonLocalMaps = 0;
LOG.info("DEBUG availableMapSlots : " + availableMapSlots);
LOG.info("DEBUG availableCPUMapSlots : " + availableCPUMapSlots);
LOG.info("DEBUG availableGPUMapSlots : " + availableGPUMapSlots);
LOG.info("DEBUG availableGPUDevices : ");
for(int i = 0; i < trackerGPUMapCapacity; i++) {
LOG.info("DEBUG availableGPUDevices["+i+"]: "+ i + " : " + availableGPUDevices[i]);
}
// LOG.info("XXXX numTaskTrackers : " + numTaskTrackers);
// Decide whether to assign map tasks to CPU slots, per the scheduling algorithm
// LOG.info("XXXX a * t * n : " + (accelarationFactor * trackerGPUMapCapacity * numTaskTrackers));
LOG.info("DEBUG OptionalScheduling: " + isOptionalScheduling);
// if(isOptionalScheduling) {
// if(Math.max(pendingMapLoad, 0) >= accelerationFactor * trackerGPUMapCapacity * numTaskTrackers) {
if (!(isOptionalScheduling &&
Math.max(pendingMapLoad, 0) < accelerationFactor * trackerGPUMapCapacity * numTaskTrackers)) {
LOG.info("DEBUG: ************* try to assign to CPU");
scheduleCPUMaps:
for (int i = 0; i < availableCPUMapSlots; ++i) {
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
continue;
}
Task t = null;
// Try to schedule a node-local or rack-local Map task
t = job.obtainNewNodeLocalMapTask(taskTrackerStatus,
numTaskTrackers, taskTrackerManager
.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numLocalMaps;
LOG.info("DEBUG: ************* assign to CPU");
break;
}
t = job.obtainNewNonLocalMapTask(taskTrackerStatus,
numTaskTrackers, taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numNonLocalMaps;
LOG.info("DEBUG: ************* assign to CPU");
break scheduleCPUMaps;
}
}
}
}
}
else{
LOG.info("DEBUG: DO NOT try to assign to CPU");
}
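// GPU map slots are tried regardless of the CPU decision above. Only jobs
// that set "hadoop.pipes.gpu.executable" are eligible; each assigned task is
// flagged to run on the GPU and pinned to a free GPU device id.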
LOG.info("DEBUG: ************* try to assign to GPU");
scheduleGPUMaps:
for (int i = 0; i < availableGPUMapSlots; ++i) {
synchronized(jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
continue;
}
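// If the job at the head of the queue has no GPU executable configured, the
// break below ends GPU assignment for this heartbeat; later jobs in the
// (FIFO) queue are not considered.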
if (job.getJobConf().get("hadoop.pipes.gpu.executable",null)==null){
LOG.info("DEBUG: GPU executable is not set! --> cannot assign to GPU! ");
LOG.info("DEBUG: hadoop.pipes.executable: "+job.getJobConf().get("hadoop.pipes.executable",null));
LOG.info("DEBUG: hadoop.pipes.gpu.executable: "+job.getJobConf().get("hadoop.pipes.gpu.executable",null));
break;
}
Task t = null;
// NewNodeLocalMapTask
t = job.obtainNewNodeLocalMapTask(taskTrackerStatus, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
t.setRunOnGPU(true);
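// Pin the task to the first GPU device reported free and mark that device
// busy, so another task assigned in this heartbeat does not pick it again.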
for(int j = 0; j < trackerGPUMapCapacity; j++) {
if(availableGPUDevices[j] == true) {
t.setGPUDeviceId(j);
availableGPUDevices[j] = false;
break;
}
}
assignedTasks.add(t);
++numLocalMaps;
LOG.info("DEBUG: ************* assign to GPU");
break;
}
// NewNonLocalMapTask
t = job.obtainNewNonLocalMapTask(taskTrackerStatus, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
t.setRunOnGPU(true);
for(int j = 0; j < trackerGPUMapCapacity; j++) { // scan all devices, as in the node-local case above
if(availableGPUDevices[j] == true) {
t.setGPUDeviceId(j);
availableGPUDevices[j] = false;
break;
}
}
assignedTasks.add(t);
++numNonLocalMaps;
LOG.info("DEBUG: ************* assign to GPU");
break scheduleGPUMaps;
}
}
}
}
/*
//NO optional scheduling
LOG.info("try to assign to CPU");
scheduleCPUMaps:
for (int i = 0; i < availableCPUMapSlots; ++i) {
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
continue;
}
Task t = null;
// Try to schedule a node-local or rack-local Map task
t = job.obtainNewLocalMapTask(taskTracker,
numTaskTrackers, taskTrackerManager
.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numLocalMaps;
LOG.info("assign to CPU");
break;
}
t = job.obtainNewNonLocalMapTask(taskTracker,
numTaskTrackers, taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numNonLocalMaps;
LOG.info("assign to CPU");
break scheduleCPUMaps;
}
}
}
}
scheduleGPUMaps:
for (int i = 0; i < availableGPUMapSlots; ++i) {
synchronized(jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
continue;
}
Task t = null;
t = job.obtainNewLocalMapTask(taskTracker, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
t.setRunOnGPU(true);
for(int j = 0; j < trackerGPUMapCapacity; j++) {
if(availableGPUDevices[j] == true) {
t.setGPUDeviceId(j);
availableGPUDevices[j] = false;
break;
}
}
assignedTasks.add(t);
++numLocalMaps;
LOG.info("assign to GPU");
break;
}
t = job.obtainNewNonLocalMapTask(taskTracker, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
t.setRunOnGPU(true);
for(int j = 0; j < availableGPUMapSlots; j++) {
if(availableGPUDevices[j] == true) {
t.setGPUDeviceId(j);
availableGPUDevices[j] = false;
break;
}
}
assignedTasks.add(t);
++numNonLocalMaps;
LOG.info("assign to GPU");
break scheduleGPUMaps;
}
}
}
}
}
*/
/*
scheduleMaps:
for (int i=0; i < availableMapSlots; ++i) {
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING) {
continue;
}
Task t = null;
// Try to schedule a node-local or rack-local Map task
t =
job.obtainNewNodeOrRackLocalMapTask(taskTrackerStatus,
numTaskTrackers, taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numLocalMaps;
// Don't assign map tasks to the hilt!
// Leave some free slots in the cluster for future task-failures,
// speculative tasks etc. beyond the highest priority job
if (exceededMapPadding) {
break scheduleMaps;
}
// Try all jobs again for the next Map task
break;
}
// Try to schedule a node-local or rack-local Map task
t =
job.obtainNewNonLocalMapTask(taskTrackerStatus, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts());
if (t != null) {
assignedTasks.add(t);
++numNonLocalMaps;
// We assign at most 1 off-switch or speculative task
// This is to prevent TaskTrackers from stealing local-tasks
// from other TaskTrackers.
break scheduleMaps;
}
}
}
}
int assignedMaps = assignedTasks.size();
*/
//
// Same thing, but for reduce tasks
// However we _never_ assign more than 1 reduce task per heartbeat
//
final int trackerCurrentReduceCapacity =
Math.min((int)Math.ceil(reduceLoadFactor * trackerReduceCapacity),
trackerReduceCapacity);
final int availableReduceSlots =
Math.min((trackerCurrentReduceCapacity - trackerRunningReduces), 1);
boolean exceededReducePadding = false;
if (availableReduceSlots > 0) {
exceededReducePadding = exceededPadding(false, clusterStatus,
trackerReduceCapacity);
synchronized (jobQueue) {
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING ||
job.numReduceTasks == 0) {
continue;
}
Task t =
job.obtainNewReduceTask(taskTrackerStatus, numTaskTrackers,
taskTrackerManager.getNumberOfUniqueHosts()
);
if (t != null) {
assignedTasks.add(t);
break;
}
// Don't assign reduce tasks to the hilt!
// Leave some free slots in the cluster for future task-failures,
// speculative tasks etc. beyond the highest priority job
if (exceededReducePadding) {
break;
}
}
}
}
// if (LOG.isDebugEnabled()) {
// LOG.debug("Task assignments for " + taskTrackerStatus.getTrackerName() + " --> " +
// "[" + mapLoadFactor + ", " + trackerMapCapacity + ", " +
// trackerCurrentMapCapacity + ", " + trackerRunningMaps + "] -> [" +
// (trackerCurrentMapCapacity - trackerRunningMaps) + ", " +
// assignedMaps + " (" + numLocalMaps + ", " + numNonLocalMaps +
// ")] [" + reduceLoadFactor + ", " + trackerReduceCapacity + ", " +
// trackerCurrentReduceCapacity + "," + trackerRunningReduces +
// "] -> [" + (trackerCurrentReduceCapacity - trackerRunningReduces) +
// ", " + (assignedTasks.size()-assignedMaps) + "]");
// }
return assignedTasks;
}
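/**
 * Returns true when assigning more tasks of the given kind would eat into the
 * slack ("padding") kept free for task failures and speculative execution.
 * On clusters with more than MIN_CLUSTER_SIZE_FOR_PADDING trackers, the
 * padding grows with the number of needed tasks (scaled by padFraction) and
 * is capped at one tracker's slot count.
 */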
private boolean exceededPadding(boolean isMapTask,
ClusterStatus clusterStatus,
int maxTaskTrackerSlots) {
int numTaskTrackers = clusterStatus.getTaskTrackers();
int totalTasks =
(isMapTask) ? clusterStatus.getMapTasks() :
clusterStatus.getReduceTasks();
int totalTaskCapacity =
isMapTask ? clusterStatus.getMaxMapTasks() :
clusterStatus.getMaxReduceTasks();
Collection<JobInProgress> jobQueue =
jobQueueJobInProgressListener.getJobQueue();
boolean exceededPadding = false;
synchronized (jobQueue) {
int totalNeededTasks = 0;
for (JobInProgress job : jobQueue) {
if (job.getStatus().getRunState() != JobStatus.RUNNING ||
job.numReduceTasks == 0) {
continue;
}
//
// Beyond the highest-priority task, reserve a little
// room for failures and speculative executions; don't
// schedule tasks to the hilt.
//
totalNeededTasks +=
isMapTask ? job.desiredMaps() : job.desiredReduces();
int padding = 0;
if (numTaskTrackers > MIN_CLUSTER_SIZE_FOR_PADDING) {
padding =
Math.min(maxTaskTrackerSlots,
(int) (totalNeededTasks * padFraction));
}
if (totalTasks + padding >= totalTaskCapacity) {
exceededPadding = true;
break;
}
}
}
return exceededPadding;
}
@Override
public synchronized Collection<JobInProgress> getJobs(String queueName) {
return jobQueueJobInProgressListener.getJobQueue();
}
}