/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;

import junit.extensions.TestSetup;
import junit.framework.Assert;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.FakeObjectUtilities.FakeJobInProgress;
import org.apache.hadoop.mapred.FakeObjectUtilities.FakeJobTracker;
import org.apache.hadoop.mapred.UtilsForTests.FakeClock;

public class TestSpeculativeExecution extends TestCase {

  FakeJobInProgress job;
  static FakeJobTracker jobTracker;

  static class SpecFakeClock extends FakeClock {
    // assuming default map/reduce speculative lags are
    // identical in value. just need to store one of them
    long speculative_lag;

    public SpecFakeClock(long speculative_lag) {
      this.speculative_lag = speculative_lag;
    }

    public void advanceBySpeculativeLag() {
      time += speculative_lag;
    }
  };

  static SpecFakeClock clock;
  static final Log LOG = LogFactory.getLog(TestSpeculativeExecution.class);
  public static final String JT_IPC_ADDRESS = "mapred.job.tracker";
  public static final String JT_HTTP_ADDRESS =
    "mapred.job.tracker.http.address";

  static String trackers[] = new String[] {
    "tracker_tracker1:1000", "tracker_tracker2:1000",
    "tracker_tracker3:1000", "tracker_tracker4:1000",
    "tracker_tracker5:1000"};

  public static Test suite() {
    TestSetup setup =
      new TestSetup(new TestSuite(TestSpeculativeExecution.class)) {
      @Override
      protected void setUp() throws Exception {
        JobConf conf = new JobConf();
        conf.set(JT_IPC_ADDRESS, "localhost:0");
        conf.set(JT_HTTP_ADDRESS, "0.0.0.0:0");
        jobTracker = new FakeJobTracker(conf,
            (clock = new SpecFakeClock(conf.getMapSpeculativeLag())),
            trackers);
        for (String tracker : trackers) {
          FakeObjectUtilities.establishFirstContact(jobTracker, tracker);
        }
      }
      @Override
      protected void tearDown() throws Exception {
        //delete the build/test/logs/ dir
      }
    };
    return setup;
  }

  public void testRunningTaskCountWithSpeculation() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[8];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(3);
    conf.setNumReduceTasks(3);
    conf.setFloat(JobInProgress.SPECULATIVE_SLOWTASK_THRESHOLD, 0.5f);
    conf.setFloat(JobInProgress.SPECULATIVE_MAP_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_REDUCE_UNFINISHED_THRESHOLD_KEY, 0);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    //Check for runningMap counts first
    //schedule maps
    taskAttemptID[0] = job.findMapTask(trackers[0]);
    taskAttemptID[1] = job.findMapTask(trackers[1]);
    taskAttemptID[2] = job.findMapTask(trackers[2]);

    clock.advance(5000);
    job.finishTask(taskAttemptID[0]);
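    // (note: the attempts on trackers[1] and trackers[2] are still running at
    // this point; the staggered finish times below, plus advancing the clock
    // by the speculative lag, are what make the attempt on trackers[2] the
    // slow outlier that gets a speculative copy.)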
    clock.advance(1000);
    job.finishTask(taskAttemptID[1]);
    clock.advanceBySpeculativeLag();

    //we should get a speculative task now
    job.refresh(clock.getTime());
    taskAttemptID[3] = job.findMapTask(trackers[3]);

    job.refresh(clock.getTime());
    int oldRunningMap = job.runningMaps();
    LOG.info("No of running maps before fail was " + oldRunningMap);
    job.failTask(taskAttemptID[2]);
    job.refresh(clock.getTime());

    assertEquals("Running maps count should be updated from " + oldRunningMap +
        " to " + (oldRunningMap - 1), job.runningMaps(), oldRunningMap - 1);
    LOG.info(" Job running maps after fail " + job.runningMaps());

    clock.advance(5000);
    job.finishTask(taskAttemptID[3]);

    //check for runningReduce count.
    taskAttemptID[4] = job.findReduceTask(trackers[0]);
    taskAttemptID[5] = job.findReduceTask(trackers[1]);
    taskAttemptID[6] = job.findReduceTask(trackers[2]);

    clock.advance(5000);
    job.finishTask(taskAttemptID[4]);
    clock.advance(1000);
    job.finishTask(taskAttemptID[5]);

    job.refresh(clock.getTime());
    clock.advanceBySpeculativeLag();
    taskAttemptID[7] = job.findReduceTask(trackers[4]);

    job.refresh(clock.getTime());
    int oldRunningReduces = job.runningReduces();
    job.failTask(taskAttemptID[6]);
    job.refresh(clock.getTime());
    LOG.info(" No of running Reduces before fail " + oldRunningReduces);
    LOG.info(" No of running reduces after fail " + job.runningReduces());

    assertEquals("Running reduces count should be updated from " +
        oldRunningReduces + " to " + (oldRunningReduces - 1),
        job.runningReduces(), oldRunningReduces - 1);

    job.finishTask(taskAttemptID[7]);
  }

  public void testIsSlowTracker() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[20];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(10);
    conf.setNumReduceTasks(0);
    conf.setFloat(JobInProgress.SPECULATIVE_MAP_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_REDUCE_UNFINISHED_THRESHOLD_KEY, 0);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    //schedule some tasks
    taskAttemptID[0] = job.findMapTask(trackers[0]);
    taskAttemptID[1] = job.findMapTask(trackers[0]);
    taskAttemptID[2] = job.findMapTask(trackers[0]);
    taskAttemptID[3] = job.findMapTask(trackers[1]);
    taskAttemptID[4] = job.findMapTask(trackers[1]);
    taskAttemptID[5] = job.findMapTask(trackers[1]);
    taskAttemptID[6] = job.findMapTask(trackers[2]);
    taskAttemptID[7] = job.findMapTask(trackers[2]);
    taskAttemptID[8] = job.findMapTask(trackers[2]);

    clock.advance(1000);
    //Some tasks finish in 1 second (on trackers[0])
    job.finishTask(taskAttemptID[0]);
    job.finishTask(taskAttemptID[1]);
    job.finishTask(taskAttemptID[2]);

    clock.advance(1000);
    //Some tasks finish in 2 seconds (on trackers[1])
    job.finishTask(taskAttemptID[3]);
    job.finishTask(taskAttemptID[4]);
    job.finishTask(taskAttemptID[5]);

    assertEquals("Tracker " + trackers[0] + " expected not to be slow ",
        job.isSlowTracker(trackers[0]), false);

    clock.advance(100000);
    //After a long time, some tasks finished on trackers[2]
    job.finishTask(taskAttemptID[6]);
    job.finishTask(taskAttemptID[7]);
    job.finishTask(taskAttemptID[8]);

    job.refresh(clock.getTime());
    assertEquals("Tracker " + trackers[2] + " expected to be slow ",
        job.isSlowTracker(trackers[2]), true);
  }

  public void testTaskToSpeculate() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[6];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(5);
    conf.setNumReduceTasks(5);
    conf.setFloat(JobInProgress.SPECULATIVE_SLOWNODE_THRESHOLD, 100f);
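    // (assumption: such a high slow-node threshold effectively keeps any
    // tracker from being classified as slow here, so speculation in this
    // test is driven purely by per-task slowness.)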
    conf.setFloat(JobInProgress.SPECULATIVE_SLOWTASK_THRESHOLD, 0.5f);
    conf.setFloat(JobInProgress.SPECULATIVE_MAP_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_REDUCE_UNFINISHED_THRESHOLD_KEY, 0);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    //schedule reduces
    taskAttemptID[0] = job.findReduceTask(trackers[0]);
    taskAttemptID[1] = job.findReduceTask(trackers[1]);
    taskAttemptID[2] = job.findReduceTask(trackers[2]);
    taskAttemptID[3] = job.findReduceTask(trackers[3]);
    taskAttemptID[4] = job.findReduceTask(trackers[3]);

    clock.advance(5000);
    job.finishTask(taskAttemptID[0]);
    clock.advance(1000);
    job.finishTask(taskAttemptID[1]);
    clock.advance(20000);
    clock.advanceBySpeculativeLag();
    job.refresh(clock.getTime());

    //we should get a speculative task now
    taskAttemptID[5] = job.findReduceTask(trackers[4]);
    assertEquals(taskAttemptID[5].getTaskID().getId(), 2);

    clock.advance(5000);
    job.finishTask(taskAttemptID[5]);

    job.refresh(clock.getTime());
    taskAttemptID[5] = job.findReduceTask(trackers[4]);
    assertEquals(taskAttemptID[5].getTaskID().getId(), 3);
  }

  /**
   * tests that speculation still happens when the raw stddev of the progress
   * rates would exceed their mean, by relying on the stddev/mean cap
   * (SPECULATIVE_STDDEVMEANRATIO_MAX) on the remaining-time estimate
   */
  public void testTaskSpeculationStddevCap() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[8];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setFloat(JobInProgress.SPECULATIVE_STDDEVMEANRATIO_MAX, 0.33f);
    conf.setNumMapTasks(7);
    conf.setNumReduceTasks(0);
    conf.setFloat(JobInProgress.SPECULATIVE_MAP_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_REDUCE_UNFINISHED_THRESHOLD_KEY, 0);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    // all but one of the tasks start off
    taskAttemptID[0] = job.findMapTask(trackers[0]);
    taskAttemptID[1] = job.findMapTask(trackers[1]);
    taskAttemptID[2] = job.findMapTask(trackers[2]);
    taskAttemptID[3] = job.findMapTask(trackers[0]);
    taskAttemptID[4] = job.findMapTask(trackers[1]);
    taskAttemptID[5] = job.findMapTask(trackers[2]);

    // 3 tasks finish really fast in 15s
    clock.advance(15 * 1000);
    job.finishTask(taskAttemptID[0]);
    job.finishTask(taskAttemptID[1]);
    job.finishTask(taskAttemptID[2]);

    // advance to 600s and schedule the last mapper
    clock.advance(585 * 1000);
    taskAttemptID[6] = job.findMapTask(trackers[0]);

    // advance another 10 minutes and report progress
    clock.advance(10 * 60 * 1000);

    // set progress rates
    job.progressMade(taskAttemptID[3], 0.2f);
    job.progressMade(taskAttemptID[4], 0.5f);
    job.progressMade(taskAttemptID[5], 0.6f);
    job.progressMade(taskAttemptID[6], 0.02f);

    // the progress has been set in such a way that
    // stddev > mean. now we depend on stddev capping
    // for speculation.
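    // (put differently: without the 0.33 cap, the large spread in progress
    // rates would presumably keep any attempt from standing out as slow;
    // with the cap, findMapTask below should hand out a speculative attempt,
    // and the test fails if it comes back null.)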
    job.refresh(clock.getTime());
    taskAttemptID[7] = job.findMapTask(trackers[1]);

    // a speculative map task should be found now
    if (taskAttemptID[7] == null)
      Assert.fail();
  }

  /*
   * Tests the fact that we choose tasks with lesser progress
   * among the possible candidates for speculation
   */
  public void testTaskLATEScheduling() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[20];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(5);
    conf.setNumReduceTasks(0);
    conf.setFloat(JobInProgress.SPECULATIVE_MAP_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_REDUCE_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_SLOWTASK_THRESHOLD, 0.5f);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    taskAttemptID[0] = job.findMapTask(trackers[0]);
    taskAttemptID[1] = job.findMapTask(trackers[1]);
    taskAttemptID[2] = job.findMapTask(trackers[2]);
    taskAttemptID[3] = job.findMapTask(trackers[3]);

    clock.advance(2000);
    job.finishTask(taskAttemptID[0]);
    job.finishTask(taskAttemptID[1]);
    job.finishTask(taskAttemptID[2]);

    clock.advance(250000);
    taskAttemptID[4] = job.findMapTask(trackers[3]);
    clock.advanceBySpeculativeLag();
    //by doing the above clock adjustments, we bring the progress rate of
    //taskID 3 lower than 4. For taskID 3, the rate is 85/317000
    //and for taskID 4, the rate is 20/65000. But when we ask for a spec task
    //now, we should get back taskID 4 (since that is expected to complete
    //later than taskID 3).
    job.refresh(clock.getTime());
    job.progressMade(taskAttemptID[3], 0.85f);
    job.progressMade(taskAttemptID[4], 0.20f);
    taskAttemptID[5] = job.findMapTask(trackers[4]);
    assertEquals(taskAttemptID[5].getTaskID().getId(), 4);
  }

  /*
   * Tests the fact that we should not choose tasks for speculation
   * that are slated to finish in a short period of time
   */
  public void testFastTaskScheduling() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[2];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(2);
    conf.setNumReduceTasks(0);
    conf.setFloat(JobInProgress.SPECULATIVE_MAP_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_REDUCE_UNFINISHED_THRESHOLD_KEY, 0);
    conf.setFloat(JobInProgress.SPECULATIVE_SLOWTASK_THRESHOLD, 0.5f);
    conf.setMapSpeculativeDuration(300L);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    // a really fast task #1
    taskAttemptID[0] = job.findMapTask(trackers[0]);
    clock.advance(2000);
    job.finishTask(taskAttemptID[0]);

    // task #2 is slow
    taskAttemptID[1] = job.findMapTask(trackers[1]);
    clock.advanceBySpeculativeLag();
    clock.advance(5000);
    // 65 secs have elapsed since task scheduling
    // set progress so that it will complete within 300 seconds
    job.progressMade(taskAttemptID[1], 0.7f);

    // no new map task should be found
    job.refresh(clock.getTime());
    assertEquals(job.findMapTask(trackers[2]), null);
  }

  /*
   * Tests the fact that we only launch a limited number of speculative tasks,
   * even though we have a lot of tasks in RUNNING state
   */
  public void testAtSpeculativeCap() throws IOException {
    //The expr which is evaluated for determining whether
    //atSpeculativeCap should return true or false is
    //(#speculative-tasks < max(10, 0.01*#slots, 0.1*#running-tasks))

    //Tests the fact that the max tasks launched is 0.1 * #running-tasks
    assertEquals(speculativeCap(1200, 800, 1000), 40);
    //Tests the fact that the max tasks launched is 10
    assertEquals(speculativeCap(1200, 1150, 800), 10);
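    // (worked numbers for the two asserts above: 1200 - 800 = 400 attempts are
    // still running, so the 0.1 * #running-tasks term gives a cap of 40;
    // with only 1200 - 1150 = 50 running, the floor of 10 in max() dominates.)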
    //Tests the fact that the max tasks launched is 0.01 * #slots
    assertEquals(speculativeCap(1200, 1150, 4000), 20);
  }

  private int speculativeCap(int totalTasks, int numEarlyComplete, int slots)
      throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[1500];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(totalTasks);
    conf.setNumReduceTasks(0);
    jobTracker.setNumSlots(slots);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();
    int i;
    for (i = 0; i < totalTasks; i++) {
      taskAttemptID[i] = job.findMapTask(trackers[0]);
    }
    clock.advance(5000);
    for (i = 0; i < numEarlyComplete; i++) {
      job.finishTask(taskAttemptID[i]);
    }
    clock.advanceBySpeculativeLag();

    for (i = numEarlyComplete; i < totalTasks; i++) {
      job.progressMade(taskAttemptID[i], 0.85f);
    }
    clock.advance(50000);

    for (i = 0; i < (totalTasks - numEarlyComplete); i++) {
      job.refresh(clock.getTime());
      taskAttemptID[i] = job.findMapTask(trackers[1]);
      clock.advance(2000);
      if (taskAttemptID[i] != null) {
        //add some good progress constantly for the different task-attempts
        //so that the tasktracker doesn't get into the slow trackers category
        job.progressMade(taskAttemptID[i], 0.99f);
      } else {
        break;
      }
    }
    return i;
  }

  public void testSpeculateLastTask() throws Exception {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[8];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(3);
    conf.setNumReduceTasks(3);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    taskAttemptID[0] = job.findMapTask(trackers[0]);
    taskAttemptID[1] = job.findMapTask(trackers[1]);
    taskAttemptID[2] = job.findMapTask(trackers[2]);

    clock.advanceBySpeculativeLag();
    job.finishTask(taskAttemptID[0]);
    job.finishTask(taskAttemptID[1]);

    // Speculate the last unfinished map task
    job.refresh(clock.getTime());
    taskAttemptID[3] = job.findMapTask(trackers[3]);
    Assert.assertNotNull(taskAttemptID[3]);

    job.finishTask(taskAttemptID[2]);
    job.finishTask(taskAttemptID[3]);

    taskAttemptID[4] = job.findReduceTask(trackers[0]);
    taskAttemptID[5] = job.findReduceTask(trackers[1]);
    taskAttemptID[6] = job.findReduceTask(trackers[2]);

    clock.advanceBySpeculativeLag();
    job.finishTask(taskAttemptID[4]);
    job.finishTask(taskAttemptID[5]);

    // Speculate the last unfinished reduce task
    job.refresh(clock.getTime());
    taskAttemptID[7] = job.findReduceTask(trackers[3]);
    Assert.assertNotNull(taskAttemptID[7]);

    job.finishTask(taskAttemptID[6]);
    job.finishTask(taskAttemptID[7]);
  }

  public void testSlowMapProgressingRate() throws IOException {
    clock.advance(1000);
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[6];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(3);
    conf.setNumReduceTasks(0);
    //use processing rate for speculation
    conf.setBoolean("mapreduce.job.speculative.using.processing.rate", true);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    //schedule maps
    taskAttemptID[0] = job.findMapTask(trackers[0]);
    taskAttemptID[1] = job.findMapTask(trackers[1]);
    taskAttemptID[2] = job.findMapTask(trackers[2]);

    clock.advance(1000);
    job.finishTask(taskAttemptID[0]);

    //if we consider the progress rate, we should speculate task 1,
    //but if we consider the processing rate, which is map_input_bytes/time,
    //then we should speculate task 2
    job.processingRate(taskAttemptID[1], Task.Counter.MAP_INPUT_BYTES,
        100000000, 0.1f, TaskStatus.Phase.MAP);
    job.processingRate(taskAttemptID[2], Task.Counter.MAP_INPUT_BYTES,
        1000, 0.5f, TaskStatus.Phase.MAP);
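    // (both attempts have been running for the same wall-clock time, so by
    // bytes processed per second attempt 2 (1000 bytes) is far behind
    // attempt 1 (100000000 bytes), even though its 0.5 progress looks better
    // than attempt 1's 0.1.)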
    clock.advanceBySpeculativeLag();
    //we should get a speculative task now
    job.refresh(clock.getTime());
    taskAttemptID[3] = job.findMapTask(trackers[0]);
    assertEquals(taskAttemptID[3].getTaskID().getId(), 2);
  }

  public void testSlowReduceProgressingRate() throws IOException {
    TaskAttemptID[] taskAttemptID = new TaskAttemptID[6];
    JobConf conf = new JobConf();
    conf.setSpeculativeExecution(true);
    conf.setNumMapTasks(4);
    conf.setNumReduceTasks(4);
    //use processing rate for speculation
    conf.setBoolean("mapreduce.job.speculative.using.processing.rate", true);
    FakeJobInProgress job = new FakeJobInProgress(conf, jobTracker);
    job.initTasks();

    //schedule reduces
    taskAttemptID[0] = job.findReduceTask(trackers[0]);
    taskAttemptID[1] = job.findReduceTask(trackers[1]);
    taskAttemptID[2] = job.findReduceTask(trackers[2]);
    taskAttemptID[3] = job.findReduceTask(trackers[3]);

    clock.advance(1000);

    //task 0 just starts copying, while tasks 1, 2, 3 are already in the
    //reducing phase. If we compared progress rates, we would speculate task 0.
    //However, by comparing the processing rate in the copy phase, among all 4
    //tasks, task 0 is fast, and we should not speculate it.
    //Tasks 1, 2, 3 are all in the reducing phase with the same progress,
    //but task 1 has a smaller processing rate (the statistics of the reduce
    //phase for all the tasks will also include statistics for task 0, whose
    //processing rate is 0)
    job.finishCopy(taskAttemptID[1], clock.getTime(), 10000);
    job.finishCopy(taskAttemptID[2], clock.getTime(), 10000);
    job.finishCopy(taskAttemptID[3], clock.getTime(), 10000);
    clock.advance(1000);
    job.finishSort(taskAttemptID[1], clock.getTime());
    job.finishSort(taskAttemptID[2], clock.getTime());
    job.finishSort(taskAttemptID[3], clock.getTime());
    job.processingRate(taskAttemptID[0], Task.Counter.REDUCE_SHUFFLE_BYTES,
        100000000, 0.1f, TaskStatus.Phase.SHUFFLE);
    job.processingRate(taskAttemptID[1], Task.Counter.REDUCE_INPUT_BYTES,
        1000, 0.8f, TaskStatus.Phase.REDUCE);
    job.processingRate(taskAttemptID[2], Task.Counter.REDUCE_INPUT_BYTES,
        100000000, 0.8f, TaskStatus.Phase.REDUCE);
    job.processingRate(taskAttemptID[3], Task.Counter.REDUCE_INPUT_BYTES,
        100000000, 0.8f, TaskStatus.Phase.REDUCE);

    clock.advanceBySpeculativeLag();
    //we should get a speculative task now
    job.refresh(clock.getTime());
    taskAttemptID[4] = job.findReduceTask(trackers[4]);
    assertEquals(taskAttemptID[4].getTaskID().getId(), 1);
  }
}