/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.AfterClass;
import org.junit.Test;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.test.system.MRCluster;
import org.apache.hadoop.mapreduce.test.system.JTProtocol;
import org.apache.hadoop.mapreduce.test.system.JobInfo;
import org.apache.hadoop.mapreduce.test.system.TaskInfo;
import org.apache.hadoop.mapreduce.test.system.TTClient;
import org.apache.hadoop.mapreduce.test.system.JTClient;
import org.apache.hadoop.mapreduce.test.system.FinishTaskControlAction;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.examples.SleepJob;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

/**
 * A system test for verifying job and task status after killing or
 * failing tasks under different conditions.
 */
public class TestTaskKilling {
  private static final Log LOG = LogFactory.getLog(TestTaskKilling.class);
  private static MRCluster cluster;
  private static JobClient jobClient = null;
  private static JTClient jtClient = null;
  private static JTProtocol remoteJTClient = null;
  private static Configuration conf = new Configuration();

  @BeforeClass
  public static void before() throws Exception {
    cluster = MRCluster.createCluster(conf);
    cluster.setUp();
    jtClient = cluster.getJTClient();
    jobClient = jtClient.getClient();
    remoteJTClient = jtClient.getProxy();
  }

  @AfterClass
  public static void after() throws Exception {
    cluster.tearDown();
  }

  /**
   * Verifies that a running job still succeeds after one of its
   * task attempts is failed.
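   * <p>
   * Flow: submit a {@link SleepJob}, wait for a map task attempt to start,
   * fail that attempt through
   * {@link RunningJob#killTask(TaskAttemptID, boolean)}, and assert that
   * the job still completes in the SUCCEEDED state.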
   */
  @Test
  public void testFailedTaskJobStatus()
      throws IOException, InterruptedException {
    conf = remoteJTClient.getDaemonConf();
    TaskInfo taskInfo = null;
    SleepJob job = new SleepJob();
    job.setConf(conf);
    JobConf jobConf = job.setupJobConf(1, 1, 10000, 4000, 100, 100);
    RunningJob runJob = jobClient.submitJob(jobConf);
    JobID jobId = runJob.getID();
    JobInfo jInfo = remoteJTClient.getJobInfo(jobId);
    Assert.assertTrue("Job did not start within 1 min.",
        jtClient.isJobStarted(jobId));
    TaskInfo[] taskInfos = remoteJTClient.getTaskInfo(jobId);
    for (TaskInfo taskinfo : taskInfos) {
      if (!taskinfo.isSetupOrCleanup() && taskinfo.getTaskID().isMap()) {
        taskInfo = taskinfo;
        break;
      }
    }
    Assert.assertTrue("Task did not start within 1 min.",
        jtClient.isTaskStarted(taskInfo));

    // Fail the running task attempt.
    RunningJob networkJob = jobClient.getJob(jobId);
    TaskID tID = TaskID.downgrade(taskInfo.getTaskID());
    TaskAttemptID taskAttID = new TaskAttemptID(tID, 0);
    networkJob.killTask(taskAttID, true);

    LOG.info("Waiting till the job is completed...");
    while (!jInfo.getStatus().isJobComplete()) {
      UtilsForTests.waitFor(100);
      jInfo = remoteJTClient.getJobInfo(jobId);
    }
    Assert.assertEquals("Job has not succeeded after failing a task attempt.",
        JobStatus.SUCCEEDED, jInfo.getStatus().getRunState());
  }

  /**
   * Verifies that the task's temporary output directory is cleaned up
   * after the task is killed.
   */
  @Test
  public void testDirCleanupAfterTaskKilled()
      throws IOException, InterruptedException {
    TaskInfo taskInfo = null;
    boolean isTempFolderExists = false;
    String localTaskDir = null;
    TTClient ttClient = null;
    FileStatus[] filesStatus = null;
    Path inputDir = new Path("input");
    Path outputDir = new Path("output");
    Configuration conf = new Configuration(cluster.getConf());
    JobConf jconf = new JobConf(conf);
    jconf.setJobName("Word Count");
    jconf.setJarByClass(WordCount.class);
    jconf.setMapperClass(WordCount.MapClass.class);
    jconf.setCombinerClass(WordCount.Reduce.class);
    jconf.setReducerClass(WordCount.Reduce.class);
    jconf.setNumMapTasks(1);
    jconf.setNumReduceTasks(1);
    jconf.setOutputKeyClass(Text.class);
    jconf.setOutputValueClass(IntWritable.class);

    cleanup(inputDir, conf);
    cleanup(outputDir, conf);
    createInput(inputDir, conf);
    FileInputFormat.setInputPaths(jconf, inputDir);
    FileOutputFormat.setOutputPath(jconf, outputDir);

    RunningJob runJob = jobClient.submitJob(jconf);
    JobID id = runJob.getID();
    JobInfo jInfo = remoteJTClient.getJobInfo(id);
    Assert.assertTrue("Job did not start within 1 min.",
        jtClient.isJobStarted(id));

    JobStatus[] jobStatus = jobClient.getAllJobs();
    String userName = jobStatus[0].getUsername();
    TaskInfo[] taskInfos = remoteJTClient.getTaskInfo(id);
    for (TaskInfo taskinfo : taskInfos) {
      if (!taskinfo.isSetupOrCleanup() && taskinfo.getTaskID().isMap()) {
        taskInfo = taskinfo;
        break;
      }
    }
    Assert.assertTrue("Task did not start within 1 min.",
        jtClient.isTaskStarted(taskInfo));

    TaskID tID = TaskID.downgrade(taskInfo.getTaskID());
    FinishTaskControlAction action = new FinishTaskControlAction(tID);

    // Wait until the task reports the tracker it is running on.
    String[] taskTrackers = taskInfo.getTaskTrackers();
    int counter = 0;
    TaskInfo prvTaskInfo = taskInfo;
    while (counter++ < 30) {
      if (taskTrackers.length > 0) {
        break;
      } else {
        UtilsForTests.waitFor(100);
        taskInfo = remoteJTClient.getTaskInfo(taskInfo.getTaskID());
        if (taskInfo == null) {
          taskInfo = prvTaskInfo;
        } else {
          prvTaskInfo = taskInfo;
        }
        taskTrackers = taskInfo.getTaskTrackers();
      }
    }
    Assert.assertTrue("TaskTracker is not found.", taskTrackers.length > 0);

    // Extract the host name from the tracker name (tracker_<host>:...).
    String hostName = taskTrackers[0].split("_")[1];
    hostName = hostName.split(":")[0];
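
    // Connect to the TaskTracker that is running the attempt and locate the
    // attempt's temporary directory under its mapred local dirs.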
    ttClient = cluster.getTTClient(hostName);
    String[] localDirs = ttClient.getMapredLocalDirs();
    TaskAttemptID taskAttID = new TaskAttemptID(tID, 0);
    for (String localDir : localDirs) {
      localTaskDir = localDir + "/"
          + TaskTracker.getLocalTaskDir(userName,
              id.toString(), taskAttID.toString());
      filesStatus = ttClient.listStatus(localTaskDir, true);
      if (filesStatus.length > 0) {
        isTempFolderExists = true;
        break;
      }
    }
    Assert.assertTrue("Task attempt directory " + taskAttID
        + " has not been found while the task was running.",
        isTempFolderExists);

    // Kill the running task attempt and then let the task finish.
    RunningJob networkJob = jobClient.getJob(id);
    networkJob.killTask(taskAttID, false);
    ttClient.getProxy().sendAction(action);

    taskInfo = remoteJTClient.getTaskInfo(tID);
    while (taskInfo.getTaskStatus()[0].getRunState()
        == TaskStatus.State.RUNNING) {
      UtilsForTests.waitFor(1000);
      taskInfo = remoteJTClient.getTaskInfo(tID);
    }
    UtilsForTests.waitFor(1000);
    taskInfo = remoteJTClient.getTaskInfo(tID);
    Assert.assertTrue("Task status has not been changed to KILLED.",
        (TaskStatus.State.KILLED == taskInfo.getTaskStatus()[0].getRunState()
         || TaskStatus.State.KILLED_UNCLEAN
            == taskInfo.getTaskStatus()[0].getRunState()));

    // Wait for the attempt's temporary directory to be cleaned up.
    counter = 0;
    while (counter++ < 60) {
      filesStatus = ttClient.listStatus(localTaskDir, true);
      if (filesStatus.length == 0) {
        break;
      } else {
        UtilsForTests.waitFor(100);
      }
    }
    Assert.assertTrue("Task attempt temporary folder has not been cleaned.",
        isTempFolderExists && filesStatus.length == 0);

    UtilsForTests.waitFor(1000);
    jInfo = remoteJTClient.getJobInfo(id);
    LOG.info("Waiting till the job is completed...");
    while (!jInfo.getStatus().isJobComplete()) {
      UtilsForTests.waitFor(100);
      jInfo = remoteJTClient.getJobInfo(id);
    }
  }

  private void cleanup(Path dir, Configuration conf) throws IOException {
    FileSystem fs = dir.getFileSystem(conf);
    fs.delete(dir, true);
  }

  private void createInput(Path inDir, Configuration conf)
      throws IOException {
    String input = "Hadoop is a framework for data-intensive distributed "
        + "applications.\n"
        + "Hadoop enables applications to work with thousands of nodes.";
    FileSystem fs = inDir.getFileSystem(conf);
    if (!fs.mkdirs(inDir)) {
      throw new IOException("Failed to create the input directory: "
          + inDir.toString());
    }
    fs.setPermission(inDir,
        new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    DataOutputStream file = fs.create(new Path(inDir, "data.txt"));
    int i = 0;
    while (i < 1000 * 3000) {
      file.writeBytes(input);
      i++;
    }
    file.close();
  }

  /**
   * Verifies that the task's temporary output directory is cleaned up
   * after the task is failed.
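   * <p>
   * Flow: submit a SleepJob, find a running map attempt and its local
   * working directory on the TaskTracker, fail every attempt of that task
   * until mapred.map.max.attempts is exhausted, and then check that the
   * attempt's temporary directory has been removed.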
   */
  @Test
  public void testDirCleanupAfterTaskFailed()
      throws IOException, InterruptedException {
    TTClient ttClient = null;
    FileStatus[] filesStatus = null;
    String localTaskDir = null;
    TaskInfo taskInfo = null;
    TaskID tID = null;
    boolean isTempFolderExists = false;
    conf = remoteJTClient.getDaemonConf();
    SleepJob job = new SleepJob();
    job.setConf(conf);
    JobConf jobConf = job.setupJobConf(1, 0, 10000, 100, 10, 10);
    RunningJob runJob = jobClient.submitJob(jobConf);
    JobID id = runJob.getID();
    JobInfo jInfo = remoteJTClient.getJobInfo(id);
    Assert.assertTrue("Job did not start within 1 min.",
        jtClient.isJobStarted(id));

    JobStatus[] jobStatus = jobClient.getAllJobs();
    String userName = jobStatus[0].getUsername();
    TaskInfo[] taskInfos = remoteJTClient.getTaskInfo(id);
    for (TaskInfo taskinfo : taskInfos) {
      if (!taskinfo.isSetupOrCleanup() && taskinfo.getTaskID().isMap()) {
        taskInfo = taskinfo;
        break;
      }
    }
    Assert.assertTrue("Task did not start within 1 min.",
        jtClient.isTaskStarted(taskInfo));

    tID = TaskID.downgrade(taskInfo.getTaskID());
    FinishTaskControlAction action = new FinishTaskControlAction(tID);

    // Wait until the task reports the tracker it is running on.
    String[] taskTrackers = taskInfo.getTaskTrackers();
    int counter = 0;
    TaskInfo prvTaskInfo = taskInfo;
    while (counter++ < 30) {
      if (taskTrackers.length > 0) {
        break;
      } else {
        UtilsForTests.waitFor(1000);
        taskInfo = remoteJTClient.getTaskInfo(taskInfo.getTaskID());
        if (taskInfo == null) {
          taskInfo = prvTaskInfo;
        } else {
          prvTaskInfo = taskInfo;
        }
        taskTrackers = taskInfo.getTaskTrackers();
      }
    }
    Assert.assertTrue("TaskTracker is not found.", taskTrackers.length > 0);

    // Extract the host name from the tracker name and locate the attempt's
    // temporary directory under the tracker's mapred local dirs.
    String hostName = taskTrackers[0].split("_")[1];
    hostName = hostName.split(":")[0];
    ttClient = cluster.getTTClient(hostName);
    String[] localDirs = ttClient.getMapredLocalDirs();
    TaskAttemptID taskAttID = new TaskAttemptID(tID, 0);
    for (String localDir : localDirs) {
      localTaskDir = localDir + "/"
          + TaskTracker.getLocalTaskDir(userName,
              id.toString(), taskAttID.toString());
      filesStatus = ttClient.listStatus(localTaskDir, true);
      if (filesStatus.length > 0) {
        isTempFolderExists = true;
        break;
      }
    }
    Assert.assertTrue("Task attempt directory " + taskAttID
        + " has not been found while the task was running.",
        isTempFolderExists);

    // Fail every attempt of the task until mapred.map.max.attempts
    // is exhausted.
    boolean isFailTask = false;
    int MAX_MAP_TASK_ATTEMPTS = Integer.parseInt(
        jobConf.get("mapred.map.max.attempts"));
    if (!isFailTask) {
      TaskID taskId = TaskID.downgrade(taskInfo.getTaskID());
      while (taskInfo.numFailedAttempts() < MAX_MAP_TASK_ATTEMPTS) {
        RunningJob networkJob = jobClient.getJob(id);
        networkJob.killTask(taskAttID, true);
        taskInfo = remoteJTClient.getTaskInfo(taskInfo.getTaskID());
        taskAttID = new TaskAttemptID(taskId, taskInfo.numFailedAttempts());
      }
      isFailTask = true;
    }

    ttClient.getProxy().sendAction(action);
    taskInfo = remoteJTClient.getTaskInfo(tID);
    Assert.assertTrue("Task status has not been changed to FAILED.",
        TaskStatus.State.FAILED == taskInfo.getTaskStatus()[0].getRunState()
        || TaskStatus.State.FAILED_UNCLEAN
           == taskInfo.getTaskStatus()[0].getRunState());

    UtilsForTests.waitFor(1000);
    filesStatus = ttClient.listStatus(localTaskDir, true);
    Assert.assertTrue("Temporary folder has not been cleaned up.",
        filesStatus.length == 0);

    UtilsForTests.waitFor(1000);
    jInfo = remoteJTClient.getJobInfo(id);
    LOG.info("Waiting till the job is completed...");
    while (!jInfo.getStatus().isJobComplete()) {
      UtilsForTests.waitFor(100);
      jInfo = remoteJTClient.getJobInfo(id);
    }
  }

  /**
   * Verifies that a job is marked FAILED after all the attempts of one of
   * its tasks are killed.
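   * <p>
   * One map task is picked and each of its attempts is killed as soon as it
   * starts running, up to mapred.map.max.attempts times; the job's final
   * run state is then expected to be FAILED.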
   */
  @Test
  public void testAllTaskAttemptKill() throws Exception {
    Configuration conf = new Configuration(cluster.getConf());
    JobStatus[] jobStatus = null;
    SleepJob job = new SleepJob();
    job.setConf(conf);
    conf = job.setupJobConf(2, 1, 40000, 1000, 100, 100);
    JobConf jconf = new JobConf(conf);

    // Submit the job.
    RunningJob rJob = cluster.getJTClient().getClient().submitJob(jconf);
    int MAX_MAP_TASK_ATTEMPTS = Integer.parseInt(
        jconf.get("mapred.map.max.attempts"));
    LOG.info("MAX_MAP_TASK_ATTEMPTS is : " + MAX_MAP_TASK_ATTEMPTS);
    Assert.assertTrue(MAX_MAP_TASK_ATTEMPTS > 0);

    JobInfo jInfo = remoteJTClient.getJobInfo(rJob.getID());
    // Make sure the job info is available.
    Assert.assertNotNull(jInfo.getStatus().getRunState());

    // Wait for the job to start running.
    while (jInfo.getStatus().getRunState() != JobStatus.RUNNING) {
      try {
        Thread.sleep(10000);
      } catch (InterruptedException e) {
      }
      jInfo = remoteJTClient.getJobInfo(rJob.getID());
    }

    // Temporarily store the job id to use it later for comparison.
    JobID jobidStore = rJob.getID();
    jobidStore = JobID.downgrade(jobidStore);
    LOG.info("job id is :" + jobidStore.toString());

    TaskInfo[] taskInfos = null;
    // After making sure that the job is running, the test has to make sure
    // that at least one task has started running before continuing.
    boolean runningCount = false;
    int count = 0;
    do {
      taskInfos = cluster.getJTClient().getProxy()
          .getTaskInfo(rJob.getID());
      runningCount = false;
      for (TaskInfo taskInfo : taskInfos) {
        TaskStatus[] taskStatuses = taskInfo.getTaskStatus();
        if (taskStatuses.length > 0) {
          LOG.info("taskStatuses[0].getRunState() is :"
              + taskStatuses[0].getRunState());
          if (taskStatuses[0].getRunState() == TaskStatus.State.RUNNING) {
            runningCount = true;
            break;
          } else {
            LOG.info("Sleeping 5 seconds");
            Thread.sleep(5000);
          }
        }
      }
      count++;
      // If the count goes beyond a point, then break; this is to avoid an
      // infinite loop under unforeseen circumstances. The test case will
      // anyway fail later.
      if (count > 10) {
        Assert.fail("Since the sleep count has reached beyond a point, "
            + "failing at this point");
      }
    } while (!runningCount);

    // This block picks one task and kills it MAX_MAP_TASK_ATTEMPTS times,
    // whenever it re-attempts to run.
    String taskIdKilled = null;
    for (int i = 0; i < MAX_MAP_TASK_ATTEMPTS; i++) {
      taskInfos = cluster.getJTClient().getProxy()
          .getTaskInfo(rJob.getID());
      for (TaskInfo taskInfo : taskInfos) {
        TaskAttemptID taskAttemptID;
        if (!taskInfo.isSetupOrCleanup()) {
          // This is the task which is going to be killed continuously in
          // all its task attempts. The first task is picked up.
          TaskID taskid = TaskID.downgrade(taskInfo.getTaskID());
          LOG.info("taskid is :" + taskid);
          if (i == 0) {
            taskIdKilled = taskid.toString();
            taskAttemptID = new TaskAttemptID(taskid, i);
            LOG.info("taskAttemptid going to be killed is : "
                + taskAttemptID);
            rJob.killTask(taskAttemptID, true);
            checkTaskCompletionEvent(taskAttemptID, jInfo);
            break;
          } else {
            if (taskIdKilled.equals(taskid.toString())) {
              taskAttemptID = new TaskAttemptID(taskid, i);
              // Make sure that the task is midway and then kill it.
              UtilsForTests.waitFor(20000);
              LOG.info("taskAttemptid going to be killed is : "
                  + taskAttemptID);
              rJob.killTask(taskAttemptID, true);
              checkTaskCompletionEvent(taskAttemptID, jInfo);
              break;
            }
          }
        }
      }
    }

    // Making sure that the job is complete.
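    // The job is polled every 10 seconds until it reports completion; the
    // repeated attempt kills above should eventually drive it to FAILED.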
    while (jInfo != null && !jInfo.getStatus().isJobComplete()) {
      Thread.sleep(10000);
      jInfo = remoteJTClient.getJobInfo(rJob.getID());
    }

    // Making sure that the correct job status is obtained from all the jobs.
    jobStatus = jobClient.getAllJobs();
    JobStatus jobStatusFound = null;
    for (JobStatus jobStatusTmp : jobStatus) {
      if (JobID.downgrade(jobStatusTmp.getJobID()).equals(jobidStore)) {
        jobStatusFound = jobStatusTmp;
        LOG.info("jobStatus found is :"
            + jobStatusFound.getJobId().toString());
      }
    }

    // Making sure that the job has FAILED.
    Assert.assertEquals("The job should have failed at this stage",
        JobStatus.FAILED, jobStatusFound.getRunState());
  }

  // This method checks if the task attempt id occurs in the list of tasks
  // that are completed (killed) for a job. This is required because after
  // issuing a kill command, the task has to be killed and appear in the
  // TaskCompletionEvent list. After this a new task attempt will start
  // running in a matter of a few seconds.
  public void checkTaskCompletionEvent(TaskAttemptID taskAttemptID,
      JobInfo jInfo) throws Exception {
    boolean match = false;
    int count = 0;
    while (!match) {
      org.apache.hadoop.mapreduce.JobID temp = jInfo.getID();
      RunningJob rJob = jobClient.getJob(
          new JobID(temp.getJtIdentifier(), temp.getId()));
      TaskCompletionEvent[] taskCompletionEvents =
          rJob.getTaskCompletionEvents(0);
      for (TaskCompletionEvent taskCompletionEvent : taskCompletionEvents) {
        LOG.info("taskCompletionEvent.getTaskAttemptId().toString() is : "
            + taskCompletionEvent.getTaskAttemptId().toString());
        LOG.info("compared to taskAttemptID.toString() :"
            + taskAttemptID.toString());
        if ((taskCompletionEvent.getTaskAttemptId().toString())
            .equals(taskAttemptID.toString())) {
          match = true;
          // Sleeping for 10 seconds, giving time for the next task
          // attempt to run.
          Thread.sleep(10000);
          break;
        }
      }
      if (!match) {
        LOG.info("Thread is sleeping for 10 seconds");
        Thread.sleep(10000);
        count++;
      }
      // If the count goes beyond a point, then break; this is to avoid an
      // infinite loop under unforeseen circumstances. The test case will
      // anyway fail later.
      if (count > 10) {
        Assert.fail("Since the task attempt id is not appearing in the "
            + "TaskCompletionEvent, it seems this task attempt was not "
            + "killed");
      }
    }
  }
}