/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapreduce.test.system.JTProtocol;
import org.apache.hadoop.mapreduce.test.system.JobInfo;
import org.apache.hadoop.mapreduce.test.system.MRCluster;
import org.apache.hadoop.mapreduce.test.system.TTClient;
import org.apache.hadoop.mapreduce.test.system.JTClient;
import org.apache.hadoop.mapreduce.test.system.TTProtocol;
import org.apache.hadoop.mapreduce.test.system.TTTaskInfo;
import org.apache.hadoop.mapreduce.test.system.TaskInfo;
import org.apache.hadoop.mapreduce.test.system.FinishTaskControlAction;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Assert;
import org.junit.Test;
import java.io.IOException;
import java.io.DataOutputStream;
import java.util.Collection;
import testjar.GenerateTaskChildProcess;
import java.util.Hashtable;
/**
* Submits a job that spawns child processes and verifies that
* the task's child processes are cleaned up after the job is
* killed, or after a task is killed or fails.
*/
public class TestTaskChildsKilling {
private static final Log LOG = LogFactory
.getLog(TestTaskChildsKilling.class);
private static MRCluster cluster;
private static Path inputDir = new Path("input");
private static Path outputDir = new Path("output");
private static Configuration conf = new Configuration();
private static String confFile = "mapred-site.xml";
@BeforeClass
public static void before() throws Exception {
Hashtable<String,Object> prop = new Hashtable<String,Object>();
prop.put("mapred.map.max.attempts", 1L);
prop.put("mapreduce.job.complete.cancel.delegation.tokens",false);
String [] expExcludeList = {"java.net.ConnectException",
"java.io.IOException"};
cluster = MRCluster.createCluster(conf);
cluster.setExcludeExpList(expExcludeList);
cluster.setUp();
cluster.restartClusterWithNewConfig(prop, confFile);
UtilsForTests.waitFor(1000);
conf = cluster.getJTClient().getProxy().getDaemonConf();
createInput(inputDir, conf);
}
@AfterClass
public static void after() throws Exception {
cleanup(inputDir, conf);
cleanup(outputDir, conf);
cluster.tearDown();
cluster.restart();
UtilsForTests.waitFor(1000);
}
/**
* Verifying the process tree cleanup of a task after task is killed
* by using -kill-task option.
*/
@Test
public void testProcessTreeCleanupOfKilledTask1() throws
Exception {
TaskInfo taskInfo = null;
TaskID tID = null;
TTTaskInfo [] ttTaskinfo = null;
String pid = null;
TTProtocol ttIns = null;
TTClient ttClientIns = null;
int counter = 0;
JobConf jobConf = new JobConf(conf);
jobConf.setJobName("Message Display");
jobConf.setJarByClass(GenerateTaskChildProcess.class);
jobConf.setMapperClass(GenerateTaskChildProcess.StrDisplayMapper.class);
jobConf.setNumMapTasks(1);
jobConf.setNumReduceTasks(0);
jobConf.setMaxMapAttempts(1);
cleanup(outputDir, conf);
FileInputFormat.setInputPaths(jobConf, inputDir);
FileOutputFormat.setOutputPath(jobConf, outputDir);
JTClient jtClient = cluster.getJTClient();
JobClient client = jtClient.getClient();
JTProtocol wovenClient = cluster.getJTClient().getProxy();
RunningJob runJob = client.submitJob(jobConf);
JobID id = runJob.getID();
JobInfo jInfo = wovenClient.getJobInfo(id);
Assert.assertNotNull("Job information is null",jInfo);
Assert.assertTrue("Job has not been started for 1 min.",
jtClient.isJobStarted(id));
TaskInfo[] taskInfos = wovenClient.getTaskInfo(id);
for (TaskInfo taskinfo : taskInfos) {
if (!taskinfo.isSetupOrCleanup()) {
taskInfo = taskinfo;
break;
}
}
Assert.assertTrue("Task has not been started for 1 min.",
jtClient.isTaskStarted(taskInfo));
tID = TaskID.downgrade(taskInfo.getTaskID());
TaskAttemptID tAttID = new TaskAttemptID(tID,0);
FinishTaskControlAction action = new FinishTaskControlAction(tID);
Collection<TTClient> ttClients = cluster.getTTClients();
for (TTClient ttClient : ttClients) {
TTProtocol tt = ttClient.getProxy();
tt.sendAction(action);
ttTaskinfo = tt.getTasks();
for (TTTaskInfo tttInfo : ttTaskinfo) {
if (!tttInfo.isTaskCleanupTask()) {
pid = tttInfo.getPid();
ttClientIns = ttClient;
ttIns = tt;
break;
}
}
if (ttClientIns != null) {
break;
}
}
Assert.assertTrue("Map process tree is not alive before task kills.",
ttIns.isProcessTreeAlive(pid));
String args[] = new String[] { "-kill-task", tAttID.toString() };
int exitCode = runTool(jobConf, client, args);
Assert.assertEquals("Exit Code:", 0, exitCode);
LOG.info("Waiting till the task is killed...");
counter = 0;
while (counter < 30) {
if (taskInfo.getTaskStatus().length > 0) {
if (taskInfo.getTaskStatus()[0].getRunState() ==
TaskStatus.State.KILLED) {
break;
}
}
UtilsForTests.waitFor(1000);
taskInfo = wovenClient.getTaskInfo(taskInfo.getTaskID());
counter ++;
}
runJob.killJob();
LOG.info("Waiting till the job is completed...");
counter = 0;
while (counter < 60) {
if (jInfo.getStatus().isJobComplete()) {
break;
}
UtilsForTests.waitFor(1000);
jInfo = wovenClient.getJobInfo(id);
counter ++;
}
Assert.assertTrue("Job has not been completed for 1 min.",
counter != 60 );
ttIns = ttClientIns.getProxy();
UtilsForTests.waitFor(1000);
Assert.assertTrue("Map process is still alive after task has been killed.",
!ttIns.isProcessTreeAlive(pid));
}
/**
* Verifying the process tree cleanup of a particular task
* after task is killed.
*/
@Test
public void testProcessTreeCleanupOfKilledTask2() throws
IOException {
TaskInfo taskInfo = null;
TaskID tID = null;
TaskAttemptID taskAttID = null;
TTTaskInfo [] ttTaskinfo = null;
String pid = null;
TTProtocol ttIns = null;
TTClient ttClientIns = null;
int counter = 0;
JobConf jobConf = new JobConf(conf);
jobConf.setJobName("Message Display");
jobConf.setJarByClass(GenerateTaskChildProcess.class);
jobConf.setMapperClass(GenerateTaskChildProcess.StrDisplayMapper.class);
jobConf.setNumMapTasks(1);
jobConf.setNumReduceTasks(0);
cleanup(outputDir, conf);
FileInputFormat.setInputPaths(jobConf, inputDir);
FileOutputFormat.setOutputPath(jobConf, outputDir);
JTClient jtClient = cluster.getJTClient();
JobClient client = jtClient.getClient();
JTProtocol wovenClient = cluster.getJTClient().getProxy();
RunningJob runJob = client.submitJob(jobConf);
JobID id = runJob.getID();
JobInfo jInfo = wovenClient.getJobInfo(id);
Assert.assertNotNull("Job information is null", jInfo);
Assert.assertTrue("Job has not been started for 1 min.",
jtClient.isJobStarted(id));
TaskInfo[] taskInfos = wovenClient.getTaskInfo(id);
for (TaskInfo taskinfo : taskInfos) {
if (!taskinfo.isSetupOrCleanup()) {
taskInfo = taskinfo;
break;
}
}
Assert.assertTrue("Task has not been started for 1 min.",
jtClient.isTaskStarted(taskInfo));
tID = TaskID.downgrade(taskInfo.getTaskID());
taskAttID = new TaskAttemptID(tID,0);
FinishTaskControlAction action = new FinishTaskControlAction(tID);
Collection<TTClient> ttClients = cluster.getTTClients();
for (TTClient ttClient : ttClients) {
TTProtocol tt = ttClient.getProxy();
tt.sendAction(action);
ttTaskinfo = tt.getTasks();
for (TTTaskInfo tttInfo : ttTaskinfo) {
if (!tttInfo.isTaskCleanupTask()) {
pid = tttInfo.getPid();
ttClientIns = ttClient;
ttIns = tt;
break;
}
}
if (ttClientIns != null) {
break;
}
}
Assert.assertTrue("Map process is not alive before task kills.",
ttIns.isProcessTreeAlive(pid));
runJob.killTask(taskAttID, false);
LOG.info("Waiting till the task is killed...");
taskInfo = wovenClient.getTaskInfo(taskInfo.getTaskID());
counter = 0;
while (counter < 30) {
if (taskInfo.getTaskStatus()[0].getRunState() ==
TaskStatus.State.KILLED) {
break;
}
UtilsForTests.waitFor(1000);
taskInfo = wovenClient.getTaskInfo(taskInfo.getTaskID());
counter ++;
}
runJob.killJob();
LOG.info("Waiting till the job is completed...");
counter = 0;
while (counter < 60) {
if (jInfo.getStatus().isJobComplete()) {
break;
}
UtilsForTests.waitFor(1000);
jInfo = wovenClient.getJobInfo(id);
counter ++;
}
Assert.assertTrue("Job has not been completed for 1 min.",
counter != 60);
UtilsForTests.waitFor(2000);
ttIns = ttClientIns.getProxy();
Assert.assertTrue("Map process is still alive after task has been killed.",
!ttIns.isProcessTreeAlive(pid));
}
/**
* Verifying the child process tree clean up of a task which fails due
* to an exception.
*/
@Test
public void testProcessTreeCleanupOfFailedTask1() throws IOException {
TaskInfo taskInfo = null;
TaskID tID = null;
TTTaskInfo [] ttTaskinfo = null;
String pid = null;
TTProtocol ttIns = null;
TTClient ttClientIns = null;
int counter = 0;
JobConf jobConf = new JobConf(conf);
jobConf.setJobName("Message Display");
jobConf.setJarByClass(GenerateTaskChildProcess.class);
jobConf.setMapperClass(GenerateTaskChildProcess.FailedMapper.class);
jobConf.setNumMapTasks(1);
jobConf.setNumReduceTasks(0);
cleanup(outputDir, conf);
FileInputFormat.setInputPaths(jobConf, inputDir);
FileOutputFormat.setOutputPath(jobConf, outputDir);
JTClient jtClient = cluster.getJTClient();
JobClient client = jtClient.getClient();
JTProtocol wovenClient = cluster.getJTClient().getProxy();
RunningJob runJob = client.submitJob(jobConf);
JobID id = runJob.getID();
JobInfo jInfo = wovenClient.getJobInfo(id);
Assert.assertNotNull("Job information is null", jInfo);
Assert.assertTrue("Job has not been started for 1 min.",
jtClient.isJobStarted(id));
TaskInfo[] taskInfos = wovenClient.getTaskInfo(id);
for (TaskInfo taskinfo : taskInfos) {
if (!taskinfo.isSetupOrCleanup()) {
taskInfo = taskinfo;
break;
}
}
Assert.assertTrue("Task has not been started for 1 min.",
jtClient.isTaskStarted(taskInfo));
tID = TaskID.downgrade(taskInfo.getTaskID());
FinishTaskControlAction action = new FinishTaskControlAction(tID);
Collection<TTClient> ttClients = cluster.getTTClients();
for (TTClient ttClient : ttClients) {
TTProtocol tt = ttClient.getProxy();
tt.sendAction(action);
ttTaskinfo = tt.getTasks();
for (TTTaskInfo tttInfo : ttTaskinfo) {
if (!tttInfo.isTaskCleanupTask()) {
pid = tttInfo.getPid();
ttClientIns = ttClient;
ttIns = tt;
break;
}
}
if (ttClientIns != null) {
break;
}
}
Assert.assertTrue("Map process is not alive before task fails.",
ttIns.isProcessTreeAlive(pid));
LOG.info("Waiting till the task is failed...");
counter = 0;
while (counter < 60) {
if (taskInfo.getTaskStatus().length > 0) {
if (taskInfo.getTaskStatus()[0].getRunState() ==
TaskStatus.State.FAILED) {
break;
}
}
UtilsForTests.waitFor(1000);
taskInfo = wovenClient.getTaskInfo(taskInfo.getTaskID());
counter++;
}
LOG.info("Waiting till the job is completed...");
counter = 0;
while (counter < 60) {
if (jInfo.getStatus().isJobComplete()) {
break;
}
UtilsForTests.waitFor(1000);
jInfo = wovenClient.getJobInfo(id);
counter ++;
}
Assert.assertTrue("Job has not been completed for 1 min.",
counter != 60);
ttIns = ttClientIns.getProxy();
UtilsForTests.waitFor(2000);
Assert.assertTrue("Map process is still alive after task has been failed.",
!ttIns.isProcessTreeAlive(pid));
}
/**
* Verifying the process tree cleanup of a task after task is failed
* by using -fail-task option.
*/
@Test
public void testProcessTreeCleanupOfFailedTask2() throws
Exception {
TaskInfo taskInfo = null;
TaskID tID = null;
TTTaskInfo [] ttTaskinfo = null;
String pid = null;
TTProtocol ttIns = null;
TTClient ttClientIns = null;
int counter = 0;
JobConf jobConf = new JobConf(conf);
jobConf.setJobName("Message Display");
jobConf.setJarByClass(GenerateTaskChildProcess.class);
jobConf.setMapperClass(GenerateTaskChildProcess.StrDisplayMapper.class);
jobConf.setNumMapTasks(1);
jobConf.setNumReduceTasks(0);
cleanup(outputDir, conf);
FileInputFormat.setInputPaths(jobConf, inputDir);
FileOutputFormat.setOutputPath(jobConf, outputDir);
JTClient jtClient = cluster.getJTClient();
JobClient client = jtClient.getClient();
JTProtocol wovenClient = cluster.getJTClient().getProxy();
RunningJob runJob = client.submitJob(jobConf);
JobID id = runJob.getID();
JobInfo jInfo = wovenClient.getJobInfo(id);
Assert.assertNotNull("Job information is null", jInfo);
Assert.assertTrue("Job has not been started for 1 min.",
jtClient.isJobStarted(id));
TaskInfo[] taskInfos = wovenClient.getTaskInfo(id);
for (TaskInfo taskinfo : taskInfos) {
if (!taskinfo.isSetupOrCleanup()) {
taskInfo = taskinfo;
break;
}
}
Assert.assertTrue("Task has not been started for 1 min.",
jtClient.isTaskStarted(taskInfo));
tID = TaskID.downgrade(taskInfo.getTaskID());
TaskAttemptID tAttID = new TaskAttemptID(tID,0);
FinishTaskControlAction action = new FinishTaskControlAction(tID);
Collection<TTClient> ttClients = cluster.getTTClients();
for (TTClient ttClient : ttClients) {
TTProtocol tt = ttClient.getProxy();
tt.sendAction(action);
ttTaskinfo = tt.getTasks();
for (TTTaskInfo tttInfo : ttTaskinfo) {
if (!tttInfo.isTaskCleanupTask()) {
pid = tttInfo.getPid();
ttClientIns = ttClient;
ttIns = tt;
break;
}
}
if (ttClientIns != null) {
break;
}
}
Assert.assertTrue("Map process is not alive before task fails.",
ttIns.isProcessTreeAlive(pid));
String args[] = new String[] { "-fail-task", tAttID.toString() };
int exitCode = runTool(jobConf, client, args);
Assert.assertEquals("Exit Code:", 0, exitCode);
LOG.info("Waiting till the task is failed...");
taskInfo = wovenClient.getTaskInfo(taskInfo.getTaskID());
counter = 0;
while (counter < 60) {
if (taskInfo.getTaskStatus().length > 0) {
if (taskInfo.getTaskStatus()[0].getRunState() ==
TaskStatus.State.FAILED) {
break;
}
}
UtilsForTests.waitFor(1000);
taskInfo = wovenClient.getTaskInfo(taskInfo.getTaskID());
counter ++;
}
counter = 0;
LOG.info("Waiting till the job is completed...");
while (counter < 60) {
if (jInfo.getStatus().isJobComplete()) {
break;
}
UtilsForTests.waitFor(1000);
jInfo = wovenClient.getJobInfo(id);
counter ++;
}
Assert.assertTrue("Job has not been completed for 1 min",
counter != 60);
ttIns = ttClientIns.getProxy();
UtilsForTests.waitFor(1000);
Assert.assertTrue("Map process is still alive after task has been failed.",
!ttIns.isProcessTreeAlive(pid));
}
private int runTool(Configuration job, Tool tool,
String[] jobArgs) throws Exception {
int returnStatus = ToolRunner.run(job, tool, jobArgs);
return returnStatus;
}
private static void cleanup(Path dir, Configuration conf) throws
IOException {
FileSystem fs = dir.getFileSystem(conf);
fs.delete(dir, true);
}
private static void createInput(Path inDir, Configuration conf) throws
IOException {
String input = "Hadoop is framework for data intensive distributed "
+ "applications.\n Hadoop enables applications"
+ " to work with thousands of nodes.";
FileSystem fs = inDir.getFileSystem(conf);
if (!fs.mkdirs(inDir)) {
throw new IOException("Failed to create the input directory:"
+ inDir.toString());
}
fs.setPermission(inDir, new FsPermission(FsAction.ALL,
FsAction.ALL, FsAction.ALL));
DataOutputStream file = fs.create(new Path(inDir, "data.txt"));
int i = 0;
while(i < 10) {
file.writeBytes(input);
i++;
}
file.close();
}
}