/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred; import java.io.File; import java.io.FileWriter; import java.io.Writer; import java.io.BufferedWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.StringTokenizer; import junit.framework.TestCase; import static org.apache.hadoop.mapred.Task.Counter.SPILLED_RECORDS; import static org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS; import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS; import static org.apache.hadoop.mapred.Task.Counter.COMMITTED_HEAP_BYTES; import org.apache.commons.lang.RandomStringUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; /** * This is an wordcount application that tests job counters. * It generates simple text input files. 
Then
 * runs the wordcount map/reduce application on (1) 3 i/p files (with 3 maps
 * and 1 reduce) and verifies the counters and (2) 4 i/p files (with 4 maps
 * and 1 reduce) and verifies counters. Wordcount application reads the
 * text input files, breaks each line into words and counts them. The output
 * is a locally sorted list of words and the count of how often they occurred.
 * A further test runs two jobs on a mini cluster and compares the
 * {@code COMMITTED_HEAP_BYTES} task counter they report.
 */
public class TestJobCounters extends TestCase {

  String TEST_ROOT_DIR = new Path(System.getProperty("test.build.data",
      File.separator + "tmp")).toString().replace(' ', '+');

  // Layout of every generated input file: NUM_LINES lines holding
  // NUM_WORDS_PER_LINE unique words each, the whole set written REPLICAS
  // times (500*4 unique words repeated 5 times => 5*2K words, 2.5k lines).
  private static final int REPLICAS = 5;
  private static final int NUM_LINES = 500;
  private static final int NUM_WORDS_PER_LINE = 4;

  // Common prefix of the generated input file names.
  private static final String INPUT_FILE_PREFIX = "input5_2k_";

  /**
   * Verifies the spill and map record counters of an old-API
   * ({@code org.apache.hadoop.mapred}) job.
   *
   * @param counter the job counters to check
   * @param spillRecCnt expected value of SPILLED_RECORDS
   * @param mapInputRecords expected value of MAP_INPUT_RECORDS
   * @param mapOutputRecords expected value of MAP_OUTPUT_RECORDS
   */
  private void validateMapredCounters(Counters counter, long spillRecCnt,
                                      long mapInputRecords,
                                      long mapOutputRecords) {
    // Check if the number of Spilled Records is same as expected
    assertEquals("Unexpected number of spilled records", spillRecCnt,
                 counter.findCounter(SPILLED_RECORDS).getCounter());
    assertEquals("Unexpected number of map input records", mapInputRecords,
                 counter.findCounter(MAP_INPUT_RECORDS).getCounter());
    assertEquals("Unexpected number of map output records", mapOutputRecords,
                 counter.findCounter(MAP_OUTPUT_RECORDS).getCounter());
  }

  /**
   * Verifies the spill and map record counters of a new-API
   * ({@code org.apache.hadoop.mapreduce}) job.
   *
   * @param counter the job counters to check
   * @param spillRecCnt expected value of SPILLED_RECORDS
   * @param mapInputRecords expected value of MAP_INPUT_RECORDS
   * @param mapOutputRecords expected value of MAP_OUTPUT_RECORDS
   */
  private void validateCounters(org.apache.hadoop.mapreduce.Counters counter,
                                long spillRecCnt,
                                long mapInputRecords,
                                long mapOutputRecords) {
    // Check if the number of Spilled Records is same as expected
    assertEquals("Unexpected number of spilled records", spillRecCnt,
                 counter.findCounter(SPILLED_RECORDS).getValue());
    assertEquals("Unexpected number of map input records", mapInputRecords,
                 counter.findCounter(MAP_INPUT_RECORDS).getValue());
    assertEquals("Unexpected number of map output records", mapOutputRecords,
                 counter.findCounter(MAP_OUTPUT_RECORDS).getValue());
  }

  /**
   * Writes the standard word-count input to the given local file: lines of
   * the form {@code "wordJ wordJ+1 ... "} covering
   * {@code NUM_LINES * NUM_WORDS_PER_LINE} unique words, repeated
   * {@code REPLICAS} times.
   *
   * @param inpFile the local file to (over)write
   * @throws Exception if the file cannot be written
   */
  private void createWordsFile(File inpFile) throws Exception {
    Writer out = new BufferedWriter(new FileWriter(inpFile));
    try {
      for (int i = 0; i < REPLICAS; i++) {
        for (int j = 1; j <= NUM_LINES * NUM_WORDS_PER_LINE;
             j += NUM_WORDS_PER_LINE) {
          StringBuilder line = new StringBuilder();
          for (int w = 0; w < NUM_WORDS_PER_LINE; w++) {
            if (w > 0) {
              line.append(' ');
            }
            line.append("word").append(j + w);
          }
          line.append('\n');
          out.write(line.toString());
        }
      }
    } finally {
      out.close();
    }
  }

  /**
   * Applies the small sort-buffer settings shared by the word-count tests.
   * The expected spill counts asserted by those tests are derived for
   * exactly these values.
   */
  private static void configureSortBuffers(JobConf conf) {
    conf.setInt("io.sort.mb", 1);
    conf.setInt("io.sort.factor", 2);
    conf.set("io.sort.record.percent", "0.05");
    conf.set("io.sort.spill.percent", "0.80");
  }

  /**
   * Recreates {@code testDir} from scratch, creates the "genins" input
   * directory below it and fills it with 3 input files of 5*2k words each.
   *
   * @param fs the file system holding the test directory
   * @param testDir root directory of the test
   * @return the input directory as a string ending with a file separator
   * @throws Exception if a directory or an input file cannot be created
   */
  private String createTestInput(FileSystem fs, Path testDir)
      throws Exception {
    if (fs.exists(testDir)) {
      fs.delete(testDir, true);
    }
    if (!fs.mkdirs(testDir)) {
      throw new IOException("Mkdirs failed to create " + testDir.toString());
    }

    String inDir = testDir + File.separator + "genins" + File.separator;
    Path wordsIns = new Path(inDir);
    if (!fs.mkdirs(wordsIns)) {
      throw new IOException("Mkdirs failed to create " + wordsIns.toString());
    }

    //create 3 input files each with 5*2k words
    for (int i = 1; i <= 3; i++) {
      createWordsFile(new File(inDir + INPUT_FILE_PREFIX + i));
    }
    return inDir;
  }

  /**
   * Runs the old-API word count job with 3 maps, then with 4 maps, then as
   * a map-only job, and verifies the spill and map record counters of each
   * run.
   *
   * @throws Exception if the test data cannot be set up or a job fails
   */
  public void testOldJobWithMapAndReducers() throws Exception {
    JobConf conf = new JobConf(TestJobCounters.class);
    conf.setJobName("wordcount-map-reducers");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(WordCount.MapClass.class);
    conf.setCombinerClass(WordCount.Reduce.class);
    conf.setReducerClass(WordCount.Reduce.class);

    conf.setNumMapTasks(3);
    conf.setNumReduceTasks(1);
    configureSortBuffers(conf);

    FileSystem fs = FileSystem.get(conf);
    Path testDir = new Path(TEST_ROOT_DIR, "countertest");
    conf.set("test.build.data", testDir.toString());
    try {
      String inDir = createTestInput(fs, testDir);
      String outDir = testDir + File.separator;

      FileInputFormat.setInputPaths(conf, inDir);
      Path outputPath1 = new Path(outDir, "output5_2k_3");
      FileOutputFormat.setOutputPath(conf, outputPath1);

      RunningJob myJob = JobClient.runJob(conf);
      Counters c1 = myJob.getCounters();
      // 3maps & in each map, 4 first level spills --- So total 12.
      // spilled records count:
      // Each Map: 1st level:2k+2k+2k+2k=8k;2ndlevel=4k+4k=8k;
      //           3rd level=2k(4k from 1st level & 4k from 2nd level
      //           & combineAndSpill)
      //           So total 8k+8k+2k=18k
      // For 3 Maps, total = 3*18=54k
      // Reduce: each of the 3 map o/p's(2k each) will be spilled in
      //         shuffleToDisk()
      //         So 3*2k=6k in 1st level; 2nd level:4k(2k+2k);
      //         3rd level directly given to reduce(4k+2k ---
      //         combineAndSpill => 2k.
      //         So 0 records spilled to disk in 3rd level)
      //         So total of 6k+4k=10k
      // Total job counter will be 54k+10k = 64k

      //3 maps and 2.5k lines --- So total 7.5k map input records
      //3 maps and 10k words in each --- So total of 30k map output recs
      validateMapredCounters(c1, 64000, 7500, 30000);

      //create 4th input file with 5*2k words and test with 4 maps
      createWordsFile(new File(inDir + INPUT_FILE_PREFIX + 4));
      conf.setNumMapTasks(4);
      Path outputPath2 = new Path(outDir, "output5_2k_4");
      FileOutputFormat.setOutputPath(conf, outputPath2);

      myJob = JobClient.runJob(conf);
      c1 = myJob.getCounters();
      // 4maps & in each map 4 first level spills --- So total 16.
      // spilled records count:
      // Each Map: 1st level:2k+2k+2k+2k=8k;2ndlevel=4k+4k=8k;
      //           3rd level=2k(4k from 1st level & 4k from 2nd level
      //           & combineAndSpill)
      //           So total 8k+8k+2k=18k
      // For 4 Maps, total = 4*18=72k
      // Reduce: each of the 4 map o/p's(2k each) will be spilled in
      //         shuffleToDisk()
      //         So 4*2k=8k in 1st level; 2nd level:4k+4k=8k;
      //         3rd level directly given to reduce(4k+4k ---
      //         combineAndSpill => 2k.
      //         So 0 records spilled to disk in 3rd level)
      //         So total of 8k+8k=16k
      // Total job counter will be 72k+16k = 88k

      // 4 maps and 2.5k lines in each --- So 10k map input records
      // 4 maps and 10k words in each --- So 40k map output records
      validateMapredCounters(c1, 88000, 10000, 40000);

      // check for a map only job
      conf.setNumReduceTasks(0);
      Path outputPath3 = new Path(outDir, "output5_2k_5");
      FileOutputFormat.setOutputPath(conf, outputPath3);

      myJob = JobClient.runJob(conf);
      c1 = myJob.getCounters();
      // 4 maps and 2.5k lines in each --- So 10k map input records
      // 4 maps and 10k words in each --- So 40k map output records
      // the test expects no spilled records for the map only job
      validateMapredCounters(c1, 0, 10000, 40000);
    } finally {
      //clean up the input and output files
      if (fs.exists(testDir)) {
        fs.delete(testDir, true);
      }
    }
  }

  /**
   * New-API mapper that splits every input line into whitespace separated
   * tokens and emits each token with a count of one.
   */
  public static class NewMapTokenizer
      extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  /**
   * New-API reducer (also used as combiner) that sums the counts received
   * for a key and emits the key with that sum.
   */
  public static class NewIdentityReducer
      extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  /**
   * Runs the new-API word count job on 3 input files, then on 4 input
   * files, then as a map-only job, and verifies the spill and map record
   * counters of each run.
   *
   * @throws Exception if the test data cannot be set up or a job fails
   */
  public void testNewJobWithMapAndReducers() throws Exception {
    JobConf conf = new JobConf(TestJobCounters.class);
    configureSortBuffers(conf);

    FileSystem fs = FileSystem.get(conf);
    Path testDir = new Path(TEST_ROOT_DIR, "countertest2");
    conf.set("test.build.data", testDir.toString());
    try {
      String inDir = createTestInput(fs, testDir);
      String outDir = testDir + File.separator;

      FileInputFormat.setInputPaths(conf, inDir);
      Path outputPath1 = new Path(outDir, "output5_2k_3");
      FileOutputFormat.setOutputPath(conf, outputPath1);

      Job job = new Job(conf);
      job.setJobName("wordcount-map-reducers");

      // the keys are words (strings)
      job.setOutputKeyClass(Text.class);
      // the values are counts (ints)
      job.setOutputValueClass(IntWritable.class);

      job.setMapperClass(NewMapTokenizer.class);
      job.setCombinerClass(NewIdentityReducer.class);
      job.setReducerClass(NewIdentityReducer.class);

      job.setNumReduceTasks(1);
      // fail right here if the job failed, rather than on a confusing
      // counter mismatch further down
      assertTrue("Job with 3 input files failed",
                 job.waitForCompletion(false));

      org.apache.hadoop.mapreduce.Counters c1 = job.getCounters();
      // 3maps & in each map, 4 first level spills --- So total 12.
      // spilled records count:
      // Each Map: 1st level:2k+2k+2k+2k=8k;2ndlevel=4k+4k=8k;
      //           3rd level=2k(4k from 1st level & 4k from 2nd level
      //           & combineAndSpill)
      //           So total 8k+8k+2k=18k
      // For 3 Maps, total = 3*18=54k
      // Reduce: each of the 3 map o/p's(2k each) will be spilled in
      //         shuffleToDisk()
      //         So 3*2k=6k in 1st level; 2nd level:4k(2k+2k);
      //         3rd level directly given to reduce(4k+2k ---
      //         combineAndSpill => 2k.
      //         So 0 records spilled to disk in 3rd level)
      //         So total of 6k+4k=10k
      // Total job counter will be 54k+10k = 64k

      //3 maps and 2.5k lines --- So total 7.5k map input records
      //3 maps and 10k words in each --- So total of 30k map output recs
      validateCounters(c1, 64000, 7500, 30000);

      //create 4th input file with 5*2k words and test with 4 maps
      createWordsFile(new File(inDir + INPUT_FILE_PREFIX + 4));
      JobConf newJobConf = new JobConf(job.getConfiguration());

      Path outputPath2 = new Path(outDir, "output5_2k_4");
      FileOutputFormat.setOutputPath(newJobConf, outputPath2);

      Job newJob = new Job(newJobConf);
      assertTrue("Job with 4 input files failed",
                 newJob.waitForCompletion(false));
      c1 = newJob.getCounters();
      // 4maps & in each map 4 first level spills --- So total 16.
      // spilled records count:
      // Each Map: 1st level:2k+2k+2k+2k=8k;2ndlevel=4k+4k=8k;
      //           3rd level=2k(4k from 1st level & 4k from 2nd level
      //           & combineAndSpill)
      //           So total 8k+8k+2k=18k
      // For 4 Maps, total = 4*18=72k
      // Reduce: each of the 4 map o/p's(2k each) will be spilled in
      //         shuffleToDisk()
      //         So 4*2k=8k in 1st level; 2nd level:4k+4k=8k;
      //         3rd level directly given to reduce(4k+4k ---
      //         combineAndSpill => 2k.
      //         So 0 records spilled to disk in 3rd level)
      //         So total of 8k+8k=16k
      // Total job counter will be 72k+16k = 88k

      // 4 maps and 2.5k lines in each --- So 10k map input records
      // 4 maps and 10k words in each --- So 40k map output records
      validateCounters(c1, 88000, 10000, 40000);

      // check for a map only job
      JobConf newJobConf2 = new JobConf(newJob.getConfiguration());

      Path outputPath3 = new Path(outDir, "output5_2k_5");
      FileOutputFormat.setOutputPath(newJobConf2, outputPath3);

      Job newJob2 = new Job(newJobConf2);
      newJob2.setNumReduceTasks(0);
      assertTrue("Map only job failed", newJob2.waitForCompletion(false));
      c1 = newJob2.getCounters();
      // 4 maps and 2.5k lines in each --- So 10k map input records
      // 4 maps and 10k words in each --- So 40k map output records
      // the test expects no spilled records for the map only job
      validateCounters(c1, 0, 10000, 40000);
    } finally {
      //clean up the input and output files
      if (fs.exists(testDir)) {
        fs.delete(testDir, true);
      }
    }
  }

  /**
   * Increases the JVM's heap usage to the specified target value.
   */
  static class MemoryLoader {
    private static final int DEFAULT_UNIT_LOAD_SIZE = 10 * 1024 * 1024; // 10mb

    // the target value to reach
    private final long targetValue;
    // a list to hold the load objects
    private final List<String> loadObjects = new ArrayList<String>();

    MemoryLoader(long targetValue) {
      this.targetValue = targetValue;
    }

    /**
     * Loads the memory to the target value. Keeps allocating random strings
     * of {@code DEFAULT_UNIT_LOAD_SIZE} characters, pausing 100ms between
     * allocations, until {@link Runtime#totalMemory()} is no longer below
     * the target. Returns immediately if the target is already met. If the
     * thread is interrupted while pausing, the interrupt status is restored
     * and loading stops.
     */
    void load() {
      while (Runtime.getRuntime().totalMemory() < targetValue) {
        System.out.println("Loading memory with " + DEFAULT_UNIT_LOAD_SIZE
                           + " characters. Current usage : "
                           + Runtime.getRuntime().totalMemory());
        // load some objects in the memory
        loadObjects.add(RandomStringUtils.random(DEFAULT_UNIT_LOAD_SIZE));

        // sleep for 100ms
        try {
          Thread.sleep(100);
        } catch (InterruptedException ie) {
          // do not swallow the interrupt: hand it back to the caller and
          // stop loading instead of spinning on an interrupted thread
          Thread.currentThread().interrupt();
          return;
        }
      }
    }
  }

  /**
   * A mapper that increases the JVM's heap usage to a target value configured
   * via {@link MemoryLoaderMapper#TARGET_VALUE} using a {@link MemoryLoader}.
   */
  @SuppressWarnings({"deprecation", "unchecked"})
  static class MemoryLoaderMapper
      extends MapReduceBase
      implements org.apache.hadoop.mapred.Mapper<WritableComparable, Writable,
                                                 WritableComparable, Writable> {
    static final String TARGET_VALUE = "map.memory-loader.target-value";

    // static, so the loaded objects are referenced from the class rather
    // than from a single mapper instance
    private static MemoryLoader loader = null;

    public void map(WritableComparable key, Writable val,
                    OutputCollector<WritableComparable, Writable> output,
                    Reporter reporter)
        throws IOException {
      assertNotNull("Mapper not configured!", loader);

      // load the memory
      loader.load();

      // work as identity mapper
      output.collect(key, val);
    }

    @Override
    public void configure(JobConf conf) {
      // a missing setting yields -1, for which load() returns immediately
      loader = new MemoryLoader(conf.getLong(TARGET_VALUE, -1));
    }
  }

  /**
   * A reducer that increases the JVM's heap usage to a target value configured
   * via {@link MemoryLoaderReducer#TARGET_VALUE} using a {@link MemoryLoader}.
   */
  @SuppressWarnings({"deprecation", "unchecked"})
  static class MemoryLoaderReducer extends MapReduceBase
      implements org.apache.hadoop.mapred.Reducer<WritableComparable, Writable,
                                                  WritableComparable, Writable> {
    static final String TARGET_VALUE = "reduce.memory-loader.target-value";

    // static, so the loaded objects are referenced from the class rather
    // than from a single reducer instance
    private static MemoryLoader loader = null;

    public void reduce(WritableComparable key, Iterator<Writable> val,
                       OutputCollector<WritableComparable, Writable> output,
                       Reporter reporter)
        throws IOException {
      assertNotNull("Reducer not configured!", loader);

      // load the memory
      loader.load();

      // emit the key as both key and value; the incoming values are not
      // read (NOTE(review): presumably the output content is irrelevant to
      // the heap usage test -- confirm before changing)
      output.collect(key, key);
    }

    @Override
    public void configure(JobConf conf) {
      // a missing setting yields -1, for which load() returns immediately
      loader = new MemoryLoader(conf.getLong(TARGET_VALUE, -1));
    }
  }

  /**
   * Returns the COMMITTED_HEAP_BYTES counter reported by one task of a job.
   *
   * @param client the client used to fetch the task reports
   * @param id the job whose task reports are read
   * @param numReports the expected number of map (or reduce) task reports
   * @param taskId index of the wanted task within the task reports
   * @param isMap true to read the map task reports, false for reduce tasks
   * @return the value of the task's COMMITTED_HEAP_BYTES counter
   * @throws Exception if the task reports cannot be fetched
   */
  @SuppressWarnings("deprecation")
  private long getTaskCounterUsage (JobClient client, JobID id, int numReports,
                                    int taskId, boolean isMap)
      throws Exception {
    TaskReport[] reports = null;
    if (isMap) {
      reports = client.getMapTaskReports(id);
    } else {
      reports = client.getReduceTaskReports(id);
    }

    assertNotNull("No reports found for " + (isMap ? "map" : "reduce")
                  + " tasks in job " + id, reports);

    // make sure that the total number of reports match the expected
    assertEquals("Mismatch in number of task reports",
                 numReports, reports.length);

    // fail with a readable message instead of an index out of bounds error
    assertTrue("Task index " + taskId + " is out of range for "
               + reports.length + " task report(s)",
               taskId >= 0 && taskId < reports.length);

    Counters counters = reports[taskId].getCounters();

    return counters.getCounter(COMMITTED_HEAP_BYTES);
  }

  /**
   * Sets up the heap options, the memory loader targets and the output
   * directory, then runs a single-map, single-reduce job and asserts that it
   * succeeded.
   *
   * @param conf base configuration the job configuration is copied from
   * @param testRootDir directory below which the "out" folder is (re)created
   * @param heapOptions JVM options applied to the tasks
   * @param targetMapValue memory loader target for the map task
   * @param targetReduceValue memory loader target for the reduce task
   * @param fs file system used to delete an earlier output directory
   * @param client client used to submit the job
   * @param inDir input path of the job
   * @return the completed job
   * @throws IOException if submitting or waiting for the job fails
   */
  @SuppressWarnings("deprecation")
  private static RunningJob runHeapUsageTestJob(JobConf conf, Path testRootDir,
                                   String heapOptions, long targetMapValue,
                                   long targetReduceValue, FileSystem fs,
                                   JobClient client, Path inDir)
      throws IOException {
    // define a job
    JobConf jobConf = new JobConf(conf);

    // configure the jobs
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(MemoryLoaderMapper.class);
    jobConf.setReducerClass(MemoryLoaderReducer.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);
    // a single attempt per task, so a failing task is not retried
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, heapOptions);

    // set the targets
    jobConf.setLong(MemoryLoaderMapper.TARGET_VALUE, targetMapValue);
    jobConf.setLong(MemoryLoaderReducer.TARGET_VALUE, targetReduceValue);

    // set the input directory for the job
    FileInputFormat.setInputPaths(jobConf, inDir);

    // define job output folder
    Path outDir = new Path(testRootDir, "out");
    fs.delete(outDir, true);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    // run the job
    RunningJob job = client.submitJob(jobConf);
    job.waitForCompletion();
    JobID jobID = job.getID();
    assertTrue("Job " + jobID + " failed!", job.isSuccessful());

    return job;
  }

  /**
   * Tests the {@code COMMITTED_HEAP_BYTES} task counter.
   * The test consists of running a low-memory job which consumes less heap
   * memory and then running a high-memory job which consumes more heap memory,
   * and then ensuring that COMMITTED_HEAP_BYTES of low-memory job is smaller
   * than that of the high-memory job.
   *
   * @throws Exception if the mini cluster, the test data or a job fails
   */
  @SuppressWarnings("deprecation")
  public void testHeapUsageCounter() throws Exception {
    JobConf conf = new JobConf();
    // create a local filesystem handle
    FileSystem fileSystem = FileSystem.getLocal(conf);

    // define test root directories
    File rootDir =
      new File(System.getProperty("test.build.data", "/tmp"));
    File testRootDir = new File(rootDir, "testHeapUsageCounter");
    // cleanup the test root directory
    Path testRootDirPath = new Path(testRootDir.toString());
    fileSystem.delete(testRootDirPath, true);
    // set the current working directory
    fileSystem.setWorkingDirectory(testRootDirPath);

    fileSystem.deleteOnExit(testRootDirPath);

    // create a mini cluster using the local file system
    MiniMRCluster mrCluster =
      new MiniMRCluster(1, fileSystem.getUri().toString(), 1);

    try {
      conf = mrCluster.createJobConf();
      JobClient jobClient = new JobClient(conf);

      // define job input
      File file = new File(testRootDir, "in");
      Path inDir = new Path(file.toString());
      // create input data
      createWordsFile(file);

      // configure and run a low memory job which will run without loading the
      // jvm's heap
      RunningJob lowMemJob =
        runHeapUsageTestJob(conf, testRootDirPath, "-Xms32m -Xmx1G",
                            0, 0, fileSystem, jobClient, inDir);
      JobID lowMemJobID = lowMemJob.getID();
      long lowMemJobMapHeapUsage = getTaskCounterUsage(jobClient, lowMemJobID,
                                                       1, 0, true);
      System.out.println("Job1 (low memory job) map task heap usage: "
                         + lowMemJobMapHeapUsage);
      long lowMemJobReduceHeapUsage =
        getTaskCounterUsage(jobClient, lowMemJobID, 1, 0, false);
      System.out.println("Job1 (low memory job) reduce task heap usage: "
                         + lowMemJobReduceHeapUsage);

      // configure and run a high memory job which will load the jvm's heap
      // to 256mb above what the low memory job reported
      RunningJob highMemJob =
        runHeapUsageTestJob(conf, testRootDirPath, "-Xms32m -Xmx1G",
                            lowMemJobMapHeapUsage + 256*1024*1024,
                            lowMemJobReduceHeapUsage + 256*1024*1024,
                            fileSystem, jobClient, inDir);
      JobID highMemJobID = highMemJob.getID();

      long highMemJobMapHeapUsage =
        getTaskCounterUsage(jobClient, highMemJobID, 1, 0, true);
      System.out.println("Job2 (high memory job) map task heap usage: "
                         + highMemJobMapHeapUsage);
      long highMemJobReduceHeapUsage =
        getTaskCounterUsage(jobClient, highMemJobID, 1, 0, false);
      System.out.println("Job2 (high memory job) reduce task heap usage: "
                         + highMemJobReduceHeapUsage);

      assertTrue("Incorrect map heap usage reported by the map task",
                 lowMemJobMapHeapUsage < highMemJobMapHeapUsage);

      assertTrue("Incorrect reduce heap usage reported by the reduce task",
                 lowMemJobReduceHeapUsage < highMemJobReduceHeapUsage);
    } finally {
      // shutdown the mr cluster
      mrCluster.shutdown();
      try {
        fileSystem.delete(testRootDirPath, true);
      } catch (IOException ignored) {
        // best-effort cleanup: the directory is also registered with
        // deleteOnExit above, and a cleanup failure must not mask the
        // outcome of the test
      }
    }
  }
}