/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.DataOutputStream;
import java.io.IOException;

import junit.extensions.TestSetup;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.examples.RandomWriter;
import org.apache.hadoop.examples.Sort;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.FileSystemCounter;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter;
import org.apache.hadoop.util.ToolRunner;

/**
 * A JUnit test of the Map-Reduce framework's sort, run on a mini
 * Map-Reduce cluster backed by a mini HDFS cluster.
 */
public class TestMiniMRDFSSort extends TestCase {
  // Input/Output paths for sort
  private static final Path SORT_INPUT_PATH = new Path("/sort/input");
  private static final Path SORT_OUTPUT_PATH = new Path("/sort/output");

  // Knobs to control randomwriter, and hence sort
  private static final int NUM_HADOOP_SLAVES = 3;
  // make it big enough to cause a spill in the map
  private static final int RW_BYTES_PER_MAP = 3 * 1024 * 1024;
  private static final int RW_MAPS_PER_HOST = 2;

  private static MiniMRCluster mrCluster = null;
  private static MiniDFSCluster dfsCluster = null;
  private static FileSystem dfs = null;

  public static Test suite() {
    TestSetup setup = new TestSetup(new TestSuite(TestMiniMRDFSSort.class)) {
      protected void setUp() throws Exception {
        Configuration conf = new Configuration();
        dfsCluster = new MiniDFSCluster(conf, NUM_HADOOP_SLAVES, true, null);
        dfs = dfsCluster.getFileSystem();
        mrCluster = new MiniMRCluster(NUM_HADOOP_SLAVES,
                                      dfs.getUri().toString(), 1);
      }
      protected void tearDown() throws Exception {
        if (dfsCluster != null) { dfsCluster.shutdown(); }
        if (mrCluster != null) { mrCluster.shutdown(); }
      }
    };
    return setup;
  }

  public static void runRandomWriter(JobConf job, Path sortInput)
      throws Exception {
    // Scale down the default settings for RandomWriter for the test-case
    // Generates NUM_HADOOP_SLAVES * RW_MAPS_PER_HOST * RW_BYTES_PER_MAP
    job.setInt(RandomWriter.BYTES_PER_MAP, RW_BYTES_PER_MAP);
    job.setInt(RandomWriter.MAPS_PER_HOST, RW_MAPS_PER_HOST);
    String[] rwArgs = {sortInput.toString()};

    // Run RandomWriter
    assertEquals(0, ToolRunner.run(job, new RandomWriter(), rwArgs));
  }

  private static void runSort(JobConf job, Path sortInput, Path sortOutput)
      throws Exception {
    job.setInt(JobContext.JVM_NUMTASKS_TORUN, -1);
    job.setInt(JobContext.IO_SORT_MB, 1);
    job.setNumMapTasks(12);

    // Setup command-line arguments to 'sort'
    String[] sortArgs = {sortInput.toString(), sortOutput.toString()};

    // Run Sort
    Sort sort = new Sort();
    assertEquals(0, ToolRunner.run(job, sort, sortArgs));
    org.apache.hadoop.mapreduce.Counters counters =
        sort.getResult().getCounters();
    long mapInput = counters.findCounter(FileInputFormatCounter.BYTES_READ)
        .getValue();
    long hdfsRead = counters.findCounter("hdfs", FileSystemCounter.BYTES_READ)
        .getValue();
    // the hdfs read should be between 100% and 110% of the map input bytes
    assertTrue("map input = " + mapInput + ", hdfs read = " + hdfsRead,
               (hdfsRead < (mapInput * 1.1)) && (hdfsRead >= mapInput));
  }

  private static void runSortValidator(JobConf job,
                                       Path sortInput, Path sortOutput)
      throws Exception {
    String[] svArgs = {"-sortInput", sortInput.toString(),
                       "-sortOutput", sortOutput.toString()};

    // Run Sort-Validator
    assertEquals(0, ToolRunner.run(job, new SortValidator(), svArgs));
  }

  private static class ReuseDetector extends MapReduceBase
      implements Mapper<BytesWritable, BytesWritable, Text, Text> {
    static int instances = 0;
    Reporter reporter = null;

    @Override
    public void map(BytesWritable key, BytesWritable value,
                    OutputCollector<Text, Text> output,
                    Reporter reporter) throws IOException {
      this.reporter = reporter;
    }

    public void close() throws IOException {
      reporter.incrCounter("jvm", "use", ++instances);
    }
  }

  private static void runJvmReuseTest(JobConf job,
                                      boolean reuse) throws IOException {
    // Set up a map-only job that reads the input and only sets the counters
    // based on how many times the JVM was reused.
    job.setInt(JobContext.JVM_NUMTASKS_TORUN, reuse ? -1 : 1);
    FileInputFormat.setInputPaths(job, SORT_INPUT_PATH);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapperClass(ReuseDetector.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumMapTasks(24);
    job.setNumReduceTasks(0);
    RunningJob result = JobClient.runJob(job);
    long uses = result.getCounters().findCounter("jvm", "use").getValue();
    int maps = job.getNumMapTasks();
    if (reuse) {
      assertTrue("maps = " + maps + ", uses = " + uses, maps < uses);
    } else {
      assertEquals("uses should be number of maps",
                   job.getNumMapTasks(), uses);
    }
  }

  public void testMapReduceSort() throws Exception {
    // Run randomwriter to generate input for 'sort'
    runRandomWriter(mrCluster.createJobConf(), SORT_INPUT_PATH);

    // Run sort
    runSort(mrCluster.createJobConf(), SORT_INPUT_PATH, SORT_OUTPUT_PATH);

    // Run sort-validator to check if sort worked correctly
    runSortValidator(mrCluster.createJobConf(),
                     SORT_INPUT_PATH, SORT_OUTPUT_PATH);
  }

  public void testJvmReuse() throws Exception {
    runJvmReuseTest(mrCluster.createJobConf(), true);
  }

  public void testNoJvmReuse() throws Exception {
    runJvmReuseTest(mrCluster.createJobConf(), false);
  }

  private static class BadPartitioner
      implements Partitioner<LongWritable, Text> {
    boolean low;
    public void configure(JobConf conf) {
      low = conf.getBoolean("test.testmapred.badpartition", true);
    }
    public int getPartition(LongWritable k, Text v, int numPartitions) {
      return low ? -1 : numPartitions;
    }
  }

  public void testPartitioner() throws Exception {
    JobConf conf = mrCluster.createJobConf();
    conf.setPartitionerClass(BadPartitioner.class);
    conf.setNumReduceTasks(3);
    FileSystem fs = FileSystem.get(conf);
    Path testdir =
        new Path("blah").makeQualified(fs.getUri(), fs.getWorkingDirectory());
    Path inFile = new Path(testdir, "blah");
    DataOutputStream f = fs.create(inFile);
    f.writeBytes("blah blah blah\n");
    f.close();
    FileInputFormat.setInputPaths(conf, inFile);
    FileOutputFormat.setOutputPath(conf, new Path(testdir, "out"));
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMaxMapAttempts(1);

    // partition too low
    conf.setBoolean("test.testmapred.badpartition", true);
    boolean pass = true;
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      pass = false;
    }
    assertFalse("should fail for partition < 0", pass);

    // partition too high
    conf.setBoolean("test.testmapred.badpartition", false);
    pass = true;
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      pass = false;
    }
    assertFalse("should fail for partition >= numPartitions", pass);
  }
}
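
// A minimal driver sketch, not part of the original test: because this is a
// JUnit 3 class, the one-time MiniDFSCluster/MiniMRCluster setup and teardown
// in suite() only run when the tests are launched through the suite() factory
// (e.g. via the JUnit 3 text runner), not by instantiating the class directly:
//
//   public static void main(String[] args) {
//     junit.textui.TestRunner.run(TestMiniMRDFSSort.suite());
//   }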