/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.TestMapCollection.FakeIF;
import org.apache.hadoop.mapred.lib.NullOutputFormat;

import junit.framework.TestCase;

@SuppressWarnings("deprecation")
public class TestNewCollector extends TestCase {
  private static Log LOG = LogFactory.getLog(TestNewCollector.class);

  private MiniMRCluster mrCluster;

  protected void setUp() {
    JobConf conf = new JobConf();
    try {
      mrCluster = new MiniMRCluster(2, "file:///", 3, null, null, conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  protected void tearDown() {
    mrCluster.shutdown();
  }

  public static class BytesWritableFactory {
    private static Random random = new Random();

    public static BytesWritable getRandomBytesWritable(int size) {
      byte[] bytes = new byte[size];
      random.nextBytes(bytes);
      BytesWritable bytesWritable = new BytesWritable(bytes);
      return bytesWritable;
    }

    public static BytesWritable getRepeatedBytesWritable(
        byte[] bytes, int repeatNum) {
      int newLen = bytes.length * repeatNum;
      byte[] bb = new byte[newLen];
      for (int i = 0; i < repeatNum; i++) {
        System.arraycopy(bytes, 0, bb, bytes.length * i, bytes.length);
      }
      BytesWritable bytesWritable = new BytesWritable(bb);
      return bytesWritable;
    }
  }

  public static class TestNewCollectorKey extends BytesWritable {
    private int hashCode = -1;

    public TestNewCollectorKey(BytesWritable k) {
      super(k.getBytes());
    }

    public TestNewCollectorKey() {
      super();
    }

    public int hashCode() {
      if (hashCode < 0) {
        hashCode = super.hashCode();
      }
      return hashCode;
    }

    public void setHashCode(int hashCode) {
      this.hashCode = hashCode;
    }
  }

  public static class RecordNumStore {
    /*
     * conf to specify the total number of records each reducer is expected
     * to receive; a comma-separated list with one value per reducer.
     */
    private static String RECORD_NUM_CONF = "test.reducer.records.num";
    /*
     * conf to specify the number of big records to spill right after the
     * mapper starts; a comma-separated list with one value per reducer.
     */
    private static String BIG_RECORDS_BEGINNING =
        "test.reducer.bigrecords.start";
    /*
     * conf to specify the number of big records to spill in the middle of a
     * mapper; a comma-separated list with one value per reducer.
     */
    private static String BIG_RECORDS_MIDDLE = "test.reducer.bigrecords.middle";
    /*
     * conf to specify the number of big records to spill right before the
     * mapper finishes; a comma-separated list with one value per reducer.
     */
    private static String BIG_RECORDS_END = "test.reducer.bigrecords.end";
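
    /*
     * An illustrative example of the format (values not taken from the tests
     * below): with two reducers, setting test.reducer.bigrecords.start to
     * "20,5" asks each mapper to emit 20 big records destined for reducer 0
     * and 5 destined for reducer 1 at the start of its output.
     */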

    private JobConf currentJobConf;
    private List<Integer> reducerToReciveRecNum;
    private int[] mapperOutNumForEachReducer;

    private static RecordNumStore inst;
    private static Object instanceLock = new Object();

    private RecordNumStore(JobConf job) {
      this.currentJobConf = job;
      init(job);
    }

    public void init(JobConf job) {
      String recordNumStr = job.get(RECORD_NUM_CONF);
      int numMappers = job.getNumMapTasks();
      reducerToReciveRecNum = new ArrayList<Integer>(numMappers);
      if (recordNumStr != null) {
        String[] splits = recordNumStr.split(",");
        for (String num : splits) {
          if (num == null || num.trim().equals("")) {
            continue;
          }
          reducerToReciveRecNum.add(Integer.parseInt(num));
        }
      }
      for (int i = reducerToReciveRecNum.size(); i < numMappers; i++) {
        reducerToReciveRecNum.add(0);
      }
    }

    public static RecordNumStore getInst(JobConf job) {
      synchronized (instanceLock) {
        if (job != null
            && (inst == null || job != inst.getCurrentJobConf())) {
          inst = new RecordNumStore(job);
        }
        return inst;
      }
    }

    protected JobConf getCurrentJobConf() {
      return currentJobConf;
    }

    public synchronized int[] getMapperOutNumForEachReducer() {
      int numReducers = currentJobConf.getNumReduceTasks();
      int numMappers = currentJobConf.getNumMapTasks();
      if (mapperOutNumForEachReducer == null) {
        mapperOutNumForEachReducer = new int[numReducers];
      }
      List<Integer> reducerToReciveNum = this.reducerToReciveRecNum;
      for (int i = 0; i < numReducers; i++) {
        mapperOutNumForEachReducer[i] = reducerToReciveNum.get(i) / numMappers;
      }
      return mapperOutNumForEachReducer;
    }

    public synchronized int[] getBigRecodsStart() {
      String bigRecordNumStartStr = currentJobConf.get(BIG_RECORDS_BEGINNING);
      int[] bigRecordsStart = splitConfToIntArray(bigRecordNumStartStr);
      return bigRecordsStart;
    }

    public synchronized int[] getBigRecodsMiddle() {
      String bigRecordNumMiddleStr = currentJobConf.get(BIG_RECORDS_MIDDLE);
      int[] bigRecordsMiddle = splitConfToIntArray(bigRecordNumMiddleStr);
      return bigRecordsMiddle;
    }

    public synchronized int[] getBigRecodsEnd() {
      String bigRecordNumEndStr = currentJobConf.get(BIG_RECORDS_END);
      int[] bigRecordsEnd = splitConfToIntArray(bigRecordNumEndStr);
      return bigRecordsEnd;
    }

    private int[] splitConfToIntArray(String confStr) {
      String[] splits = confStr.split(",");
      int[] numArray = new int[splits.length];
      for (int i = 0; i < splits.length; i++) {
        String num = splits[i];
        if (num == null || num.trim().equals("")) {
          numArray[i] = 0;
        } else {
          numArray[i] = Integer.parseInt(num);
        }
      }
      return numArray;
    }

    public boolean checkReducerReceiveRecNum(int reducerNum) {
      return reducerToReciveRecNum.remove(Integer.valueOf(reducerNum));
    }

    /**
     * Each mapper emits the same number of records, and the
     * reducerRecPercents array decides how many of them go to each reducer.
     * Each reducer receives the same number of records from every mapper.
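     * <p>
     * For example (this matches testSmallScale_4 below): with
     * recordNumPerMapper = 10000, mappers = 2 and reducerRecPercents =
     * {0.5, 0.5}, each mapper sends 5000 records to each of the two
     * reducers, so RECORD_NUM_CONF is set to "10000,10000", the expected
     * total received by each reducer.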
     *
     * @param numReducers
     *          number of reducers to run
     * @param mappers
     *          number of mappers to run
     * @param recordNumPerMapper
     *          how many records each mapper outputs
     * @param reducerRecPercents
     *          for one mapper, how to allocate output records to reducers
     * @param numBigRecordsStart
     * @param numBigRecordsMiddle
     * @param numBigRecordsEnd
     * @param job
     */
    public static void setJobConf(int numReducers, int mappers,
        int recordNumPerMapper, double[] reducerRecPercents,
        int[] numBigRecordsStart, int[] numBigRecordsMiddle,
        int[] numBigRecordsEnd, JobConf job) {
      int[] recNumReducerOneMapper = new int[numReducers];
      double left = 1.0f;
      int preAllocated = 0;
      int leftToAllocate = recordNumPerMapper;
      if (numBigRecordsStart == null) {
        numBigRecordsStart = new int[numReducers];
        fillZero(numBigRecordsStart);
      }
      if (numBigRecordsMiddle == null) {
        numBigRecordsMiddle = new int[numReducers];
        fillZero(numBigRecordsMiddle);
      }
      if (numBigRecordsEnd == null) {
        numBigRecordsEnd = new int[numReducers];
        fillZero(numBigRecordsEnd);
      }
      if (reducerRecPercents != null) {
        if (reducerRecPercents.length > numReducers) {
          throw new IllegalArgumentException("percents array length is "
              + reducerRecPercents.length + " while numReducers is "
              + numReducers);
        }
        preAllocated = reducerRecPercents.length;
      }
      for (int i = 0; i < preAllocated; i++) {
        left -= reducerRecPercents[i];
        if (left < 0) {
          throw new IllegalArgumentException(
              "sum of percents array is bigger than 1.0");
        }
        recNumReducerOneMapper[i] =
            (int) (recordNumPerMapper * reducerRecPercents[i]);
        leftToAllocate -= recNumReducerOneMapper[i];
      }
      int toAllocateReducer = preAllocated;
      while (leftToAllocate > 0 && toAllocateReducer < numReducers) {
        recNumReducerOneMapper[toAllocateReducer] += 1;
        toAllocateReducer++;
        if (toAllocateReducer == numReducers) {
          toAllocateReducer = preAllocated;
        }
        leftToAllocate--;
      }
      for (int i = 0; i < recNumReducerOneMapper.length; i++) {
        recNumReducerOneMapper[i] = recNumReducerOneMapper[i] * mappers;
        int bigRecords = numBigRecordsStart[i] + numBigRecordsMiddle[i]
            + numBigRecordsEnd[i];
        if (bigRecords > recNumReducerOneMapper[i]) {
          throw new IllegalArgumentException(
              "big records number is bigger than total.");
        }
      }
      String recordNumConf = getStringConf(recNumReducerOneMapper);
      job.set(RECORD_NUM_CONF, recordNumConf);
      String bigRecordStartConf = getStringConf(numBigRecordsStart);
      job.set(BIG_RECORDS_BEGINNING, bigRecordStartConf);
      String bigRecordMiddleConf = getStringConf(numBigRecordsMiddle);
      job.set(BIG_RECORDS_MIDDLE, bigRecordMiddleConf);
      String bigRecordEndConf = getStringConf(numBigRecordsEnd);
      job.set(BIG_RECORDS_END, bigRecordEndConf);
      System.out.println("RECORD_NUM_CONF is " + recordNumConf);
      System.out.println("BIG_RECORDS_BEGINNING is " + bigRecordStartConf);
      System.out.println("BIG_RECORDS_MIDDLE is " + bigRecordMiddleConf);
      System.out.println("BIG_RECORDS_END is " + bigRecordEndConf);
    }

    private static String getStringConf(int[] numArray) {
      StringBuilder sb = new StringBuilder();
      boolean first = true;
      for (int num : numArray) {
        if (first) {
          first = false;
        } else {
          sb.append(",");
        }
        sb.append(num);
      }
      return sb.toString();
    }

    private static void fillZero(int[] numBigRecordsStart) {
      for (int i = 0; i < numBigRecordsStart.length; i++) {
        numBigRecordsStart[i] = 0;
      }
    }
  }

  public static String toString(int[] numArray) {
    StringBuilder sb = new StringBuilder();
    for (int num : numArray) {
      sb.append(num);
      sb.append(",");
    }
    return sb.toString();
  }
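
  /*
   * The mapper below reads the per-reducer record counts from RecordNumStore
   * and, for each reducer index, emits that reducer's "start" big records
   * first, then its normal records interleaved with the "middle" big
   * records, and finally the "end" big records. The reducer index is stamped
   * into the key's hash code so that the default hash partitioner routes
   * each record to that reducer.
   */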
  public static class TestNewCollectorMapper implements
      Mapper<NullWritable, NullWritable, BytesWritable, BytesWritable> {
    private int keylen = 1;
    private int vallen = 1;
    private int bigKeyLen = 10000;
    private int bigValLen = 10000;
    private int[] recNumForReducer;
    private int[] bigRecordsStart;
    private int[] normalKVNum;
    private int[] bigRecordsMiddle;
    private int[] bigRecordsEnd;

    public void configure(JobConf job) {
      recNumForReducer =
          RecordNumStore.getInst(job).getMapperOutNumForEachReducer();
      keylen = job.getInt("test.key.length", 1);
      vallen = job.getInt("test.value.length", 1);
      bigKeyLen = job.getInt("test.bigkey.length", 10000);
      bigValLen = job.getInt("test.bigvalue.length", 10000);
      bigRecordsStart = RecordNumStore.getInst(job).getBigRecodsStart();
      bigRecordsMiddle = RecordNumStore.getInst(job).getBigRecodsMiddle();
      bigRecordsEnd = RecordNumStore.getInst(job).getBigRecodsEnd();
      normalKVNum = new int[bigRecordsStart.length];
      for (int i = 0; i < normalKVNum.length; i++) {
        normalKVNum[i] = recNumForReducer[i]
            - (bigRecordsStart[i] + bigRecordsMiddle[i] + bigRecordsEnd[i]);
      }
    }

    public void close() {
    }

    @Override
    public void map(NullWritable key, NullWritable value,
        OutputCollector<BytesWritable, BytesWritable> output,
        Reporter reporter) throws IOException {
      boolean outputed = false;
      int i = -1;
      while (true) {
        reporter.progress();
        i++;
        if (i == recNumForReducer.length) {
          if (!outputed) {
            break;
          }
          i = 0;
          outputed = false;
        }
        if (recNumForReducer[i] == 0) {
          continue;
        }
        if (bigRecordsStart[i] > 0) {
          collectBigKV(output, i);
          bigRecordsStart[i]--;
          recNumForReducer[i]--;
        } else if (normalKVNum[i] > 0 || bigRecordsMiddle[i] > 0) {
          if (normalKVNum[i] > 0) {
            collectNormalKV(output, i);
            normalKVNum[i]--;
            recNumForReducer[i]--;
          }
          if (bigRecordsMiddle[i] > 0) {
            collectBigKV(output, i);
            bigRecordsMiddle[i]--;
            recNumForReducer[i]--;
          }
        } else if (bigRecordsEnd[i] > 0) {
          collectBigKV(output, i);
          bigRecordsEnd[i]--;
          recNumForReducer[i]--;
        } else {
          throw new RuntimeException("Unexpected situation.");
        }
        outputed = true;
      }
    }

    private void collectKV(
        OutputCollector<BytesWritable, BytesWritable> output, int reducerNo,
        int keyLen, int valueLen) throws IOException {
      BytesWritable k = BytesWritableFactory.getRandomBytesWritable(keyLen);
      BytesWritable val =
          BytesWritableFactory.getRandomBytesWritable(valueLen);
      TestNewCollectorKey collectorKey = new TestNewCollectorKey(k);
      collectorKey.setHashCode(reducerNo);
      output.collect(collectorKey, val);
    }

    private void collectBigKV(
        OutputCollector<BytesWritable, BytesWritable> output, int reduceNo)
        throws IOException {
      this.collectKV(output, reduceNo, bigKeyLen, bigValLen);
    }

    private void collectNormalKV(
        OutputCollector<BytesWritable, BytesWritable> output, int reducerNo)
        throws IOException {
      this.collectKV(output, reducerNo, keylen, vallen);
    }
  }
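
  /*
   * The reducer below checks the collector's output: keys must arrive in
   * strictly increasing order (verified with a BytesWritable comparator),
   * and close() verifies that the number of values received matches one of
   * the expected per-reducer totals registered in RecordNumStore.
   */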
  public static class TestNewCollectorReducer implements
      Reducer<BytesWritable, BytesWritable, NullWritable, NullWritable> {
    private int received = 0;
    private JobConf job;
    private BytesWritable lastKey = null;
    private RawComparator rawComparator;

    public void configure(JobConf job) {
      this.job = job;
      this.rawComparator = WritableComparator.get(BytesWritable.class);
    }

    public void close() {
      boolean found =
          RecordNumStore.getInst(job).checkReducerReceiveRecNum(received);
      System.out.println("received count is " + received + ", found is "
          + found);
      assertTrue("Unexpected record count (" + received + ")", found);
    }

    @Override
    @SuppressWarnings("unchecked")
    public void reduce(BytesWritable key, Iterator<BytesWritable> values,
        OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
        throws IOException {
      if (lastKey == null) {
        lastKey = new BytesWritable();
        lastKey.set(key.getBytes(), 0, key.getLength());
      } else {
        int ret = rawComparator.compare(lastKey, key);
        assertTrue("Incorrect comparison result given by mapreduce", ret < 0);
        lastKey.set(key.getBytes(), 0, key.getLength());
      }
      while (values.hasNext()) {
        values.next();
        ++received;
      }
    }

    private void printBytes(BytesWritable key) {
      byte[] bytes = key.getBytes();
      for (int i = 0; i < key.getLength(); i++) {
        System.out.printf("%02x", bytes[i]);
      }
      System.out.println();
    }
  }

  private void runTest(String name, int keyLen, int valLen,
      int recordsNumPerMapper, int sortMb, float spillPer,
      int numMapperTasks, int numReducerTask, double[] reducerRecPercents)
      throws Exception {
    this.runTest(name, keyLen, valLen, 0, 0, recordsNumPerMapper, sortMb,
        spillPer, numMapperTasks, numReducerTask, reducerRecPercents, null,
        null, null);
  }

  private void runTest(String name, int keyLen, int valLen, int bigKeyLen,
      int bigValLen, int recordsNumPerMapper, int sortMb, float spillPer,
      int numMapperTasks, int numReducerTask, double[] reducerRecPercents,
      int[] numBigRecordsStart, int[] numBigRecordsMiddle,
      int[] numBigRecordsEnd) throws Exception {
    JobConf conf = mrCluster.createJobConf();
    conf.setInt("io.sort.mb", sortMb);
    conf.set("io.sort.spill.percent", Float.toString(spillPer));
    conf.setInt("test.key.length", keyLen);
    conf.setInt("test.value.length", valLen);
    conf.setInt("test.bigkey.length", bigKeyLen);
    conf.setInt("test.bigvalue.length", bigValLen);
    conf.setNumMapTasks(numMapperTasks);
    conf.setNumReduceTasks(numReducerTask);
    conf.setInputFormat(FakeIF.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(TestNewCollectorMapper.class);
    conf.setReducerClass(TestNewCollectorReducer.class);
    conf.setMapOutputKeyClass(TestNewCollectorKey.class);
    conf.setMapOutputValueClass(BytesWritable.class);
    conf.setBoolean("mapred.map.output.blockcollector", true);
    RecordNumStore.setJobConf(numReducerTask, numMapperTasks,
        recordsNumPerMapper, reducerRecPercents, numBigRecordsStart,
        numBigRecordsMiddle, numBigRecordsEnd, conf);
    RecordNumStore.getInst(conf);
    LOG.info("Running " + name);
    JobClient.runJob(conf);
  }

  public void testNormalInMemory() throws Exception {
    runTest("testSmallScale_1", 1, 1, 1, 40, 0.5f, 1, 1,
        new double[] { 1.0f });
    // 200 bytes per record and 10000 records per mapper, so the serialized
    // data is about 2MB and should fit in memory.
    runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 1, 1,
        new double[] { 1.0f });
    runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 10, 1,
        new double[] { 1.0f });
    // run 2 mappers and 1 reducer, about 4MB of map output in total.
    runTest("testSmallScale_3", 100, 100, 10000, 4, 0.8f, 2, 1,
        new double[] { 1.0f });
    // run 2 mappers and 2 reducers, about 4MB of map output in total.
    runTest("testSmallScale_4", 100, 100, 10000, 4, 0.8f, 2, 2,
        new double[] { 0.5f, 0.5f });
  }

  // test cases that require spilling data to disk
  public void testSpill() throws Exception {
    // 600 bytes per record and 10K records per mapper, about 6MB of data per
    // mapper, so it will require spilling to disk.
    runTest("testSpill_1", 100, 500, 10000, 4, 0.8f, 1, 1,
        new double[] { 1.0f });
    runTest("testSpill_2", 100, 500, 10000, 4, 0.8f, 2, 1,
        new double[] { 1.0f });
    runTest("testSpill_3", 100, 500, 10000, 4, 0.8f, 2, 2,
        new double[] { 0.5f, 0.5f });
  }
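
  /*
   * Note on the spill tests: with roughly 6MB of serialized output per
   * mapper against an io.sort.mb of 4 (above) or 1 (below) and a spill
   * threshold of 0.8, the collector is expected to spill to disk one or
   * more times and merge the spill files before serving the reducers. The
   * exact number of spills depends on the collector's internal buffering
   * and per-record overhead.
   */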

  // test cases that require more rounds of spilling (smaller sort buffer)
  public void testSpillMore() throws Exception {
    // 600 bytes per record and 10K records per mapper, about 6MB of data per
    // mapper, so it will require spilling to disk.
    runTest("testSpillMore_1", 100, 500, 10000, 1, 0.8f, 1, 1,
        new double[] { 1.0f });
    runTest("testSpillMore_2", 100, 500, 10000, 1, 0.8f, 2, 1,
        new double[] { 1.0f });
    runTest("testSpillMore_3", 100, 500, 10000, 1, 0.8f, 2, 2,
        new double[] { 0.5f, 0.5f });
  }

  // test skew cases
  public void testSkew() throws Exception {
    // the first reducer gets 90% of the records
    runTest("testSpillSkew_1", 100, 500, 10000, 4, 0.8f, 1, 10,
        new double[] { 0.9f });
    // the first reducer gets 40% and the second gets 40%
    runTest("testSpillSkew_2", 100, 500, 10000, 4, 0.8f, 1, 10,
        new double[] { 0.4f, 0.4f });
    // the first reducer gets 60% and the second gets 30%
    runTest("testSpillSkew_3", 100, 500, 10000, 4, 0.8f, 2, 10,
        new double[] { 0.6f, 0.3f });
  }

  public void testBigRecords() throws Exception {
    // 600 bytes for each small key/value pair; each mapper also outputs 60
    // big records per reducer: 20 at the beginning, 20 in the middle, and
    // 20 at the end.
    runTest("testSpillBigRecords_1", 100, 500, 10000, 500000, 3000, 1, 0.8f,
        1, 1, new double[] { 1.0f }, new int[] { 20 }, new int[] { 20 },
        new int[] { 20 });
    runTest("testSpillBigRecords_2", 100, 500, 10000, 500000, 3000, 1, 0.8f,
        2, 1, new double[] { 1.0f }, new int[] { 20 }, new int[] { 20 },
        new int[] { 20 });
    runTest("testSpillBigRecords_3", 100, 500, 10000, 500000, 3000, 1, 0.8f,
        2, 2, new double[] { 0.5f, 0.5f }, new int[] { 20, 20 },
        new int[] { 20, 20 }, new int[] { 20, 20 });
  }
}