/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools.mapred;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.tools.CopyListing;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.StubContext;
import org.apache.hadoop.security.Credentials;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class TestUniformSizeInputFormat {
  private static MiniDFSCluster cluster;
  private static final int N_FILES = 20;
  private static final int SIZEOF_EACH_FILE = 1024;
  private static final Random random = new Random();
  private static int totalFileSize = 0;

  private static final Credentials CREDENTIALS = new Credentials();

  @BeforeClass
  public static void setup() throws Exception {
    cluster = new MiniDFSCluster.Builder(new Configuration()).numDataNodes(1)
                                                             .format(true)
                                                             .build();
    totalFileSize = 0;

    for (int i = 0; i < N_FILES; ++i)
      totalFileSize += createFile("/tmp/source/" + String.valueOf(i),
                                  SIZEOF_EACH_FILE);
  }

  private static DistCpOptions getOptions(int nMaps) throws Exception {
    Path sourcePath = new Path(cluster.getFileSystem().getUri().toString()
                               + "/tmp/source");
    Path targetPath = new Path(cluster.getFileSystem().getUri().toString()
                               + "/tmp/target");

    List<Path> sourceList = new ArrayList<Path>();
    sourceList.add(sourcePath);
    final DistCpOptions distCpOptions = new DistCpOptions(sourceList, targetPath);
    distCpOptions.setMaxMaps(nMaps);
    return distCpOptions;
  }

  private static int createFile(String path, int fileSize) throws Exception {
    FileSystem fileSystem = null;
    DataOutputStream outputStream = null;
    try {
      fileSystem = cluster.getFileSystem();
      outputStream = fileSystem.create(new Path(path), true, 0);
      // Pad each file by a random amount (up to fileSize extra bytes), so the
      // files differ in size and the split computation is non-trivial.
      int size = (int) Math.ceil(fileSize + (1 - random.nextFloat()) * fileSize);
      outputStream.write(new byte[size]);
      return size;
    } finally {
      IOUtils.cleanup(null, fileSystem, outputStream);
    }
  }

  @AfterClass
  public static void tearDown() {
    cluster.shutdown();
  }

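  /**
   * Builds a copy listing for the source files, asks UniformSizeInputFormat
   * for nMaps splits, and verifies that (a) the splits cover the listing file
   * contiguously with nothing left over, (b) the number of bytes assigned to
   * consecutive splits differs by less than 10% of the ideal size-per-map
   * (the final split may absorb any remainder), and (c) the split sizes add
   * up to the total size of the source files.
   */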
  public void testGetSplits(int nMaps) throws Exception {
    DistCpOptions options = getOptions(nMaps);
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks",
                      String.valueOf(options.getMaxMaps()));
    Path listFile = new Path(cluster.getFileSystem().getUri().toString()
                             + "/tmp/testGetSplits_1/fileList.seq");
    CopyListing.getCopyListing(configuration, CREDENTIALS, options)
        .buildListing(listFile, options);

    JobContext jobContext = new JobContextImpl(configuration, new JobID());
    UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
    List<InputSplit> splits = uniformSizeInputFormat.getSplits(jobContext);

    int sizePerMap = totalFileSize / nMaps;

    checkSplits(listFile, splits);

    int doubleCheckedTotalSize = 0;
    int previousSplitSize = -1;
    for (int i = 0; i < splits.size(); ++i) {
      InputSplit split = splits.get(i);
      int currentSplitSize = 0;
      RecordReader<Text, FileStatus> recordReader =
          uniformSizeInputFormat.createRecordReader(split, null);
      StubContext stubContext = new StubContext(jobContext.getConfiguration(),
                                                recordReader, 0);
      final TaskAttemptContext taskAttemptContext = stubContext.getContext();
      recordReader.initialize(split, taskAttemptContext);
      while (recordReader.nextKeyValue()) {
        Path sourcePath = recordReader.getCurrentValue().getPath();
        FileSystem fs = sourcePath.getFileSystem(configuration);
        FileStatus[] fileStatus = fs.listStatus(sourcePath);
        Assert.assertEquals(fileStatus.length, 1);
        currentSplitSize += fileStatus[0].getLen();
      }
      Assert.assertTrue(previousSplitSize == -1
          || Math.abs(currentSplitSize - previousSplitSize) < 0.1 * sizePerMap
          || i == splits.size() - 1);

      doubleCheckedTotalSize += currentSplitSize;
    }

    Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
  }

  private void checkSplits(Path listFile, List<InputSplit> splits)
      throws IOException {
    long lastEnd = 0;

    // Verify that each split's start matches the previous split's end,
    // so that no part of the listing file is skipped.
    for (InputSplit split : splits) {
      FileSplit fileSplit = (FileSplit) split;
      long start = fileSplit.getStart();
      Assert.assertEquals(lastEnd, start);
      lastEnd = start + fileSplit.getLength();
    }

    // Verify there is nothing more to read from the input file.
    SequenceFile.Reader reader =
        new SequenceFile.Reader(cluster.getFileSystem().getConf(),
                                SequenceFile.Reader.file(listFile));
    try {
      reader.seek(lastEnd);
      FileStatus srcFileStatus = new FileStatus();
      Text srcRelPath = new Text();
      Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
    } finally {
      IOUtils.closeStream(reader);
    }
  }

  @Test
  public void testGetSplits() throws Exception {
    testGetSplits(9);
    for (int i = 1; i < N_FILES; ++i)
      testGetSplits(i);
  }
}