UniformSizeInputFormat.java example

Explorer
HDP-2.2-Patched-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools.mapred;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.conf.Configuration;

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;

/**
 * UniformSizeInputFormat extends the InputFormat<> class, to produce
 * input-splits for DistCp.
 * It looks at the copy-listing and groups the contents into input-splits such
 * that the total-number of bytes to be copied for each input split is
 * uniform.
 */
public class UniformSizeInputFormat extends InputFormat<Text, FileStatus> {
  private static final Log LOG
                = LogFactory.getLog(UniformSizeInputFormat.class);

  /**
   * Implementation of InputFormat::getSplits(). Returns a list of InputSplits,
   * such that the number of bytes to be copied for all the splits are
   * approximately equal.
   * @param context JobContext for the job.
   * @return The list of uniformly-distributed input-splits.
   * @throws IOException: On failure.
   * @throws InterruptedException
   */
  @Override
  public List<InputSplit> getSplits(JobContext context)
                      throws IOException, InterruptedException {
    Configuration configuration = context.getConfiguration();
    int numSplits = DistCpUtils.getInt(configuration,
                                       JobContext.NUM_MAPS);

    if (numSplits == 0) return new ArrayList<InputSplit>();

    return getSplits(configuration, numSplits,
                     DistCpUtils.getLong(configuration,
                          DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED));
  }

  private List<InputSplit> getSplits(Configuration configuration, int numSplits,
                                     long totalSizeBytes) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);

    FileStatus srcFileStatus = new FileStatus();
    Text srcRelPath = new Text();
    long currentSplitSize = 0;
    long lastSplitStart = 0;
    long lastPosition = 0;

    final Path listingFilePath = getListingFilePath(configuration);

    if (LOG.isDebugEnabled()) {
      LOG.debug("Average bytes per map: " + nBytesPerSplit +
          ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
    }
    SequenceFile.Reader reader=null;
    try {
      reader = getListingFileReader(configuration);
      while (reader.next(srcRelPath, srcFileStatus)) {
        // If adding the current file would cause the bytes per map to exceed
        // limit. Add the current file to new split
        if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
          FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
              lastPosition - lastSplitStart, null);
          if (LOG.isDebugEnabled()) {
            LOG.debug ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
          }
          splits.add(split);
          lastSplitStart = lastPosition;
          currentSplitSize = 0;
        }
        currentSplitSize += srcFileStatus.getLen();
        lastPosition = reader.getPosition();
      }
      if (lastPosition > lastSplitStart) {
        FileSplit split = new FileSplit(listingFilePath, lastSplitStart,
            lastPosition - lastSplitStart, null);
        if (LOG.isDebugEnabled()) {
          LOG.info ("Creating split : " + split + ", bytes in split: " + currentSplitSize);
        }
        splits.add(split);
      }

    } finally {
      IOUtils.closeStream(reader);
    }

    return splits;
  }

  private static Path getListingFilePath(Configuration configuration) {
    final String listingFilePathString =
            configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, "");

    assert !listingFilePathString.equals("")
              : "Couldn't find listing file. Invalid input.";
    return new Path(listingFilePathString);
  }

  private SequenceFile.Reader getListingFileReader(Configuration configuration) {

    final Path listingFilePath = getListingFilePath(configuration);
    try {
      final FileSystem fileSystem = listingFilePath.getFileSystem(configuration);
      if (!fileSystem.exists(listingFilePath))
        throw new IllegalArgumentException("Listing file doesn't exist at: "
                                           + listingFilePath);

      return new SequenceFile.Reader(configuration,
                                     SequenceFile.Reader.file(listingFilePath));
    }
    catch (IOException exception) {
      LOG.error("Couldn't find listing file at: " + listingFilePath, exception);
      throw new IllegalArgumentException("Couldn't find listing-file at: "
                                         + listingFilePath, exception);
    }
  }

  /**
   * Implementation of InputFormat::createRecordReader().
   * @param split The split for which the RecordReader is sought.
   * @param context The context of the current task-attempt.
   * @return A SequenceFileRecordReader instance, (since the copy-listing is a
   * simple sequence-file.)
   * @throws IOException
   * @throws InterruptedException
   */
  @Override
  public RecordReader<Text, FileStatus> createRecordReader(InputSplit split,
                                                     TaskAttemptContext context)
                                      throws IOException, InterruptedException {
    return new SequenceFileRecordReader<Text, FileStatus>();
  }
}