/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mgmt.upload;

import static fr.ens.biologie.genomique.eoulsan.EoulsanLogger.getLogger;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.google.common.collect.Lists;

import fr.ens.biologie.genomique.eoulsan.EoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.EoulsanRuntimeException;
import fr.ens.biologie.genomique.eoulsan.Globals;
import fr.ens.biologie.genomique.eoulsan.HadoopEoulsanRuntime;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormatConverter;
import fr.ens.biologie.genomique.eoulsan.util.StringUtils;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.PathUtils;

/**
 * This class allows copying and transforming data in a distributed manner.
 * @since 1.0
 * @author Laurent Jourdren
 */
public class DataFileDistCp {

  /* Default Charset. */
  private static final Charset CHARSET =
      Charset.forName(Globals.DEFAULT_FILE_ENCODING);

  private final Configuration conf;
  private final Path jobPath;

  /* Maximum duration of a copy: 2 hours, in milliseconds. */
  private static final long MAX_COPY_DURATION = 120 * 60 * 1000;

  /**
   * This inner class defines the mapper class for the DataFileDistCp
   * map-reduce job.
   * @author Laurent Jourdren
   */
  public static final class DistCpMapper
      extends Mapper<LongWritable, Text, Text, Text> {

    private static final String COUNTER_GROUP_NAME = "DataSourceDistCp";
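
    // Each line of the job input describes one copy task, in the form
    // "<source path>\t<destination path>". A hypothetical example line:
    //
    //   hdfs://namenode/in/s1.fastq.bz2 <TAB> hdfs://namenode/work/s1.fastq
    //
    // The mapper parses the pair, performs the copy/conversion itself and
    // reports the input and output file sizes through Hadoop counters.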

    /**
     * Internal class to store an exception if one occurs while copying.
     * @author Laurent Jourdren
     */
    private static final class MyIOExceptionWrapper {
      public IOException ioexception;
    }

    @Override
    protected void setup(final Context context)
        throws IOException, InterruptedException {

      // Initialize the Eoulsan runtime if it does not already exist
      if (!EoulsanRuntime.isRuntime()) {
        HadoopEoulsanRuntime.newEoulsanRuntime(context.getConfiguration());
      }
    }

    @Override
    protected void map(final LongWritable key, final Text value,
        final Context context) throws IOException, InterruptedException {

      final String val = value.toString();

      final int tabPos = val.indexOf('\t');

      // Ignore malformed lines without a source/destination separator
      if (tabPos == -1) {
        return;
      }

      final Configuration conf = context.getConfiguration();

      final String srcPathname = val.substring(0, tabPos);
      final Path srcPath = new Path(srcPathname);
      final Path destPath = new Path(val.substring(tabPos + 1));

      final FileSystem srcFs = srcPath.getFileSystem(conf);
      final FileSystem destFs = destPath.getFileSystem(conf);

      // Statistics about the source file
      final FileStatus fStatusSrc = srcFs.getFileStatus(srcPath);
      final long srcSize = fStatusSrc == null ? 0 : fStatusSrc.getLen();

      getLogger().info("Start copy "
          + srcPathname + " to " + destPath + " (" + srcSize + " bytes)\n");

      final long startTime = System.currentTimeMillis();

      final DataFile src = new DataFile(srcPathname);
      final DataFile dest = new DataFile(destPath.toString());

      // Copy the file
      copyFile(src, dest, context);

      // Compute copy statistics
      final long duration = System.currentTimeMillis() - startTime;
      final FileStatus fStatusDest = destFs.getFileStatus(destPath);
      final long destSize = fStatusDest == null ? 0 : fStatusDest.getLen();
      final double speed =
          destSize == 0 ? 0 : (double) destSize / (double) duration * 1000;

      getLogger().info("End copy "
          + srcPathname + " to " + destPath + " in "
          + StringUtils.toTimeHumanReadable(duration) + " (" + destSize
          + " bytes, " + ((int) speed) + " bytes/s)\n");

      context.getCounter(COUNTER_GROUP_NAME, "Input file size")
          .increment(srcSize);
      context.getCounter(COUNTER_GROUP_NAME, "Output file size")
          .increment(destSize);
    }
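
    // Design note: the actual copy runs in a separate thread (see
    // copyFile() below) while the mapper thread increments the "5_seconds"
    // counter every five seconds. Counter updates count as progress for
    // Hadoop, so a long copy is not killed by mapreduce.task.timeout;
    // MAX_COPY_DURATION enforces the real upper bound instead.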

    /**
     * Copy a file in a dedicated thread and inform Hadoop that the copy is
     * still alive using a counter.
     * @param src source
     * @param dest destination
     * @param context context object
     * @throws InterruptedException if another thread has interrupted the
     *           current thread
     * @throws IOException if an error occurs while copying data
     */
    private static void copyFile(final DataFile src, final DataFile dest,
        final Context context) throws InterruptedException, IOException {

      // Define a wrapper object to store an exception if needed
      final MyIOExceptionWrapper exp = new MyIOExceptionWrapper();

      // Create the copy thread
      final Thread t = new Thread(new Runnable() {

        @Override
        public void run() {
          try {
            new DataFormatConverter(src, dest).convert();
          } catch (IOException e) {
            exp.ioexception = e;
          }
        }
      });

      // Start the thread
      t.start();

      // Create the counter
      final Counter counter =
          context.getCounter(COUNTER_GROUP_NAME, "5_seconds");

      final long startTime = System.currentTimeMillis();

      // Sleep and increment the counter until the end of the copy
      while (t.isAlive()) {
        Thread.sleep(5000);
        counter.increment(1);

        final long duration = System.currentTimeMillis() - startTime;

        if (duration > MAX_COPY_DURATION) {
          throw new IOException("Copy timeout, the copy exceeded "
              + (MAX_COPY_DURATION / 1000) + " seconds.");
        }
      }

      // Rethrow the exception of the copy thread if needed
      if (exp.ioexception != null) {
        throw exp.ioexception;
      }
    }
  }

  /**
   * Copy files in a distributed manner using a map-reduce job.
   * @param entries a map of source/destination files
   * @throws IOException if an error occurs while copying
   */
  public void copy(final Map<DataFile, DataFile> entries) throws IOException {

    if (entries == null || entries.isEmpty()) {
      return;
    }

    final Configuration conf = this.conf;

    final Path tmpInputDir =
        PathUtils.createTempPath(this.jobPath, "distcp-in-", "", conf);
    final Path tmpOutputDir =
        PathUtils.createTempPath(this.jobPath, "distcp-out-", "", conf);

    //
    // Create entries for distcp
    //

    final FileSystem fs = tmpInputDir.getFileSystem(conf);
    fs.mkdirs(tmpInputDir);

    // Sort files by size
    final List<DataFile> inFiles = Lists.newArrayList(entries.keySet());
    sortInFilesByDescSize(inFiles);

    // Set the format for the id of the copy task
    final NumberFormat nf = NumberFormat.getInstance();
    nf.setMinimumIntegerDigits(Integer.toString(inFiles.size()).length());
    nf.setGroupingUsed(false);

    int count = 0;
    for (DataFile inFile : inFiles) {

      count++;
      final DataFile outFile = entries.get(inFile);

      final Path f =
          new Path(tmpInputDir, "distcp-" + nf.format(count) + ".cp");

      getLogger().info("Task copy " + inFile + " in " + f.toString());

      // Use try-with-resources so the writer is closed even on error
      try (BufferedWriter bw = new BufferedWriter(
          new OutputStreamWriter(fs.create(f), CHARSET))) {
        bw.write(inFile.getSource() + "\t" + outFile.getSource() + "\n");
      }
    }

    final Job job = createJobConf(conf, tmpInputDir, tmpOutputDir);

    try {
      job.waitForCompletion(false);
    } catch (InterruptedException | ClassNotFoundException e) {
      throw new EoulsanRuntimeException(
          "Error while distcp: " + e.getMessage(), e);
    }

    // Remove the temporary directories
    PathUtils.fullyDelete(tmpInputDir, conf);
    PathUtils.fullyDelete(tmpOutputDir, conf);

    if (!job.isSuccessful()) {
      throw new IOException("Unable to copy files using DataFileDistCp.");
    }
  }
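
  // Design note: each copy task is written to its own one-line ".cp" file.
  // TextInputFormat does not combine separate files into a single split, so
  // this layout yields one map task per file to copy; sorting by descending
  // size presumably lets the largest (slowest) copies start early.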

  /**
   * Sort a list of DataFile by descending file size.
   * @param inFiles list of DataFile to sort
   */
  private void sortInFilesByDescSize(final List<DataFile> inFiles) {

    Collections.sort(inFiles, new Comparator<DataFile>() {

      @Override
      public int compare(final DataFile f1, final DataFile f2) {

        long size1;

        try {
          size1 = f1.getMetaData().getContentLength();
        } catch (IOException e) {
          size1 = -1;
        }

        long size2;

        try {
          size2 = f2.getMetaData().getContentLength();
        } catch (IOException e) {
          size2 = -1;
        }

        // Reverse the natural order to sort by descending size
        return Long.compare(size2, size1);
      }
    });
  }

  /**
   * Create the map-reduce job used to copy the files.
   * @param parentConf parent configuration
   * @param cpEntriesPath input path that contains the copy entry files
   * @param outputPath output path of the job
   * @return a new Job object
   * @throws IOException if an error occurs while creating the job
   */
  private static Job createJobConf(final Configuration parentConf,
      final Path cpEntriesPath, final Path outputPath) throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    // Set the task timeout as a backstop for the copy duration
    jobConf.set("mapreduce.task.timeout", "" + MAX_COPY_DURATION);

    // Create the job and its name
    final Job job = Job.getInstance(jobConf, "DataFileDistcp");

    // Set the jar
    job.setJarByClass(DataFileDistCp.class);

    // Add the input path
    FileInputFormat.addInputPath(job, cpEntriesPath);

    // Set the input format
    job.setInputFormatClass(TextInputFormat.class);

    // Set the Mapper class
    job.setMapperClass(DistCpMapper.class);

    // No reducer class is set, the default identity reducer is used
    // job.setReducerClass(IdentityReducer.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the number of reducers
    job.setNumReduceTasks(1);

    // Set the output Path
    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
  }

  //
  // Constructor
  //

  /**
   * Public constructor.
   * @param conf Configuration object
   * @param jobPath the path where the job temporary files will be created
   */
  public DataFileDistCp(final Configuration conf, final Path jobPath) {

    if (conf == null) {
      throw new NullPointerException("The configuration is null");
    }

    if (jobPath == null) {
      throw new NullPointerException("The job Path is null");
    }

    this.conf = conf;
    this.jobPath = jobPath;
  }
}
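
// Typical usage, as a minimal sketch (paths are hypothetical):
//
//   Configuration conf = new Configuration();
//   Path jobPath = new Path("hdfs://namenode/user/eoulsan/tmp");
//
//   Map<DataFile, DataFile> entries = new HashMap<>();
//   entries.put(new DataFile("hdfs://namenode/in/reads.fastq.bz2"),
//       new DataFile("hdfs://namenode/work/reads.fastq"));
//
//   new DataFileDistCp(conf, jobPath).copy(entries);
//
// The copy is delegated to DataFormatConverter, so a call like this both
// copies and, when the source and destination formats differ, converts
// the file.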