/*
 *                  Eoulsan development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public License version 2.1 or
 * later and CeCILL-C. This should be distributed with the code.
 * If you do not have a copy, see:
 *
 *      http://www.gnu.org/licenses/lgpl-2.1.txt
 *      http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.txt
 *
 * Copyright for this code is held jointly by the Genomic platform
 * of the Institut de Biologie de l'École normale supérieure and
 * the individual authors. These should be listed in @author doc
 * comments.
 *
 * For more information on the Eoulsan project and its aims,
 * or to join the Eoulsan Google group, visit the home page
 * at:
 *
 *      http://outils.genomique.biologie.ens.fr/eoulsan
 *
 */

package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;

import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_FASTQ;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.READS_TFQ;
import static fr.ens.biologie.genomique.eoulsan.util.StringUtils.doubleQuotes;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.Settings;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.bio.FastqFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.FastqInputFormat;
import fr.ens.biologie.genomique.eoulsan.bio.io.hadoop.SAMOutputFormat;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder;
import fr.ens.biologie.genomique.eoulsan.core.Modules;
import fr.ens.biologie.genomique.eoulsan.core.Parameter;
import fr.ens.biologie.genomique.eoulsan.core.StepConfigurationContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.data.DataFormat;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractReadsMapperModule;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;

/**
 * This class defines a mapper module in Hadoop mode.
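 * Single-end data is mapped directly from the FASTQ input, while paired-end
 * data is first converted to a single TFQ file before mapping. The mapping
 * itself runs as a map-only MapReduce job that writes its results in SAM
 * format.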
 * @since 1.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class ReadsMapperHadoopModule extends AbstractReadsMapperModule {

  @Override
  public InputPorts getInputPorts() {

    final InputPortsBuilder builder = new InputPortsBuilder();

    builder.addPort(READS_PORT_NAME, READS_FASTQ, true);
    builder.addPort(MAPPER_INDEX_PORT_NAME, getMapper().getArchiveFormat(),
        true);

    return builder.create();
  }

  @Override
  public void configure(final StepConfigurationContext context,
      final Set<Parameter> stepParameters) throws EoulsanException {

    super.configure(context, stepParameters);

    // Check if the mapper can be used with Hadoop
    if (!getMapper().isSplitsAllowed()) {
      Modules.invalidConfiguration(context,
          "The selected mapper cannot be used in Hadoop mode as "
              + "computation cannot be parallelized: "
              + getMapper().getMapperName());
    }

    // Check if the user wants to use non-bundled mapper binaries
    if (!isUseBundledBinaries()) {
      Modules.invalidConfiguration(context,
          "Non bundled mapper binaries cannot be used in Hadoop mode");
    }

    // Check if the user wants to use a mapper Docker image
    if (!getMapperDockerImage().isEmpty()) {
      Modules.invalidConfiguration(context,
          "Cannot use a mapper Docker image in Hadoop mode");
    }
  }

  @Override
  public TaskResult execute(final TaskContext context,
      final TaskStatus status) {

    // Create configuration object
    final Configuration conf = createConfiguration();

    try {

      // Get input and output data
      final Data readsData = context.getInputData(READS_FASTQ);
      final String dataName = readsData.getName();

      final DataFile mapperIndexFile =
          context.getInputData(getMapper().getArchiveFormat()).getDataFile();

      final DataFile outFile =
          context.getOutputData(MAPPER_RESULTS_SAM, readsData).getDataFile();

      DataFile tfqFile = null;

      // Get FASTQ format
      final FastqFormat fastqFormat = readsData.getMetadata().getFastqFormat();

      // Create the job to run
      final Job job;

      // Pre-process paired-end files
      if (readsData.getDataFileCount() == 1) {
        job = createJobConf(conf, context, dataName, readsData.getDataFile(0),
            false, READS_FASTQ, fastqFormat, mapperIndexFile, outFile);
      } else {

        final DataFile inFile1 = readsData.getDataFile(0);
        final DataFile inFile2 = readsData.getDataFile(1);

        tfqFile = new DataFile(inFile1.getParent(),
            inFile1.getBasename() + READS_TFQ.getDefaultExtension());

        // Convert FASTQ files to TFQ
        MapReduceUtils.submitAndWaitForJob(
            PairedEndFastqToTfq.convert(conf, inFile1, inFile2, tfqFile,
                getReducerTaskCount()),
            readsData.getName(), CommonHadoop.CHECK_COMPLETION_TIME, status,
            COUNTER_GROUP);

        job = createJobConf(conf, context, dataName, tfqFile, true, READS_TFQ,
            fastqFormat, mapperIndexFile, outFile);
      }

      // Launch the mapping job
      MapReduceUtils.submitAndWaitForJob(job, readsData.getName(),
          CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);

      // Clean up the temporary TFQ file created for paired-end data
      if (tfqFile != null) {
        final FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(tfqFile.getSource()), true);
      }

      return status.createTaskResult();

    } catch (IOException | EoulsanException e) {

      return status.createTaskResult(e,
          "Error while running job: " + e.getMessage());
    }
  }

  /**
   * Create the JobConf object for a sample.
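   * The returned job is a map-only job: reads are taken from a FASTQ or TFQ
   * input, mapped by the ReadsMapperMapper class and written as SAM records,
   * while the mapper index archive is distributed to the nodes through the
   * distributed cache.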
   * @param parentConf Hadoop configuration
   * @param context Eoulsan task context
   * @param dataName data name
   * @param readsFile reads file
   * @param pairedEnd true if the data is paired-end
   * @param inputFormat format of the input data (FASTQ or TFQ)
   * @param fastqFormat FASTQ format
   * @param mapperIndexFile mapper index file
   * @param outFile output file
   * @return a new JobConf object
   * @throws IOException if an error occurs while creating the job
   */
  private Job createJobConf(final Configuration parentConf,
      final TaskContext context, final String dataName,
      final DataFile readsFile, final boolean pairedEnd,
      final DataFormat inputFormat, final FastqFormat fastqFormat,
      final DataFile mapperIndexFile, final DataFile outFile)
      throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    final Path inputPath = new Path(readsFile.getSource());

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set paired-end or single-end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper
    if (getMapperLocalThreads() > 0) {
      jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY,
          "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
      jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY,
          doubleQuotes(getMapperArguments()));
    }

    // Set mapper FASTQ format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY,
        "" + computeZipCheckSum(mapperIndexFile, parentConf));

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

    // Set task timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // No JVM task reuse
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb",
        "" + getMapperHadoopMemoryRequired());

    // Set the memory required by the JVM (BWA needs more memory than the
    // other mappers for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
        "Mapping reads in " + fastqFormat + " with " + getMapperName() + " ("
            + dataName + ", " + readsFile.getName() + ")");

    // Set the genome index reference path in the distributed cache
    final Path genomeIndex = new Path(mapperIndexFile.getSource());
    job.addCacheFile(genomeIndex.toUri());

    // Set the jar
    job.setJarByClass(ReadsMapperHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Set the input format
    if (inputFormat == READS_FASTQ) {
      job.setInputFormatClass(FastqInputFormat.class);
    } else {
      job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the Mapper class
    job.setMapperClass(ReadsMapperMapper.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the number of reducers
    job.setNumReduceTasks(0);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
  }

  /**
   * Configure the ZooKeeper client.
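   * If no ZooKeeper connect string is defined in the Eoulsan settings, the
   * hostname of the YARN resource manager and the default ZooKeeper port are
   * used instead.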
   * @param jobConf job configuration
   * @param context Eoulsan context
   */
  static void setZooKeeperJobConfiguration(final Configuration jobConf,
      final TaskContext context) {

    final Settings settings = context.getSettings();

    String connectString = settings.getZooKeeperConnectString();

    // Fall back to the YARN resource manager host and the default ZooKeeper
    // port if no connect string is defined in the settings
    if (connectString == null) {

      connectString =
          jobConf.get("yarn.resourcemanager.hostname").split(":")[0] + ":"
              + settings.getZooKeeperDefaultPort();
    }

    jobConf.set(ReadsMapperMapper.ZOOKEEPER_CONNECT_STRING_KEY, connectString);
    jobConf.set(ReadsMapperMapper.ZOOKEEPER_SESSION_TIMEOUT_KEY,
        "" + settings.getZooKeeperSessionTimeout());
  }

  /**
   * Compute the checksum of a ZIP file or use the HDFS checksum if available.
   * @param file the ZIP input file
   * @param conf the Hadoop configuration
   * @return the checksum as a string
   * @throws IOException if an error occurs while creating the checksum
   */
  static String computeZipCheckSum(final DataFile file,
      final Configuration conf) throws IOException {

    final Path path = new Path(file.getSource());

    final FileSystem fs = FileSystem.get(path.toUri(), conf);
    final FileChecksum checksum = fs.getFileChecksum(path);

    // If it exists, use the checksum provided by the file system
    if (checksum != null) {
      return new BigInteger(1, checksum.getBytes()).toString(16);
    }

    // Fallback solution: compute the checksum from the ZIP entries
    return computeZipCheckSum(file.open());
  }

  /**
   * Compute the checksum of a ZIP file.
   * @param in input stream
   * @return the checksum as a string
   * @throws IOException if an error occurs while creating the checksum
   */
  private static String computeZipCheckSum(final InputStream in)
      throws IOException {

    final ZipArchiveInputStream zais = new ZipArchiveInputStream(in);

    // Create hash function
    final Hasher hs = Hashing.md5().newHasher();

    // Store the size and CRC of each entry in a map
    final Map<String, long[]> map = new HashMap<>();

    ZipArchiveEntry e;

    while ((e = zais.getNextZipEntry()) != null) {
      map.put(e.getName(), new long[] {e.getSize(), e.getCrc()});
    }

    zais.close();

    // Add values to the hash function in an ordered manner so the result does
    // not depend on the order of the entries in the archive
    for (String filename : new TreeSet<>(map.keySet())) {

      hs.putString(filename, StandardCharsets.UTF_8);
      for (long l : map.get(filename)) {
        hs.putLong(l);
      }
    }

    return hs.hash().toString();
  }

}