package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;

import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_BAM;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_INDEX_BAI;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

import org.seqdoop.hadoop_bam.AnySAMInputFormat;
import org.seqdoop.hadoop_bam.AnySAMOutputFormat;
import org.seqdoop.hadoop_bam.BAMRecordReader;
import org.seqdoop.hadoop_bam.KeyIgnoringAnySAMOutputFormat;
import org.seqdoop.hadoop_bam.SAMFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;
import org.seqdoop.hadoop_bam.cli.CLIMergingAnySAMOutputFormat;
import org.seqdoop.hadoop_bam.cli.Utils;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;

import hbparquet.hadoop.util.ContextUtil;

import htsjdk.samtools.BAMIndexer;
import htsjdk.samtools.SAMFileHeader.SortOrder;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;

import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractSAM2BAMModule;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.SortRecordReader.IndexerMapper;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.HadoopJobEmergencyStopTask;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;

/**
 * This class defines a module for converting SAM files into BAM files.
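 * The conversion is performed by two Hadoop jobs: a MapReduce job built on
 * Hadoop-BAM that coordinate-sorts the SAM records and writes BAM parts into
 * a temporary work directory (the parts are then merged into the final BAM
 * file with HadoopBamUtils.mergeSAMInto()), followed by a single-task job
 * that creates the BAI index of the merged file using the htsjdk BAMIndexer.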
 * @since 2.0
 * @author Laurent Jourdren
 */
@HadoopOnly
public class SAM2BAMHadoopModule extends AbstractSAM2BAMModule {

  //
  // Module methods
  //

  @Override
  public InputPorts getInputPorts() {
    return allPortsRequiredInWorkingDirectory(super.getInputPorts());
  }

  @Override
  public TaskResult execute(final TaskContext context, final TaskStatus status) {

    // Create configuration object
    final Configuration conf = createConfiguration();

    // Get input and output data
    final Data samData = context.getInputData(MAPPER_RESULTS_SAM);
    final Data bamData = context.getOutputData(MAPPER_RESULTS_BAM, samData);
    final Data indexData =
        context.getOutputData(MAPPER_RESULTS_INDEX_BAI, samData);

    // Get input and output files
    final DataFile samFile = samData.getDataFile();
    final DataFile bamFile = bamData.getDataFile();
    final DataFile indexFile = indexData.getDataFile();

    final Path bamPath = new Path(bamFile.toUri());
    final Path workPath =
        new Path(bamPath.getParent(), bamPath.getName() + ".tmp");

    final Job job;

    try {

      // Create the job to run
      job = createJobConf(conf, context, samData.getName(), samFile, bamFile,
          workPath);

      // Submit the main job
      MapReduceUtils.submitAndWaitForJob(job, samData.getName(),
          CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);

    } catch (IOException | ClassNotFoundException | InterruptedException
        | EoulsanException e) {
      return status.createTaskResult(e);
    }

    // Merge the BAM parts written in the work directory into the final BAM file
    try {
      HadoopBamUtils.mergeSAMInto(bamPath, workPath, "", "", SAMFormat.BAM,
          job.getConfiguration(), "sort");
    } catch (IOException e) {
      return status.createTaskResult(e);
    }

    // Create the indexing Hadoop job
    try {

      // Create the indexer submit file
      final DataFile indexerSubmitFile = createSubmitFile(bamFile, indexFile);

      // Create the indexer job
      final Job indexingJob = createIndexJob(conf, indexerSubmitFile,
          "Create " + indexFile + " index file");

      // Submit the Hadoop job
      indexingJob.submit();

      // Add the Hadoop job to the list of jobs to kill if the workflow fails
      HadoopJobEmergencyStopTask.addHadoopJobEmergencyStopTask(indexingJob);

      // Wait for the end of the job in non-verbose mode
      indexingJob.waitForCompletion(false);

      // Remove the Hadoop job from the list of jobs to kill if the workflow fails
      HadoopJobEmergencyStopTask.removeHadoopJobEmergencyStopTask(indexingJob);

      if (!indexingJob.isSuccessful()) {
        throw new IOException("Error while running Hadoop job for creating "
            + indexFile + " index file");
      }

      // Delete the indexer submit file
      indexerSubmitFile.delete();

    } catch (IOException | ClassNotFoundException | InterruptedException e) {
      return status.createTaskResult(e);
    }

    return status.createTaskResult();
  }
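  //
  // The sort/conversion job below uses an identity Mapper that forwards the
  // LongWritable alignment keys produced by the Hadoop-BAM input format, a
  // TotalOrderPartitioner (fed by InputSampler.writePartitionFile()) that
  // splits the key space so the reducer outputs are globally sorted, and
  // CLIMergingAnySAMOutputFormat, which writes headerless BAM parts into the
  // work directory that are merged afterwards by HadoopBamUtils.mergeSAMInto().
  //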
  /**
   * Create the sam2bam job.
   * @param conf Hadoop configuration
   * @param context step context
   * @param sampleName sample name
   * @param samFile SAM file
   * @param bamFile BAM file
   * @param workPath work path
   * @return a Hadoop Job instance
   * @throws IOException if an error occurs while creating the job
   * @throws ClassNotFoundException if an error occurs while creating the job
   * @throws InterruptedException if an error occurs while creating the job
   */
  private Job createJobConf(final Configuration conf, final TaskContext context,
      final String sampleName, final DataFile samFile, final DataFile bamFile,
      final Path workPath)
      throws IOException, ClassNotFoundException, InterruptedException {

    final ValidationStringency stringency =
        ValidationStringency.DEFAULT_STRINGENCY;

    Path input = new Path(samFile.toUri());
    Path output = new Path(bamFile.toUri());

    context.getLogger().info("Input SAM path: " + input);
    context.getLogger().info("Output BAM path: " + output);
    context.getLogger().info("Working path: " + workPath);

    Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate);

    if (stringency != null) {
      conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY,
          stringency.toString());
    }

    // Used by Utils.getMergeableWorkFile() to name the output files
    final String intermediateOutName = output.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);
    conf.set(AnySAMOutputFormat.OUTPUT_SAM_FORMAT_PROPERTY,
        SAMFormat.BAM.toString());
    conf.set(AnySAMInputFormat.TRUST_EXTS_PROPERTY, "true");
    conf.set(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, "false");

    Utils.configureSampling(workPath, intermediateOutName, conf);

    final Job job = Job.getInstance(conf, "Sam2Bam ("
        + sampleName + ", input file: " + input + ", output file: " + workPath
        + ")");

    job.setJarByClass(SAM2BAMHadoopModule.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SAMRecordWritable.class);

    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

    // Set the reducer task count
    if (getReducerTaskCount() > 0) {
      job.setNumReduceTasks(getReducerTaskCount());
    }

    // Set input paths
    FileSystem fs = input.getFileSystem(conf);
    final FileStatus status = fs.getFileStatus(input);

    if (status.isDirectory()) {

      boolean first = true;
      for (FileStatus status2 : fs.listStatus(input)) {

        Path p = status2.getPath();
        if (!p.getName().startsWith("_")) {

          FileInputFormat.addInputPath(job, p);
          if (first) {
            job.getConfiguration()
                .setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, p.toString());
          }
          context.getLogger().info("add path1: " + p);
        }
      }
    } else {
      FileInputFormat.addInputPath(job, input);
      job.getConfiguration().setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY,
          input.toString());
      context.getLogger().info("add path2: " + input);
    }

    FileOutputFormat.setOutputPath(job, workPath);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    context.getLogger().info(Utils.HEADERMERGER_INPUTS_PROPERTY + ":"
        + job.getConfiguration().get(Utils.HEADERMERGER_INPUTS_PROPERTY));

    // Sample the input keys to build the partition file used by the
    // TotalOrderPartitioner
    InputSampler.writePartitionFile(job,
        new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01,
            10000, Math.max(100, job.getNumReduceTasks())));

    return job;
  }
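  /**
   * Create the submit file read by the indexing job. The file contains one
   * tab-separated line (BAM source and BAI source) per index to create;
   * NLineInputFormat then feeds each line to its own map task (here a single
   * one), where IndexerMapper builds the index.
   * @param bamFile the BAM file to index
   * @param indexFile the BAI file to create
   * @return a DataFile with the created submit file
   * @throws IOException if an error occurs while creating the submit file
   */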
  private DataFile createSubmitFile(final DataFile bamFile,
      final DataFile indexFile) throws IOException {

    DataFile out =
        new DataFile(indexFile.getParent(), indexFile.getName() + ".submitfile");

    Writer writer = new OutputStreamWriter(out.create());
    writer.write(bamFile.getSource() + '\t' + indexFile.getSource());
    writer.close();

    return out;
  }

  /**
   * Create the index Hadoop job.
   * @param conf the Hadoop configuration
   * @param submitFile the path to the submit file
   * @param jobDescription the job description
   * @return a Job object
   * @throws IOException if an error occurs while creating the job
   */
  private Job createIndexJob(final Configuration conf,
      final DataFile submitFile, final String jobDescription)
      throws IOException {

    final Configuration jobConf = new Configuration(conf);

    // Process one line of the submit file per map task
    jobConf.set("mapreduce.input.lineinputformat.linespermap", "" + 1);

    // Create the job and set its name
    final Job job = Job.getInstance(jobConf, jobDescription);

    // Set the jar
    job.setJarByClass(IndexerMapper.class);

    // Set input path
    FileInputFormat.addInputPath(job, new Path(submitFile.getSource()));
    job.setInputFormatClass(NLineInputFormat.class);

    // Set the Mapper class
    job.setMapperClass(IndexerMapper.class);

    // Set the output key class
    job.setOutputKeyClass(NullWritable.class);

    // Set the output value class
    job.setOutputValueClass(NullWritable.class);

    // Set the output format
    job.setOutputFormatClass(NullOutputFormat.class);

    // Set the number of reducers
    job.setNumReduceTasks(0);

    return job;
  }

  /**
   * Create the BAI index.
   * @param conf the Hadoop configuration
   * @param bamFile the BAM file
   * @param indexFile the BAI file
   * @throws IOException if an error occurs while creating the index
   */
  static void createIndex(final Configuration conf, final Path bamFile,
      final Path indexFile) throws IOException {

    final InputStream in = FileSystem.get(conf).open(bamFile);

    final SamReader reader = SamReaderFactory.makeDefault()
        .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS)
        .validationStringency(ValidationStringency.DEFAULT_STRINGENCY)
        .open(SamInputResource.of(in));

    final BAMIndexer indexer = new BAMIndexer(
        indexFile.getFileSystem(conf).create(indexFile), reader.getFileHeader());

    for (SAMRecord rec : reader) {
      indexer.processAlignment(rec);
    }

    indexer.finish();

    // Close the reader (and the underlying input stream)
    reader.close();
  }

}

//
// Hadoop-BAM classes
//

final class SortReducer extends
    Reducer<LongWritable, SAMRecordWritable, NullWritable, SAMRecordWritable> {

  @Override
  protected void reduce(LongWritable ignored,
      Iterable<SAMRecordWritable> records,
      Reducer<LongWritable, SAMRecordWritable, NullWritable, SAMRecordWritable>.Context ctx)
      throws IOException, InterruptedException {

    for (SAMRecordWritable rec : records) {
      ctx.write(NullWritable.get(), rec);
    }
  }
}

// Because we want a total order and we may change the key when merging
// headers, we can't use a mapper here: the InputSampler reads directly from
// the InputFormat.
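// SortInputFormat delegates split computation and record reading to
// AnySAMInputFormat and wraps the returned reader in SortRecordReader, which
// lets Utils.correctSAMRecordForMerging() remap each record against the merged
// header and, when the reference index changes, replaces the key with the one
// computed by BAMRecordReader.getKey() (in Hadoop-BAM this key is derived from
// the record's reference index and alignment start).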
final class SortInputFormat
    extends FileInputFormat<LongWritable, SAMRecordWritable> {

  private AnySAMInputFormat baseIF = null;

  private void initBaseIF(final Configuration conf) {
    if (baseIF == null) {
      baseIF = new AnySAMInputFormat(conf);
    }
  }

  @Override
  public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(
      InputSplit split, TaskAttemptContext ctx)
      throws InterruptedException, IOException {

    initBaseIF(ContextUtil.getConfiguration(ctx));

    final RecordReader<LongWritable, SAMRecordWritable> rr =
        new SortRecordReader(baseIF.createRecordReader(split, ctx));
    rr.initialize(split, ctx);
    return rr;
  }

  @Override
  protected boolean isSplitable(JobContext job, Path path) {
    initBaseIF(ContextUtil.getConfiguration(job));
    return baseIF.isSplitable(job, path);
  }

  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    initBaseIF(ContextUtil.getConfiguration(job));
    return baseIF.getSplits(job);
  }
}

final class SortRecordReader
    extends RecordReader<LongWritable, SAMRecordWritable> {

  private final RecordReader<LongWritable, SAMRecordWritable> baseRR;

  private Configuration conf;

  public SortRecordReader(RecordReader<LongWritable, SAMRecordWritable> rr) {
    baseRR = rr;
  }

  @Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx)
      throws InterruptedException, IOException {
    conf = ContextUtil.getConfiguration(ctx);
  }

  @Override
  public void close() throws IOException {
    baseRR.close();
  }

  @Override
  public float getProgress() throws InterruptedException, IOException {
    return baseRR.getProgress();
  }

  @Override
  public LongWritable getCurrentKey() throws InterruptedException, IOException {
    return baseRR.getCurrentKey();
  }

  @Override
  public SAMRecordWritable getCurrentValue()
      throws InterruptedException, IOException {
    return baseRR.getCurrentValue();
  }

  @Override
  public boolean nextKeyValue() throws InterruptedException, IOException {

    if (!baseRR.nextKeyValue()) {
      return false;
    }

    final SAMRecord rec = getCurrentValue().get();
    final int ri = rec.getReferenceIndex();

    Utils.correctSAMRecordForMerging(rec, conf);

    if (rec.getReferenceIndex() != ri) {
      getCurrentKey().set(BAMRecordReader.getKey(rec));
    }

    return true;
  }

  //
  // Index creation MapReduce classes
  //

  /**
   * This class defines the mapper that indexes a BAM file.
   * @author Laurent Jourdren
   */
  public static final class IndexerMapper
      extends Mapper<LongWritable, Text, NullWritable, NullWritable> {

    @Override
    protected void map(final LongWritable key, final Text value,
        final Context context) throws IOException, InterruptedException {

      // Each input line contains the BAM file path and the BAI file path
      final String[] files = value.toString().split("\t");

      if (files.length != 2) {
        throw new IOException("Invalid arguments: " + value);
      }

      final Path bamFile = new Path(files[0]);
      final Path indexFile = new Path(files[1]);

      // Create index
      SAM2BAMHadoopModule.createIndex(context.getConfiguration(), bamFile,
          indexFile);
    }
  }
}