package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import static fr.ens.biologie.genomique.eoulsan.CommonHadoop.createConfiguration;
import static fr.ens.biologie.genomique.eoulsan.core.InputPortsBuilder.allPortsRequiredInWorkingDirectory;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_BAM;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_INDEX_BAI;
import static fr.ens.biologie.genomique.eoulsan.data.DataFormats.MAPPER_RESULTS_SAM;
import hbparquet.hadoop.util.ContextUtil;
import htsjdk.samtools.BAMIndexer;
import htsjdk.samtools.SAMFileHeader.SortOrder;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.seqdoop.hadoop_bam.AnySAMInputFormat;
import org.seqdoop.hadoop_bam.AnySAMOutputFormat;
import org.seqdoop.hadoop_bam.BAMRecordReader;
import org.seqdoop.hadoop_bam.KeyIgnoringAnySAMOutputFormat;
import org.seqdoop.hadoop_bam.SAMFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;
import org.seqdoop.hadoop_bam.cli.CLIMergingAnySAMOutputFormat;
import org.seqdoop.hadoop_bam.cli.Utils;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
import fr.ens.biologie.genomique.eoulsan.CommonHadoop;
import fr.ens.biologie.genomique.eoulsan.EoulsanException;
import fr.ens.biologie.genomique.eoulsan.annotations.HadoopOnly;
import fr.ens.biologie.genomique.eoulsan.core.InputPorts;
import fr.ens.biologie.genomique.eoulsan.core.TaskContext;
import fr.ens.biologie.genomique.eoulsan.core.TaskResult;
import fr.ens.biologie.genomique.eoulsan.core.TaskStatus;
import fr.ens.biologie.genomique.eoulsan.data.Data;
import fr.ens.biologie.genomique.eoulsan.data.DataFile;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.AbstractSAM2BAMModule;
import fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.SortRecordReader.IndexerMapper;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.HadoopJobEmergencyStopTask;
import fr.ens.biologie.genomique.eoulsan.util.hadoop.MapReduceUtils;
/**
 * This class defines a module for converting SAM files into BAM files.
* @since 2.0
* @author Laurent Jourdren
*/
@HadoopOnly
public class SAM2BAMHadoopModule extends AbstractSAM2BAMModule {
//
// Module methods
//
@Override
public InputPorts getInputPorts() {
return allPortsRequiredInWorkingDirectory(super.getInputPorts());
}
@Override
public TaskResult execute(final TaskContext context,
final TaskStatus status) {
// Create configuration object
final Configuration conf = createConfiguration();
// Get input and output data
final Data samData = context.getInputData(MAPPER_RESULTS_SAM);
final Data bamData = context.getOutputData(MAPPER_RESULTS_BAM, samData);
final Data indexData =
context.getOutputData(MAPPER_RESULTS_INDEX_BAI, samData);
// Get input and output files
final DataFile samFile = samData.getDataFile();
final DataFile bamFile = bamData.getDataFile();
final DataFile indexFile = indexData.getDataFile();
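    // The sort job writes its output into a temporary directory next to the
    // final BAM file; its work files are merged into the BAM file afterwards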
final Path bamPath = new Path(bamFile.toUri());
final Path workPath =
new Path(bamPath.getParent(), bamPath.getName() + ".tmp");
final Job job;
try {
// Create the job to run
job = createJobConf(conf, context, samData.getName(), samFile, bamFile,
workPath);
// Submit main job
MapReduceUtils.submitAndWaitForJob(job, samData.getName(),
CommonHadoop.CHECK_COMPLETION_TIME, status, COUNTER_GROUP);
} catch (IOException | ClassNotFoundException | InterruptedException
| EoulsanException e) {
return status.createTaskResult(e);
}
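    // Merge the sorted work files produced by the job into the final BAM file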
try {
HadoopBamUtils.mergeSAMInto(bamPath, workPath, "", "", SAMFormat.BAM,
job.getConfiguration(), "sort");
} catch (IOException e) {
return status.createTaskResult(e);
}
    // Create and run the indexing Hadoop job
try {
// Create the indexer submit file
final DataFile indexerSubmitFile = createSubmitFile(bamFile, indexFile);
// Create the indexer job
final Job indexingJob = createIndexJob(conf, indexerSubmitFile,
"Create " + indexFile + " index file");
// Submit the Hadoop job
indexingJob.submit();
      // Add the Hadoop job to the list of jobs to kill if the workflow fails
      HadoopJobEmergencyStopTask.addHadoopJobEmergencyStopTask(indexingJob);
      // Wait for the end of the job in non-verbose mode
      indexingJob.waitForCompletion(false);
      // Remove the Hadoop job from the list of jobs to kill if the workflow
      // fails
      HadoopJobEmergencyStopTask.removeHadoopJobEmergencyStopTask(indexingJob);
if (!indexingJob.isSuccessful()) {
throw new IOException("Error while running Hadoop job for creating "
+ indexFile + " index file");
}
// Delete the indexer submit file
indexerSubmitFile.delete();
} catch (IOException | ClassNotFoundException | InterruptedException e) {
return status.createTaskResult(e);
}
return status.createTaskResult();
}
/**
* Create the sam2bam job.
* @param conf Hadoop configuration
* @param context Step context
   * @param sampleName sample name
* @param samFile SAM file
* @param bamFile BAM file
* @param workPath work path
   * @return a Hadoop Job instance
* @throws IOException if an error occurs while creating the job
* @throws ClassNotFoundException if an error occurs while creating the job
* @throws InterruptedException if an error occurs while creating the job
*/
private Job createJobConf(final Configuration conf, final TaskContext context,
final String sampleName, final DataFile samFile, final DataFile bamFile,
final Path workPath)
throws IOException, ClassNotFoundException, InterruptedException {
final ValidationStringency stringency =
ValidationStringency.DEFAULT_STRINGENCY;
Path input = new Path(samFile.toUri());
Path output = new Path(bamFile.toUri());
context.getLogger().info("Input SAM path: " + input);
context.getLogger().info("Output BAM path: " + output);
context.getLogger().info("Working path: " + workPath);
Utils.setHeaderMergerSortOrder(conf, SortOrder.coordinate);
    if (stringency != null) {
      conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY,
          stringency.toString());
    }
// Used by Utils.getMergeableWorkFile() to name the output files.
final String intermediateOutName = output.getName();
conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);
conf.set(AnySAMOutputFormat.OUTPUT_SAM_FORMAT_PROPERTY,
SAMFormat.BAM.toString());
conf.set(AnySAMInputFormat.TRUST_EXTS_PROPERTY, "true");
conf.set(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, "false");
Utils.configureSampling(workPath, intermediateOutName, conf);
final Job job = Job.getInstance(conf,
"Sam2Bam ("
+ sampleName + ", input file: " + input + ", output file: "
+ workPath + ")");
job.setJarByClass(SAM2BAMHadoopModule.class);
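    // Use the identity mapper: the records are sorted during the shuffle,
    // using the keys generated by SortInputFormat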
job.setMapperClass(Mapper.class);
job.setReducerClass(SortReducer.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(SAMRecordWritable.class);
job.setInputFormatClass(SortInputFormat.class);
job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);
// Set the reducer task count
if (getReducerTaskCount() > 0) {
job.setNumReduceTasks(getReducerTaskCount());
}
// Set input paths
FileSystem fs = input.getFileSystem(conf);
final FileStatus status = fs.getFileStatus(input);
    if (status.isDirectory()) {
      // Add all the files of the directory (except "_*" files) as input paths
      boolean first = true;
      for (FileStatus status2 : fs.listStatus(input)) {
        Path p = status2.getPath();
        if (!p.getName().startsWith("_")) {
          FileInputFormat.addInputPath(job, p);
          if (first) {
            // Register only the first file for the header merger, assuming
            // all the files of the directory share the same header
            job.getConfiguration()
                .setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, p.toString());
            first = false;
          }
          context.getLogger().info("Add input path: " + p);
        }
      }
    } else {
      FileInputFormat.addInputPath(job, input);
      job.getConfiguration().setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY,
          input.toString());
      context.getLogger().info("Add input path: " + input);
    }
FileOutputFormat.setOutputPath(job, workPath);
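    // Use a total order partitioner, configured from a random sample of the
    // input keys, so that the output of the reducers is globally sorted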
job.setPartitionerClass(TotalOrderPartitioner.class);
context.getLogger().info(Utils.HEADERMERGER_INPUTS_PROPERTY
+ ":" + job.getConfiguration().get(Utils.HEADERMERGER_INPUTS_PROPERTY));
InputSampler.writePartitionFile(job,
new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01,
10000, Math.max(100, job.getNumReduceTasks())));
return job;
}
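  /**
   * Create the submit file for the indexing job. The file contains a single
   * line with the path of the BAM file and the path of the index file,
   * separated by a tab.
   * @param bamFile the BAM file
   * @param indexFile the BAI index file
   * @return the submit file as a DataFile
   * @throws IOException if an error occurs while creating the submit file
   */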
private DataFile createSubmitFile(final DataFile bamFile,
final DataFile indexFile) throws IOException {
DataFile out = new DataFile(indexFile.getParent(),
indexFile.getName() + ".submitfile");
Writer writer = new OutputStreamWriter(out.create());
writer.write(bamFile.getSource() + '\t' + indexFile.getSource());
writer.close();
return out;
}
/**
* Create the index Hadoop job.
* @param conf the Hadoop configuration
* @param submitFile the path to the submit file
* @param jobDescription the job description
* @return a Job object
* @throws IOException if an error occurs while creating the index
*/
private Job createIndexJob(final Configuration conf,
final DataFile submitFile, final String jobDescription)
throws IOException {
final Configuration jobConf = new Configuration(conf);
    // Process only one line of the submit file per map task
    jobConf.set(NLineInputFormat.LINES_PER_MAP, "1");
    // Create the job and set its name
    final Job job = Job.getInstance(jobConf, jobDescription);
// Set the jar
job.setJarByClass(IndexerMapper.class);
// Set input path
FileInputFormat.addInputPath(job, new Path(submitFile.getSource()));
job.setInputFormatClass(NLineInputFormat.class);
// Set the Mapper class
job.setMapperClass(IndexerMapper.class);
// Set the output key class
job.setOutputKeyClass(NullWritable.class);
// Set the output value class
job.setOutputValueClass(NullWritable.class);
// Set the output format
job.setOutputFormatClass(NullOutputFormat.class);
// Set the number of reducers
job.setNumReduceTasks(0);
return job;
}
/**
* Create the BAI index.
* @param conf the Hadoop configuration
* @param bamFile the BAM file
* @param indexFile the BAI file
* @throws IOException if an error occurs while creating the index
*/
static void createIndex(final Configuration conf, final Path bamFile,
final Path indexFile) throws IOException {
final InputStream in = FileSystem.get(conf).open(bamFile);
final SamReader reader = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS)
.validationStringency(ValidationStringency.DEFAULT_STRINGENCY)
.open(SamInputResource.of(in));
final BAMIndexer indexer =
new BAMIndexer(indexFile.getFileSystem(conf).create(indexFile),
reader.getFileHeader());
    for (SAMRecord rec : reader) {
      indexer.processAlignment(rec);
    }
    indexer.finish();
    // Close the reader and its underlying input stream once indexing is done
    reader.close();
}
}
//
// Hadoop-BAM classes
//
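/**
 * This reducer writes the records it receives in key (alignment position)
 * order and drops the keys: the output only contains the sorted SAM records.
 */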
final class SortReducer extends
Reducer<LongWritable, SAMRecordWritable, NullWritable, SAMRecordWritable> {
@Override
protected void reduce(LongWritable ignored,
Iterable<SAMRecordWritable> records,
Reducer<LongWritable, SAMRecordWritable, NullWritable, SAMRecordWritable>.Context ctx)
throws IOException, InterruptedException {
for (SAMRecordWritable rec : records)
ctx.write(NullWritable.get(), rec);
}
}
// Because we want a total order and we may change the key when merging
// headers, we can't use a mapper here: the InputSampler reads directly from
// the InputFormat.
final class SortInputFormat
extends FileInputFormat<LongWritable, SAMRecordWritable> {
private AnySAMInputFormat baseIF = null;
private void initBaseIF(final Configuration conf) {
if (baseIF == null)
baseIF = new AnySAMInputFormat(conf);
}
@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(
InputSplit split, TaskAttemptContext ctx)
throws InterruptedException, IOException {
initBaseIF(ContextUtil.getConfiguration(ctx));
final RecordReader<LongWritable, SAMRecordWritable> rr =
new SortRecordReader(baseIF.createRecordReader(split, ctx));
rr.initialize(split, ctx);
return rr;
}
@Override
protected boolean isSplitable(JobContext job, Path path) {
initBaseIF(ContextUtil.getConfiguration(job));
return baseIF.isSplitable(job, path);
}
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
initBaseIF(ContextUtil.getConfiguration(job));
return baseIF.getSplits(job);
}
}
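/**
 * This record reader wraps the record reader of the underlying input format
 * and corrects each SAM record for the merged header, recomputing its sort
 * key when the reference index has changed.
 */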
final class SortRecordReader
extends RecordReader<LongWritable, SAMRecordWritable> {
private final RecordReader<LongWritable, SAMRecordWritable> baseRR;
private Configuration conf;
public SortRecordReader(RecordReader<LongWritable, SAMRecordWritable> rr) {
baseRR = rr;
}
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx)
throws InterruptedException, IOException {
conf = ContextUtil.getConfiguration(ctx);
}
@Override
public void close() throws IOException {
baseRR.close();
}
@Override
public float getProgress() throws InterruptedException, IOException {
return baseRR.getProgress();
}
@Override
public LongWritable getCurrentKey() throws InterruptedException, IOException {
return baseRR.getCurrentKey();
}
@Override
public SAMRecordWritable getCurrentValue()
throws InterruptedException, IOException {
return baseRR.getCurrentValue();
}
@Override
public boolean nextKeyValue() throws InterruptedException, IOException {
if (!baseRR.nextKeyValue())
return false;
final SAMRecord rec = getCurrentValue().get();
final int ri = rec.getReferenceIndex();
Utils.correctSAMRecordForMerging(rec, conf);
if (rec.getReferenceIndex() != ri)
getCurrentKey().set(BAMRecordReader.getKey(rec));
return true;
}
  //
  // Index creation MapReduce classes
  //
/**
   * This class defines the mapper that indexes a BAM file.
* @author Laurent Jourdren
*/
public static final class IndexerMapper
extends Mapper<LongWritable, Text, NullWritable, NullWritable> {
@Override
protected void map(final LongWritable key, final Text value,
final Context context) throws IOException, InterruptedException {
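      // Each line of the submit file contains the path of a BAM file and the
      // path of its index file, separated by a tab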
final String[] files = value.toString().split("\t");
if (files.length != 2) {
throw new IOException("Invalid arguments: " + value);
}
final Path bamFile = new Path(files[0]);
final Path indexFile = new Path(files[1]);
// Create index
SAM2BAMHadoopModule.createIndex(context.getConfiguration(), bamFile,
indexFile);
}
}
}