package fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.BlockCompressedStreamConstants;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.seqdoop.hadoop_bam.SAMFormat;
import org.seqdoop.hadoop_bam.cli.Utils;
import org.seqdoop.hadoop_bam.util.SAMOutputPreparer;
/**
 * Static helpers for merging SAM/BAM headers and part files produced by
 * Hadoop-BAM map-reduce jobs. All methods read their inputs through the
 * Hadoop {@link FileSystem} resolved from the given {@link Configuration}.
 *
 * <p>This class is not instantiable.
 */
public final class HadoopBamUtils {

  /** Configuration key holding the desired sort order of the merged header. */
  private static final String HEADERMERGER_SORTORDER_PROP =
      "hadoopbam.headermerger.sortorder";

  /** Utility class: prevent instantiation. */
  private HadoopBamUtils() {
  }

  /**
   * Computes the merger of the SAM headers in the files listed in
   * HEADERMERGER_INPUTS_PROPERTY. The sort order of the result is set according
   * to the last call to setHeaderMergerSortOrder, or otherwise to "unsorted".
   * The result is cached locally to prevent it from being recomputed too often.
   *
   * @param conf the Hadoop configuration listing the input files and,
   *        optionally, the sort order
   * @return the merged header wrapped in a {@link SamFileHeaderMerger}
   * @throws IOException if an input file cannot be opened or read
   * @throws IllegalStateException if HEADERMERGER_INPUTS_PROPERTY is not set
   */
  public static SamFileHeaderMerger getSAMHeaderMerger(Configuration conf)
      throws IOException {

    // TODO: it would be preferable to cache this beforehand instead of
    // having every task read the header block of every input file. But that
    // would be trickier, given that SamFileHeaderMerger isn't trivially
    // serializable.

    // getStrings() returns null when the property is absent; fail with a
    // clear message rather than an opaque NPE in the loop below
    final String[] inputs = conf.getStrings(Utils.HEADERMERGER_INPUTS_PROPERTY);
    if (inputs == null) {
      throw new IllegalStateException("The "
          + Utils.HEADERMERGER_INPUTS_PROPERTY
          + " property is not set in the configuration");
    }

    final List<SAMFileHeader> headers = new ArrayList<>();

    for (final String in : inputs) {
      final Path p = new Path(in);

      // try-with-resources: the reader (and its underlying HDFS stream) must
      // be closed even if getFileHeader() throws
      try (SamReader r = SamReaderFactory.makeDefault()
          .open(SamInputResource.of(p.getFileSystem(conf).open(p)))) {
        headers.add(r.getFileHeader());
      }
    }

    final String orderStr = conf.get(HEADERMERGER_SORTORDER_PROP);
    final SAMFileHeader.SortOrder order = orderStr == null
        ? SAMFileHeader.SortOrder.unsorted
        : SAMFileHeader.SortOrder.valueOf(orderStr);

    return new SamFileHeaderMerger(order, headers, true);
  }

  /**
   * Merges the files in the given directory that have names given by
   * getMergeableWorkFile() into out in the given SAMFormat, using
   * getSAMHeaderMerger().getMergedHeader() as the header. Outputs progress
   * reports if commandName is non-null.
   *
   * @param out the destination path for the merged SAM/BAM file
   * @param directory the directory containing the part files to merge
   * @param basePrefix prefix of the part-file names
   * @param basePostfix postfix of the part-file names
   * @param format the output format (SAM or BAM)
   * @param conf the Hadoop configuration
   * @param commandName name used in progress reports, or null for silence
   * @throws IOException if reading a part file or writing the output fails
   */
  public static void mergeSAMInto(Path out, Path directory, String basePrefix,
      String basePostfix, SAMFormat format, Configuration conf,
      String commandName) throws IOException {

    // try-with-resources: the output stream must be closed on every path,
    // including exceptions thrown while writing the header or the contents
    try (OutputStream outs = out.getFileSystem(conf).create(out)) {

      // First, place the SAM or BAM header.
      //
      // Don't use the returned stream, because we're concatenating directly
      // and don't want to apply another layer of compression to BAM.
      new SAMOutputPreparer().prepareForRecords(outs, format,
          getSAMHeaderMerger(conf).getMergedHeader());

      // Then, the actual SAM or BAM contents.
      mergeInto(outs, directory, basePrefix, basePostfix, conf, commandName);

      // And if BAM, the BGZF terminator.
      if (format == SAMFormat.BAM) {
        outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
      }
    }
  }

  /**
   * Merges the files in the given directory that have names given by
   * getMergeableWorkFile() into out. Outputs progress reports if commandName is
   * non-null. The part files are deleted after all of them have been copied.
   *
   * @param out the stream to append the part files to (not closed here)
   * @param directory the directory containing the part files to merge
   * @param basePrefix prefix of the part-file names
   * @param basePostfix postfix of the part-file names
   * @param conf the Hadoop configuration
   * @param commandName name used in progress reports, or null for silence
   * @throws IOException if reading a part file fails
   */
  public static void mergeInto(OutputStream out, Path directory,
      String basePrefix, String basePostfix, Configuration conf,
      String commandName) throws IOException {

    final FileSystem fs = directory.getFileSystem(conf);

    // part files are named <prefix><workname><postfix>-NNNNNN...
    final FileStatus[] parts = fs.globStatus(new Path(directory,
        basePrefix
            + conf.get(Utils.WORK_FILENAME_PROPERTY) + basePostfix
            + "-[0-9][0-9][0-9][0-9][0-9][0-9]*"));

    for (final FileStatus part : parts) {
      // try-with-resources: close the part's stream even if copyBytes throws
      try (InputStream in = fs.open(part.getPath())) {
        IOUtils.copyBytes(in, out, conf, false);
      }
    }

    // delete the parts only after every one has been copied successfully
    for (final FileStatus part : parts) {
      fs.delete(part.getPath(), false);
    }
  }
}