package picard.sam;
import htsjdk.samtools.*;
import htsjdk.samtools.SAMFileHeader.SortOrder;
import htsjdk.samtools.filter.OverclippedReadFilter;
import htsjdk.samtools.util.*;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import picard.PicardException;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* Class that takes in a set of alignment information in SAM format and merges it with the set
* of all reads for which alignment was attempted, stored in an unmapped SAM file. This
* class overrides mergeAlignment in AbstractAlignmentNMerger and proceeds on the assumption that
* the underlying alignment records are aleady in query-name sorted order (true for bwa). If
* they are not, the mergeAlignment method catches the IllegalStateException, forces a sort
* of the underlying alignment records, and tries again.
*
* @author ktibbett@broadinstitute.org
*/
public class SamAlignmentMerger extends AbstractAlignmentMerger {
private final Log log = Log.getInstance(SamAlignmentMerger.class);
private final List<File> alignedSamFile;
private final List<File> read1AlignedSamFile;
private final List<File> read2AlignedSamFile;
private final int maxGaps;
private final int minUnclippedBases;
private boolean forceSort = false;
private final OverclippedReadFilter contaminationFilter;
private final List<String> requiredMatchingDictionaryTags;
private SAMSequenceDictionary alignedSamDictionary;
/**
* Constructor with a default value for unmappingReadStrategy
*
*/
public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
final boolean alignedReadsOnly,
final List<File> alignedSamFile, final int maxGaps, final List<String> attributesToRetain,
final List<String> attributesToRemove,
final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
final List<File> read1AlignedSamFile, final List<File> read2AlignedSamFile,
final List<SamPairUtil.PairOrientation> expectedOrientations,
final SortOrder sortOrder,
final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
final boolean addMateCigar,
final boolean unmapContaminantReads,
final int minUnclippedBases) {
this(unmappedBamFile,
targetBamFile,
referenceFasta,
programRecord,
clipAdapters,
bisulfiteSequence,
alignedReadsOnly,
alignedSamFile,
maxGaps,
attributesToRetain,
attributesToRemove,
read1BasesTrimmed,
read2BasesTrimmed,
read1AlignedSamFile,
read2AlignedSamFile,
expectedOrientations,
sortOrder,
primaryAlignmentSelectionStrategy,
addMateCigar,
unmapContaminantReads,
minUnclippedBases,
UnmappingReadStrategy.DO_NOT_CHANGE,
SAMSequenceDictionary.DEFAULT_DICTIONARY_EQUAL_TAG
);
}
/**
* Constructor
*
* @param unmappedBamFile The BAM file that was used as the input to the aligner, which will
* include info on all the reads that did not map. Required.
* @param targetBamFile The file to which to write the merged SAM records. Required.
* @param referenceFasta The reference sequence for the map files. Required.
* @param programRecord Program record for taget file SAMRecords created.
* @param clipAdapters Whether adapters marked in unmapped BAM file should be marked as
* soft clipped in the merged bam. Required.
* @param bisulfiteSequence Whether the reads are bisulfite sequence (used when calculating the
* NM and UQ tags). Required.
* @param alignedReadsOnly Whether to output only those reads that have alignment data
* @param alignedSamFile The SAM file(s) with alignment information. Optional. If this is
* not provided, then read1AlignedSamFile and read2AlignedSamFile must be.
* @param maxGaps The maximum number of insertions or deletions permitted in an
* alignment. Alignments with more than this many gaps will be ignored.
* -1 means to allow any number of gaps.
* @param attributesToRetain attributes from the alignment record that should be
* retained when merging, overridden by attributesToRemove if they share
* common tags.
* @param attributesToRemove attributes from the alignment record that should be
* removed when merging. This overrides attributesToRetain if they share
* common tags.
* @param read1BasesTrimmed The number of bases trimmed from start of read 1 prior to alignment. Optional.
* @param read2BasesTrimmed The number of bases trimmed from start of read 2 prior to alignment. Optional.
* @param read1AlignedSamFile The alignment records for read1. Used when the two ends of a read are
* aligned separately. This is optional, but must be specified if
* alignedSamFile is not.
* @param read2AlignedSamFile The alignment records for read1. Used when the two ends of a read are
* aligned separately. This is optional, but must be specified if
* alignedSamFile is not.
* @param expectedOrientations A List of SamPairUtil.PairOrientations that are expected for
* aligned pairs. Used to determine the properPair flag.
* @param sortOrder The order in which the merged records should be output. If null,
* output will be coordinate-sorted
* @param primaryAlignmentSelectionStrategy How to handle multiple alignments for a fragment or read pair,
* in which none are primary, or more than one is marked primary
* @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include.
* @param unmapContaminantReads If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases),
* and mark them as unmapped.
* @param minUnclippedBases If unmapContaminantReads is set, require this many unclipped bases or else the read will be marked as contaminant.
* @param unmappingReadStrategy An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species
* contamination). Ignored unless unmapContaminantReads is true.
* @param requiredMatchingDictionaryTags A list of SAMSequenceRecord tags that must be equal (if present) in the aligned bam and the reference dictionary.
* Program will issue a warning about other tags, if present in both files and are different.
*/
public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta,
final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence,
final boolean alignedReadsOnly,
final List<File> alignedSamFile, final int maxGaps, final List<String> attributesToRetain,
final List<String> attributesToRemove,
final Integer read1BasesTrimmed, final Integer read2BasesTrimmed,
final List<File> read1AlignedSamFile, final List<File> read2AlignedSamFile,
final List<SamPairUtil.PairOrientation> expectedOrientations,
final SortOrder sortOrder,
final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy,
final boolean addMateCigar,
final boolean unmapContaminantReads,
final int minUnclippedBases,
final UnmappingReadStrategy unmappingReadStrategy,
final List<String> requiredMatchingDictionaryTags) {
super(unmappedBamFile, targetBamFile, referenceFasta, clipAdapters, bisulfiteSequence,
alignedReadsOnly, programRecord, attributesToRetain, attributesToRemove, read1BasesTrimmed,
read2BasesTrimmed, expectedOrientations, sortOrder, primaryAlignmentSelectionStrategy, addMateCigar,
unmapContaminantReads, unmappingReadStrategy);
if ((alignedSamFile == null || alignedSamFile.isEmpty()) &&
(read1AlignedSamFile == null || read1AlignedSamFile.isEmpty() ||
read2AlignedSamFile == null || read2AlignedSamFile.isEmpty())) {
throw new IllegalArgumentException("Either alignedSamFile or BOTH of read1AlignedSamFile and " +
"read2AlignedSamFile must be specified.");
}
if (alignedSamFile != null) {
alignedSamFile.forEach(IOUtil::assertFileIsReadable);
} else {
read1AlignedSamFile.forEach(IOUtil::assertFileIsReadable);
read2AlignedSamFile.forEach(IOUtil::assertFileIsReadable);
}
this.alignedSamFile = alignedSamFile;
this.read1AlignedSamFile = read1AlignedSamFile;
this.read2AlignedSamFile = read2AlignedSamFile;
this.maxGaps = maxGaps;
this.minUnclippedBases = minUnclippedBases;
this.contaminationFilter = new OverclippedReadFilter(minUnclippedBases, false);
this.requiredMatchingDictionaryTags = requiredMatchingDictionaryTags;
log.info("Processing SAM file(s): " + ((alignedSamFile != null) ? alignedSamFile : (read1AlignedSamFile + "," + read2AlignedSamFile)));
}
/**
* Merges the alignment from the map file with the non-aligned records from the source BAM file.
* Overrides mergeAlignment in AbstractAlignmentMerger. Tries first to proceed on the assumption
* that the alignment records are pre-sorted. If not, catches the exception, forces a sort, and
* tries again.
*/
public void mergeAlignment(final File referenceFasta) {
try {
super.mergeAlignment(referenceFasta);
} catch (final IllegalStateException ise) {
log.warn("Exception merging bam alignment - attempting to sort aligned reads and try again: ", ise.getMessage());
forceSort = true;
resetRefSeqFileWalker();
super.mergeAlignment(referenceFasta);
}
}
@Override
protected SAMSequenceDictionary getDictionaryForMergedBam() {
SAMSequenceDictionary referenceDict = SAMSequenceDictionaryExtractor.extractDictionary(referenceFasta);
if (referenceDict == null) {
throw new PicardException("No sequence dictionary found for " + referenceFasta.getAbsolutePath() +
". Use Picard's CreateSequenceDictionary to create a sequence dictionary.");
}
return SAMSequenceDictionary.mergeDictionaries(alignedSamDictionary, referenceDict, requiredMatchingDictionaryTags);
}
/**
* Reads the aligned SAM records into a SortingCollection and returns an iterator over that collection
*/
protected CloseableIterator<SAMRecord> getQuerynameSortedAlignedRecords() {
final CloseableIterator<SAMRecord> mergingIterator;
final SAMFileHeader header;
// When the alignment records, including both ends of a pair, are in SAM files
if (alignedSamFile != null && !alignedSamFile.isEmpty()) {
final List<SAMFileHeader> headers = new ArrayList<>(alignedSamFile.size());
final List<SamReader> readers = new ArrayList<>(alignedSamFile.size());
for (final File f : this.alignedSamFile) {
final SamReader r = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(f);
headers.add(r.getFileHeader());
readers.add(r);
// As we're going through and opening the aligned files, if we don't have a @PG yet
// and there is only a single one in the input file, use that!
if (getProgramRecord() == null && r.getFileHeader().getProgramRecords().size() == 1) {
setProgramRecord(r.getFileHeader().getProgramRecords().iterator().next());
}
}
// assert that all the dictionaries are the same before merging the headers.
alignedSamDictionary = headers.get(0).getSequenceDictionary();
headers.stream()
.map(SAMFileHeader::getSequenceDictionary)
.forEach(alignedSamDictionary::assertSameDictionary);
final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.queryname, headers, false);
mergingIterator = new MergingSamRecordIterator(headerMerger, readers, true);
header = headerMerger.getMergedHeader();
}
// When the ends are aligned separately and don't have firstOfPair information correctly
// set we use this branch.
else {
// this merging iterator already asserts that all the headers are the same
mergingIterator = new SeparateEndAlignmentIterator(this.read1AlignedSamFile, this.read2AlignedSamFile, referenceFasta);
header = ((SeparateEndAlignmentIterator) mergingIterator).getHeader();
alignedSamDictionary = header.getSequenceDictionary();
// As we're going through and opening the aligned files, if we don't have a @PG yet
// and there is only a single one in the input file, use that!
if (getProgramRecord() == null && header.getProgramRecords().size() == 1) {
setProgramRecord(header.getProgramRecords().iterator().next());
}
}
if (!forceSort) {
return mergingIterator;
}
final SortingCollection<SAMRecord> alignmentSorter = SortingCollection.newInstance(SAMRecord.class,
new BAMRecordCodec(header), new SAMRecordQueryNameComparator(), MAX_RECORDS_IN_RAM);
int count = 0;
while (mergingIterator.hasNext()) {
alignmentSorter.add(mergingIterator.next());
count++;
if (count > 0 && count % 1000000 == 0) {
log.info("Read " + count + " records from alignment SAM/BAM.");
}
}
log.info("Finished reading " + count + " total records from alignment SAM/BAM.");
mergingIterator.close();
return new DelegatingIterator<SAMRecord>(alignmentSorter.iterator()) {
@Override
public void close() {
super.close();
alignmentSorter.cleanup();
}
};
}
private final class SuffixTrimingSamRecordIterator implements CloseableIterator<SAMRecord> {
private final CloseableIterator<SAMRecord> underlyingIterator;
private final String suffixToTrim;
private SuffixTrimingSamRecordIterator(final CloseableIterator<SAMRecord> underlyingIterator, final String suffixToTrim) {
this.underlyingIterator = underlyingIterator;
this.suffixToTrim = suffixToTrim;
}
@Override
public void close() {
underlyingIterator.close();
}
@Override
public boolean hasNext() {
return underlyingIterator.hasNext();
}
@Override
public SAMRecord next() {
final SAMRecord rec = underlyingIterator.next();
final String readName = rec.getReadName();
if (readName.endsWith(suffixToTrim)) {
rec.setReadName(readName.substring(0, readName.length() - suffixToTrim.length()));
}
return rec;
}
@Override
public void remove() {
underlyingIterator.remove();
}
}
private class SeparateEndAlignmentIterator implements CloseableIterator<SAMRecord> {
private final PeekableIterator<SAMRecord> read1Iterator;
private final PeekableIterator<SAMRecord> read2Iterator;
private final SAMFileHeader header;
public SeparateEndAlignmentIterator(final List<File> read1Alignments, final List<File> read2Alignments, File referenceFasta) {
final List<SAMFileHeader> headers = new ArrayList<>();
final List<SamReader> read1 = new ArrayList<>(read1Alignments.size());
final List<SamReader> read2 = new ArrayList<>(read2Alignments.size());
for (final File f : read1Alignments) {
final SamReader r = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(f);
headers.add(r.getFileHeader());
read1.add(r);
}
for (final File f : read2Alignments) {
final SamReader r = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(f);
headers.add(r.getFileHeader());
read2.add(r);
}
final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
read1Iterator = new PeekableIterator<>(
new SuffixTrimingSamRecordIterator(new MergingSamRecordIterator(headerMerger, read1, true), "/1"));
read2Iterator = new PeekableIterator<>(
new SuffixTrimingSamRecordIterator(new MergingSamRecordIterator(headerMerger, read2, true), "/2"));
header = headerMerger.getMergedHeader();
}
public void close() {
read1Iterator.close();
read2Iterator.close();
}
public boolean hasNext() {
return read1Iterator.hasNext() || read2Iterator.hasNext();
}
public SAMRecord next() {
if (read1Iterator.hasNext()) {
if (read2Iterator.hasNext()) {
return (read1Iterator.peek().getReadName().compareTo(read2Iterator.peek().getReadName()) <= 0)
? setPairFlags(read1Iterator.next(), true)
: setPairFlags(read2Iterator.next(), false);
} else {
return setPairFlags(read1Iterator.next(), true);
}
} else {
return setPairFlags(read2Iterator.next(), false);
}
}
public void remove() {
throw new UnsupportedOperationException("remove() not supported");
}
public SAMFileHeader getHeader() { return this.header; }
private SAMRecord setPairFlags(final SAMRecord sam, final boolean firstOfPair) {
sam.setReadPairedFlag(true);
sam.setFirstOfPairFlag(firstOfPair);
sam.setSecondOfPairFlag(!firstOfPair);
return sam;
}
}
/**
* For now, we only ignore those alignments that have more than <code>maxGaps</code> insertions
* or deletions.
*/
protected boolean ignoreAlignment(final SAMRecord sam) {
if (maxGaps == -1) return false;
int gaps = 0;
for (final CigarElement el : sam.getCigar().getCigarElements()) {
if (el.getOperator() == CigarOperator.I || el.getOperator() == CigarOperator.D) {
gaps++;
}
}
return gaps > maxGaps;
}
/**
* Criteria for contaminant reads:
* 1. primary alignment has fewer than minUnclippedBases unclipped bases
* 2. primary alignment has both ends clipped
* 3. for pairs, at least one end of primary alignment meets above criteria
*/
protected boolean isContaminant(final HitsForInsert hits) {
boolean isContaminant = false;
if (hits.numHits() > 0) {
final int primaryIndex = hits.getIndexOfEarliestPrimary();
if (primaryIndex < 0) throw new IllegalStateException("No primary alignment was found, despite having nonzero hits.");
final SAMRecord primaryRead1 = hits.getFirstOfPair(primaryIndex);
final SAMRecord primaryRead2 = hits.getSecondOfPair(primaryIndex);
if (primaryRead1 != null && primaryRead2 != null) isContaminant = contaminationFilter.filterOut(primaryRead1, primaryRead2);
else if (primaryRead1 != null) isContaminant = contaminationFilter.filterOut(primaryRead1);
else if (primaryRead2 != null) isContaminant = contaminationFilter.filterOut(primaryRead2);
else throw new IllegalStateException("Neither read1 or read2 exist for chosen primary alignment");
}
return isContaminant;
}
// Accessor for testing
public boolean getForceSort() {return this.forceSort; }
}