/* * The MIT License * * Copyright (c) 2010 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools.util; import htsjdk.samtools.AlignmentBlock; import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.filter.AggregateFilter; import htsjdk.samtools.filter.DuplicateReadFilter; import htsjdk.samtools.filter.FilteringIterator; import htsjdk.samtools.filter.SamRecordFilter; import htsjdk.samtools.filter.SecondaryOrSupplementaryFilter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.List; /** * Iterator that traverses a SAM File, accumulating information on a per-locus basis. * Optionally takes a target interval list, in which case the loci returned are the ones covered by * the interval list. If no target interval list, whatever loci are covered by the input reads are returned. * By default duplicate reads and non-primary alignments are filtered out. Filtering may be changed * via setSamFilters(). * * @author alecw@broadinstitute.org */ public class SamLocusIterator implements Iterable<SamLocusIterator.LocusInfo>, CloseableIterator<SamLocusIterator.LocusInfo> { private static final Log LOG = Log.getInstance(SamLocusIterator.class); /** * Holds a SAMRecord plus the zero-based offset into that SAMRecord's bases and quality scores that corresponds * to the base and quality at the genomic position described the containing LocusInfo. */ public static class RecordAndOffset { private final SAMRecord record; private final int offset; public RecordAndOffset(final SAMRecord record, final int offset) { this.offset = offset; this.record = record; } /** Zero-based offset into the read corresponding to the current position in LocusInfo */ public int getOffset() { return offset; } public SAMRecord getRecord() { return record; } public byte getReadBase() { return record.getReadBases()[offset]; } public byte getBaseQuality() { return record.getBaseQualities()[offset]; } } /** * The unit of iteration. Holds information about the locus (the SAMSequenceRecord and 1-based position * on the reference), plus List of ReadAndOffset objects, one for each read that overlaps the locus */ public static final class LocusInfo implements Locus { private final SAMSequenceRecord referenceSequence; private final int position; private final List<RecordAndOffset> recordAndOffsets = new ArrayList<RecordAndOffset>(100); LocusInfo(final SAMSequenceRecord referenceSequence, final int position) { this.referenceSequence = referenceSequence; this.position = position; } /** Accumulate info for one read at the locus. */ public void add(final SAMRecord read, final int position) { recordAndOffsets.add(new RecordAndOffset(read, position)); } public int getSequenceIndex() { return referenceSequence.getSequenceIndex(); } /** @return 1-based reference position */ public int getPosition() { return position; } public List<RecordAndOffset> getRecordAndPositions() { return Collections.unmodifiableList(recordAndOffsets); } public String getSequenceName() { return referenceSequence.getSequenceName(); } @Override public String toString() { return referenceSequence.getSequenceName() + ":" + position; } public int getSequenceLength(){return referenceSequence.getSequenceLength();} } private final SAMFileReader samReader; private final ReferenceSequenceMask referenceSequenceMask; private PeekableIterator<SAMRecord> samIterator; private List<SamRecordFilter> samFilters = Arrays.asList(new SecondaryOrSupplementaryFilter(), new DuplicateReadFilter()); private final List<Interval> intervals; private final boolean useIndex; /** * LocusInfos on this list are ready to be returned by iterator. All reads that overlap * the locus have been accumulated before the LocusInfo is moved into this list. */ private final ArrayList<LocusInfo> complete = new ArrayList<LocusInfo>(100); /** * LocusInfos for which accumulation is in progress. When {@link #accumulateSamRecord(SAMRecord)} is called * the state of this list is guaranteed to be either: * a) Empty, or * b) That the element at index 0 corresponds to the same genomic locus as the first aligned base * in the read being accumulated * * Before each new read is accumulated the accumulator is examined and: * i) any LocusInfos at positions earlier than the read start are moved to {@link #complete} * ii) any uncovered positions between the last LocusInfo and the first aligned base of the new read * have LocusInfos created and added to {@link #complete} if we are emitting uncovered loci */ private final ArrayList<LocusInfo> accumulator = new ArrayList<LocusInfo>(100); private int qualityScoreCutoff = Integer.MIN_VALUE; private int mappingQualityScoreCutoff = Integer.MIN_VALUE; private boolean includeNonPfReads = true; /** * If true, emit a LocusInfo for every locus in the target map, or if no target map, * emit a LocusInfo for every locus in the reference sequence. * If false, emit a LocusInfo only if a locus has coverage. */ private boolean emitUncoveredLoci = true; // When there is a target mask, these members remember the last locus for which a LocusInfo has been // returned, so that any uncovered locus in the target mask can be covered by a 0-coverage LocusInfo private int lastReferenceSequence = 0; private int lastPosition = 0; // Set to true when past all aligned reads in input SAM file private boolean finishedAlignedReads = false; private final LocusComparator<Locus> locusComparator = new LocusComparator<Locus>(); /** * Prepare to iterate through the given SAM records, skipping non-primary alignments. Do not use * BAM index even if available. */ public SamLocusIterator(final SAMFileReader samReader) { this(samReader, null); } /** * Prepare to iterate through the given SAM records, skipping non-primary alignments. Do not use * BAM index even if available. * * @param intervalList Either the list of desired intervals, or null. Note that if an intervalList is * passed in that is not coordinate sorted, it will eventually be coordinated sorted by this class. */ public SamLocusIterator(final SAMFileReader samReader, final IntervalList intervalList) { this(samReader, intervalList, samReader.hasIndex()); } /** * Prepare to iterate through the given SAM records, skipping non-primary alignments * * @param samReader must be coordinate sorted * @param intervalList Either the list of desired intervals, or null. Note that if an intervalList is * passed in that is not coordinate sorted, it will eventually be coordinated sorted by this class. * @param useIndex If true, do indexed lookup to improve performance. Not relevant if intervalList == null. * It is no longer the case the useIndex==true can make performance worse. It should always perform at least * as well as useIndex==false, and generally will be much faster. */ public SamLocusIterator(final SAMFileReader samReader, final IntervalList intervalList, final boolean useIndex) { if (samReader.getFileHeader().getSortOrder() == null || samReader.getFileHeader().getSortOrder() == SAMFileHeader.SortOrder.unsorted) { LOG.warn("SamLocusIterator constructed with samReader that has SortOrder == unsorted. ", "" + "Assuming SAM is coordinate sorted, but exceptions may occur if it is not."); } else if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { throw new SAMException("SamLocusIterator cannot operate on a SAM file that is not coordinate sorted."); } this.samReader = samReader; this.useIndex = useIndex; if (intervalList != null) { intervals = intervalList.getUniqueIntervals(); this.referenceSequenceMask = new IntervalListReferenceSequenceMask(intervalList); } else { intervals = null; this.referenceSequenceMask = new WholeGenomeReferenceSequenceMask(samReader.getFileHeader()); } } public Iterator<LocusInfo> iterator() { if (samIterator != null) { throw new IllegalStateException("Cannot call iterator() more than once on SamLocusIterator"); } CloseableIterator<SAMRecord> tempIterator; if (intervals != null) { tempIterator = new SamRecordIntervalIteratorFactory().makeSamRecordIntervalIterator(samReader, intervals, useIndex); } else { tempIterator = samReader.iterator(); } if (samFilters != null) { tempIterator = new FilteringIterator(tempIterator, new AggregateFilter(samFilters)); } samIterator = new PeekableIterator<SAMRecord>(tempIterator); return this; } public void close() { this.samIterator.close(); } private boolean samHasMore() { return !finishedAlignedReads && (samIterator.peek() != null); } /** * Returns true if there are more LocusInfo objects that can be returned, due to any of the following reasons: * 1) there are more aligned reads in the SAM file * 2) there are LocusInfos in some stage of accumulation * 3) there are loci in the target mask that have yet to be accumulated (even if there are no reads covering them) */ public boolean hasNext() { if (this.samIterator == null) { iterator(); } while (complete.isEmpty() && ((!accumulator.isEmpty()) || samHasMore() || hasRemainingMaskBases())) { final LocusInfo locusInfo = next(); if (locusInfo != null) { complete.add(0, locusInfo); } } return !complete.isEmpty(); } /** * Returns true if there are more bases at which the locus iterator must emit LocusInfos because * there are loci beyond the last emitted loci which are in the set of loci to be emitted and * the iterator is setup to emit uncovered loci - so we can guarantee we'll emit those loci. */ private boolean hasRemainingMaskBases() { // if there are more sequences in the mask, by definition some of them must have // marked bases otherwise if we're in the last sequence, but we're not at the last marked position, // there is also more in the mask if (!emitUncoveredLoci) { // If not emitting uncovered loci, this check is irrelevant return false; } return (lastReferenceSequence < referenceSequenceMask.getMaxSequenceIndex() || (lastReferenceSequence == referenceSequenceMask.getMaxSequenceIndex() && lastPosition < referenceSequenceMask.nextPosition(lastReferenceSequence, lastPosition))); } /** * hasNext() has been fixed so that if it returns true, next() is now guaranteed not to return null. */ public LocusInfo next() { // if we don't have any completed entries to return, try and make some! while(complete.isEmpty() && samHasMore()) { final SAMRecord rec = samIterator.peek(); // There might be unmapped reads mixed in with the mapped ones, but when a read // is encountered with no reference index it means that all the mapped reads have been seen. if (rec.getReferenceIndex() == -1) { this.finishedAlignedReads = true; continue; } // Skip over an unaligned read that has been forced to be sorted with the aligned reads if (rec.getReadUnmappedFlag() || rec.getMappingQuality() < this.mappingQualityScoreCutoff || (!this.includeNonPfReads && rec.getReadFailsVendorQualityCheckFlag())) { samIterator.next(); continue; } final Locus alignmentStart = new LocusImpl(rec.getReferenceIndex(), rec.getAlignmentStart()); // emit everything that is before the start of the current read, because we know no more // coverage will be accumulated for those loci. while (!accumulator.isEmpty() && locusComparator.compare(accumulator.get(0), alignmentStart) < 0) { final LocusInfo first = accumulator.get(0); populateCompleteQueue(alignmentStart); if (!complete.isEmpty()) { return complete.remove(0); } if (!accumulator.isEmpty() && first == accumulator.get(0)) { throw new SAMException("Stuck in infinite loop"); } } // at this point, either the accumulator list is empty or the head should // be the same position as the first base of the read if (!accumulator.isEmpty()) { if (accumulator.get(0).getSequenceIndex() != rec.getReferenceIndex() || accumulator.get(0).position != rec.getAlignmentStart()) { throw new IllegalStateException("accumulator should be empty or aligned with current SAMRecord"); } } // Store the loci for the read in the accumulator accumulateSamRecord(rec); samIterator.next(); } final Locus endLocus = new LocusImpl(Integer.MAX_VALUE, Integer.MAX_VALUE); // if we have nothing to return to the user, and we're at the end of the SAM iterator, // push everything into the complete queue if (complete.isEmpty() && !samHasMore()) { while(!accumulator.isEmpty()) { populateCompleteQueue(endLocus); if (!complete.isEmpty()) { return complete.remove(0); } } } // if there are completed entries, return those if (!complete.isEmpty()) { return complete.remove(0); } else if (emitUncoveredLoci){ final Locus afterLastMaskPositionLocus = new LocusImpl(referenceSequenceMask.getMaxSequenceIndex(), referenceSequenceMask.getMaxPosition() + 1); // In this case... we're past the last read from SAM so see if we can // fill out any more (zero coverage) entries from the mask return createNextUncoveredLocusInfo(afterLastMaskPositionLocus); } else { return null; } } /** * Capture the loci covered by the given SAMRecord in the LocusInfos in the accumulator, * creating new LocusInfos as needed. */ private void accumulateSamRecord(final SAMRecord rec) { final SAMSequenceRecord ref = getReferenceSequence(rec.getReferenceIndex()); final int alignmentStart = rec.getAlignmentStart(); final int alignmentEnd = rec.getAlignmentEnd(); final int alignmentLength = alignmentEnd - alignmentStart; // Ensure there are LocusInfos up to and including this position for (int i=accumulator.size(); i<=alignmentLength; ++i) { accumulator.add(new LocusInfo(ref, alignmentStart + i)); } final int minQuality = getQualityScoreCutoff(); final boolean dontCheckQualities = minQuality == 0; final byte[] baseQualities = dontCheckQualities ? null : rec.getBaseQualities(); // interpret the CIGAR string and add the base info for(final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) { final int readStart = alignmentBlock.getReadStart(); final int refStart = alignmentBlock.getReferenceStart(); final int blockLength = alignmentBlock.getLength(); for (int i=0; i<blockLength; ++i) { // 0-based offset into the read of the current base final int readOffset = readStart + i - 1; // 0-based offset from the aligned position of the first base in the read to the aligned position of the current base. final int refOffset = refStart + i - alignmentStart; // if the quality score cutoff is met, accumulate the base info if (dontCheckQualities || baseQualities[readOffset] >= minQuality) { accumulator.get(refOffset).add(rec, readOffset); } } } } /** * Create the next relevant zero-coverage LocusInfo * @param stopBeforeLocus don't go up to this sequence and position * @return a zero-coverage LocusInfo, or null if there is none before the stopBefore locus */ private LocusInfo createNextUncoveredLocusInfo(final Locus stopBeforeLocus) { while (lastReferenceSequence <= stopBeforeLocus.getSequenceIndex() && lastReferenceSequence <= referenceSequenceMask.getMaxSequenceIndex()) { if (lastReferenceSequence == stopBeforeLocus.getSequenceIndex() && lastPosition +1 >= stopBeforeLocus.getPosition()) { return null; } final int nextbit = referenceSequenceMask.nextPosition(lastReferenceSequence, lastPosition); // try the next reference sequence if (nextbit == -1) { // No more in this reference sequence if (lastReferenceSequence == stopBeforeLocus.getSequenceIndex()) { lastPosition = stopBeforeLocus.getPosition(); return null; } lastReferenceSequence++; lastPosition = 0; } else if (lastReferenceSequence < stopBeforeLocus.getSequenceIndex() || nextbit < stopBeforeLocus.getPosition()) { lastPosition = nextbit; return new LocusInfo(getReferenceSequence(lastReferenceSequence), lastPosition); } else if (nextbit >= stopBeforeLocus.getPosition()) { return null; } } return null; } /** * Pop the first entry from the LocusInfo accumulator into the complete queue. In addition, * check the ReferenceSequenceMask and if there are intervening mask positions between the last popped base and the one * about to be popped, put those on the complete queue as well. * Note that a single call to this method may not empty the accumulator completely, or even * empty it at all, because it may just put a zero-coverage LocusInfo into the complete queue. */ private void populateCompleteQueue(final Locus stopBeforeLocus) { // Because of gapped alignments, it is possible to create LocusInfo's with no reads associated with them. // Skip over these. while (!accumulator.isEmpty() && accumulator.get(0).getRecordAndPositions().isEmpty() && locusComparator.compare(accumulator.get(0), stopBeforeLocus) < 0) { accumulator.remove(0); } if (accumulator.isEmpty()) { return; } final LocusInfo locusInfo = accumulator.get(0); if (locusComparator.compare(stopBeforeLocus, locusInfo) <= 0) { return; } // If necessary, emit a zero-coverage LocusInfo if (emitUncoveredLoci) { final LocusInfo zeroCoverage = createNextUncoveredLocusInfo(locusInfo); if (zeroCoverage != null) { complete.add(zeroCoverage); return; } } // At this point we know we're going to process the LocusInfo, so remove it from the accumulator. accumulator.remove(0); // fill in any gaps based on our genome mask final int sequenceIndex = locusInfo.getSequenceIndex(); // only add to the complete queue if it's in the mask (or we have no mask!) if (referenceSequenceMask.get(locusInfo.getSequenceIndex(), locusInfo.getPosition())) { complete.add(locusInfo); } lastReferenceSequence = sequenceIndex; lastPosition = locusInfo.getPosition(); } private SAMSequenceRecord getReferenceSequence(final int referenceSequenceIndex) { return samReader.getFileHeader().getSequence(referenceSequenceIndex); } public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } // -------------------------------------------------------------------------------------------- // Helper methods below this point... // -------------------------------------------------------------------------------------------- /** * Controls which, if any, SAMRecords are filtered. By default duplicate reads and non-primary alignments * are filtered out. The list of filters passed here replaces any existing filters. * @param samFilters list of filters, or null if no filtering is desired. */ public void setSamFilters(final List<SamRecordFilter> samFilters) { this.samFilters = samFilters; } public int getQualityScoreCutoff() { return qualityScoreCutoff; } public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; } public int getMappingQualityScoreCutoff() { return mappingQualityScoreCutoff; } public void setMappingQualityScoreCutoff(final int mappingQualityScoreCutoff) { this.mappingQualityScoreCutoff = mappingQualityScoreCutoff; } public boolean isIncludeNonPfReads() { return includeNonPfReads; } public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; } public boolean isEmitUncoveredLoci() { return emitUncoveredLoci; } public void setEmitUncoveredLoci(final boolean emitUncoveredLoci) { this.emitUncoveredLoci = emitUncoveredLoci; } }