/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BinaryCodec; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.StringLineReader; import java.io.DataInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.NoSuchElementException; /** * Class for reading and querying BAM files. */ class BAMFileReader extends SAMFileReader.ReaderImplementation { // True if reading from a File rather than an InputStream private boolean mIsSeekable = false; // For converting bytes into other primitive types private BinaryCodec mStream = null; // Underlying compressed data stream. private final BlockCompressedInputStream mCompressedInputStream; private SAMFileHeader mFileHeader = null; // One of these is populated if the file is seekable and an index exists private File mIndexFile = null; private SeekableStream mIndexStream = null; private BAMIndex mIndex = null; private long mFirstRecordPointer = 0; // If non-null, there is an unclosed iterator extant. private CloseableIterator<SAMRecord> mCurrentIterator = null; // If true, all SAMRecords are fully decoded as they are read. private boolean eagerDecode; // For error-checking. private ValidationStringency mValidationStringency; // For creating BAMRecords private SAMRecordFactory samRecordFactory; /** * Use the caching index reader implementation rather than the disk-hit-per-file model. */ private boolean mEnableIndexCaching = false; /** * Use the traditional memory-mapped implementation for BAM file indexes rather than regular I/O. */ private boolean mEnableIndexMemoryMapping = true; /** * Add information about the origin (reader and position) to SAM records. */ private SamReader mReader = null; /** * Prepare to read BAM from a stream (not seekable) * @param stream source of bytes. * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. * @param validationStringency Controls how to handle invalidate reads or header lines. */ BAMFileReader(final InputStream stream, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency, final SAMRecordFactory factory) throws IOException { mIndexFile = indexFile; mIsSeekable = false; mCompressedInputStream = new BlockCompressedInputStream(stream); mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); this.eagerDecode = eagerDecode; this.mValidationStringency = validationStringency; this.samRecordFactory = factory; this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, null); } /** * Prepare to read BAM from a file (seekable) * @param file source of bytes. * @param eagerDecode if true, decode all BAM fields as reading rather than lazily. * @param validationStringency Controls how to handle invalidate reads or header lines. */ BAMFileReader(final File file, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency, final SAMRecordFactory factory) throws IOException { this(new BlockCompressedInputStream(file), indexFile!=null ? indexFile : SamFiles.findIndex(file), eagerDecode, file.getAbsolutePath(), validationStringency, factory); if (mIndexFile != null && mIndexFile.lastModified() < file.lastModified()) { System.err.println("WARNING: BAM index file " + mIndexFile.getAbsolutePath() + " is older than BAM " + file.getAbsolutePath()); } // Provide better error message when there is an error reading. mStream.setInputFileName(file.getAbsolutePath()); } BAMFileReader(final SeekableStream strm, final File indexFile, final boolean eagerDecode, final ValidationStringency validationStringency, final SAMRecordFactory factory) throws IOException { this(new BlockCompressedInputStream(strm), indexFile, eagerDecode, strm.getSource(), validationStringency, factory); } BAMFileReader(final SeekableStream strm, final SeekableStream indexStream, final boolean eagerDecode, final ValidationStringency validationStringency, final SAMRecordFactory factory) throws IOException { this(new BlockCompressedInputStream(strm), indexStream, eagerDecode, strm.getSource(), validationStringency, factory); } private BAMFileReader(final BlockCompressedInputStream compressedInputStream, final File indexFile, final boolean eagerDecode, final String source, final ValidationStringency validationStringency, final SAMRecordFactory factory) throws IOException { mIndexFile = indexFile; mIsSeekable = true; mCompressedInputStream = compressedInputStream; mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); this.eagerDecode = eagerDecode; this.mValidationStringency = validationStringency; this.samRecordFactory = factory; this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source); mFirstRecordPointer = mCompressedInputStream.getFilePointer(); } private BAMFileReader(final BlockCompressedInputStream compressedInputStream, final SeekableStream indexStream, final boolean eagerDecode, final String source, final ValidationStringency validationStringency, final SAMRecordFactory factory) throws IOException { mIndexStream = indexStream; mIsSeekable = true; mCompressedInputStream = compressedInputStream; mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); this.eagerDecode = eagerDecode; this.mValidationStringency = validationStringency; this.samRecordFactory = factory; this.mFileHeader = readHeader(this.mStream, this.mValidationStringency, source); mFirstRecordPointer = mCompressedInputStream.getFilePointer(); } /** Reads through the header and sequence records to find the virtual file offset of the first record in the BAM file. */ static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException { final BAMFileReader reader = new BAMFileReader(bam, null, false, ValidationStringency.SILENT, new DefaultSAMRecordFactory()); final long offset = reader.mFirstRecordPointer; reader.close(); return offset; } /** * If true, writes the source of every read into the source SAMRecords. * @param enabled true to write source information into each SAMRecord. */ void enableFileSource(final SamReader reader, final boolean enabled) { this.mReader = enabled ? reader : null; } /** * If true, uses the caching version of the index reader. * @param enabled true to write source information into each SAMRecord. */ protected void enableIndexCaching(final boolean enabled) { if(mIndex != null) throw new SAMException("Unable to turn on index caching; index file has already been loaded."); this.mEnableIndexCaching = enabled; } /** * If false, disable the use of memory mapping for accessing index files (default behavior is to use memory mapping). * This is slower but more scalable when accessing large numbers of BAM files sequentially. * @param enabled True to use memory mapping, false to use regular I/O. */ protected void enableIndexMemoryMapping(final boolean enabled) { if (mIndex != null) { throw new SAMException("Unable to change index memory mapping; index file has already been loaded."); } this.mEnableIndexMemoryMapping = enabled; } @Override void enableCrcChecking(final boolean enabled) { this.mCompressedInputStream.setCheckCrcs(enabled); } @Override void setSAMRecordFactory(final SAMRecordFactory factory) { this.samRecordFactory = factory; } @Override public SamReader.Type type() { return SamReader.Type.BAM_TYPE; } /** * @return true if ths is a BAM file, and has an index */ public boolean hasIndex() { return mIsSeekable && ((mIndexFile != null) || (mIndexStream != null)); } /** * Retrieves the index for the given file type. Ensure that the index is of the specified type. * @return An index of the given type. */ public BAMIndex getIndex() { if(!hasIndex()) throw new SAMException("No index is available for this BAM file."); if(mIndex == null) { if (mIndexFile != null) mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping) : new DiskBasedBAMFileIndex(mIndexFile, getFileHeader().getSequenceDictionary(), mEnableIndexMemoryMapping); else mIndex = mEnableIndexCaching ? new CachingBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()) : new DiskBasedBAMFileIndex(mIndexStream, getFileHeader().getSequenceDictionary()); } return mIndex; } public void setEagerDecode(final boolean desired) { this.eagerDecode = desired; } public void close() { if (mStream != null) { mStream.close(); } if (mIndex != null) { mIndex.close(); } mStream = null; mFileHeader = null; mIndex = null; } public SAMFileHeader getFileHeader() { return mFileHeader; } /** * Set error-checking level for subsequent SAMRecord reads. */ void setValidationStringency(final ValidationStringency validationStringency) { this.mValidationStringency = validationStringency; } public ValidationStringency getValidationStringency() { return this.mValidationStringency; } /** * Prepare to iterate through the SAMRecords in file order. * Only a single iterator on a BAM file can be extant at a time. If getIterator() or a query method has been called once, * that iterator must be closed before getIterator() can be called again. * A somewhat peculiar aspect of this method is that if the file is not seekable, a second call to * getIterator() begins its iteration where the last one left off. That is the best that can be * done in that situation. */ public CloseableIterator<SAMRecord> getIterator() { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (mIsSeekable) { try { mCompressedInputStream.seek(mFirstRecordPointer); } catch (final IOException exc) { throw new RuntimeException(exc.getMessage(), exc); } } mCurrentIterator = new BAMFileIterator(); return mCurrentIterator; } @Override public CloseableIterator<SAMRecord> getIterator(final SAMFileSpan chunks) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!(chunks instanceof BAMFileSpan)) { throw new IllegalStateException("BAMFileReader cannot handle this type of file span."); } // Create an iterator over the given chunk boundaries. mCurrentIterator = new BAMFileIndexIterator(((BAMFileSpan)chunks).toCoordinateArray()); return mCurrentIterator; } /** * Gets an unbounded pointer to the first record in the BAM file. Because the reader doesn't necessarily know * when the file ends, the rightmost bound of the file pointer will not end exactly where the file ends. However, * the rightmost bound is guaranteed to be after the last read in the file. * @return An unbounded pointer to the first record in the BAM file. */ @Override public SAMFileSpan getFilePointerSpanningReads() { return new BAMFileSpan(new Chunk(mFirstRecordPointer,Long.MAX_VALUE)); } /** * Prepare to iterate through the SAMRecords that match the given interval. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate * matches the specified interval. * * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. * * @param sequence Reference sequence sought. * @param start Desired SAMRecords must overlap or be contained in the interval specified by start and end. * A value of zero implies the start of the reference sequence. * @param end A value of zero implies the end of the reference sequence. * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval * specified by start and end. If false, the SAMRecords need only overlap the interval. * @return Iterator for the matching SAMRecords */ CloseableIterator<SAMRecord> query(final String sequence, final int start, final int end, final boolean contained) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } final int referenceIndex = mFileHeader.getSequenceIndex(sequence); if (referenceIndex == -1) { mCurrentIterator = new EmptyBamIterator(); } else { final QueryInterval[] queryIntervals = {new QueryInterval(referenceIndex, start, end)}; mCurrentIterator = createIndexIterator(queryIntervals, contained); } return mCurrentIterator; } /** * Prepare to iterate through the SAMRecords that match any of the given intervals. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate * matches the specified interval. * * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. * * @param intervals list of intervals to be queried. Must be optimized. * @param contained If true, the alignments for the SAMRecords must be completely contained in the interval * specified by start and end. If false, the SAMRecords need only overlap the interval. * @return Iterator for the matching SAMRecords * @see QueryInterval#optimizeIntervals(QueryInterval[]) */ public CloseableIterator<SAMRecord> query(final QueryInterval[] intervals, final boolean contained) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } mCurrentIterator = createIndexIterator(intervals, contained); return mCurrentIterator; } /** * Prepare to iterate through the SAMRecords with the given alignment start. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * Note that an unmapped SAMRecord may still have a reference name and an alignment start for sorting * purposes (typically this is the coordinate of its mate), and will be found by this method if the coordinate * matches the specified interval. * * Note that this method is not necessarily efficient in terms of disk I/O. The index does not have perfect * resolution, so some SAMRecords may be read and then discarded because they do not match the specified interval. * * @param sequence Reference sequence sought. * @param start Alignment start sought. * @return Iterator for the matching SAMRecords. */ public CloseableIterator<SAMRecord> queryAlignmentStart(final String sequence, final int start) { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } final int referenceIndex = mFileHeader.getSequenceIndex(sequence); if (referenceIndex == -1) { mCurrentIterator = new EmptyBamIterator(); } else { mCurrentIterator = createStartingAtIndexIterator(referenceIndex, start); } return mCurrentIterator; } /** * Prepare to iterate through the SAMRecords that are unmapped and do not have a reference name or alignment start. * Only a single iterator on a BAMFile can be extant at a time. The previous one must be closed * before calling any of the methods that return an iterator. * * @return Iterator for the matching SAMRecords. */ public CloseableIterator<SAMRecord> queryUnmapped() { if (mStream == null) { throw new IllegalStateException("File reader is closed"); } if (mCurrentIterator != null) { throw new IllegalStateException("Iteration in progress"); } if (!mIsSeekable) { throw new UnsupportedOperationException("Cannot query stream-based BAM file"); } try { final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); if (startOfLastLinearBin != -1) { mCompressedInputStream.seek(startOfLastLinearBin); } else { // No mapped reads in file, just start at the first read in file. mCompressedInputStream.seek(mFirstRecordPointer); } mCurrentIterator = new BAMFileIndexUnmappedIterator(); return mCurrentIterator; } catch (final IOException e) { throw new RuntimeException("IOException seeking to unmapped reads", e); } } /** * Reads the header of a BAM file from a stream * @param stream A BinaryCodec to read the header from * @param validationStringency Determines how stringent to be when validating the sam * @param source Note that this is used only for reporting errors. */ protected static SAMFileHeader readHeader(final BinaryCodec stream, final ValidationStringency validationStringency, final String source) throws IOException { final byte[] buffer = new byte[4]; stream.readBytes(buffer); if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { throw new IOException("Invalid BAM file header"); } final int headerTextLength = stream.readInt(); final String textHeader = stream.readString(headerTextLength); final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); headerCodec.setValidationStringency(validationStringency); final SAMFileHeader samFileHeader = headerCodec.decode(new StringLineReader(textHeader), source); final int sequenceCount = stream.readInt(); if (samFileHeader.getSequenceDictionary().size() > 0) { // It is allowed to have binary sequences but no text sequences, so only validate if both are present if (sequenceCount != samFileHeader.getSequenceDictionary().size()) { throw new SAMFormatException("Number of sequences in text header (" + samFileHeader.getSequenceDictionary().size() + ") != number of sequences in binary header (" + sequenceCount + ") for file " + source); } for (int i = 0; i < sequenceCount; i++) { final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(stream, source); final SAMSequenceRecord sequenceRecord = samFileHeader.getSequence(i); if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + source); } if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + source); } } } else { // If only binary sequences are present, copy them into samFileHeader final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount); for (int i = 0; i < sequenceCount; i++) { sequences.add(readSequenceRecord(stream, source)); } samFileHeader.setSequenceDictionary(new SAMSequenceDictionary(sequences)); } return samFileHeader; } /** * Reads a single binary sequence record from the file or stream * @param source Note that this is used only for reporting errors. */ private static SAMSequenceRecord readSequenceRecord(final BinaryCodec stream, final String source) { final int nameLength = stream.readInt(); if (nameLength <= 1) { throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + source); } final String sequenceName = stream.readString(nameLength - 1); // Skip the null terminator stream.readByte(); final int sequenceLength = stream.readInt(); return new SAMSequenceRecord(SAMSequenceRecord.truncateSequenceName(sequenceName), sequenceLength); } /** * Encapsulates the restriction that only one iterator may be open at a time. */ private abstract class AbstractBamIterator implements CloseableIterator<SAMRecord> { private boolean isClosed = false; public void close() { if (!isClosed) { if (mCurrentIterator != null && this != mCurrentIterator) { throw new IllegalStateException("Attempt to close non-current iterator"); } mCurrentIterator = null; isClosed = true; } } protected void assertOpen() { if (isClosed) throw new AssertionError("Iterator has been closed"); } public void remove() { throw new UnsupportedOperationException("Not supported: remove"); } } private class EmptyBamIterator extends AbstractBamIterator { @Override public boolean hasNext() { return false; } @Override public SAMRecord next() { throw new NoSuchElementException("next called on empty iterator"); } } /** /** * Iterator for non-indexed sequential iteration through all SAMRecords in file. * Starting point of iteration is wherever current file position is when the iterator is constructed. */ private class BAMFileIterator extends AbstractBamIterator { private SAMRecord mNextRecord = null; private final BAMRecordCodec bamRecordCodec; private long samRecordIndex = 0; // Records at what position (counted in records) we are at in the file BAMFileIterator() { this(true); } /** * @param advance Trick to enable subclass to do more setup before advancing */ BAMFileIterator(final boolean advance) { this.bamRecordCodec = new BAMRecordCodec(getFileHeader(), samRecordFactory); this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream(), BAMFileReader.this.mStream.getInputFileName()); if (advance) { advance(); } } public boolean hasNext() { assertOpen(); return (mNextRecord != null); } public SAMRecord next() { assertOpen(); final SAMRecord result = mNextRecord; advance(); return result; } void advance() { try { mNextRecord = getNextRecord(); if (mNextRecord != null) { ++this.samRecordIndex; // Because some decoding is done lazily, the record needs to remember the validation stringency. mNextRecord.setValidationStringency(mValidationStringency); if (mValidationStringency != ValidationStringency.SILENT) { final List<SAMValidationError> validationErrors = mNextRecord.isValid(mValidationStringency == ValidationStringency.STRICT); SAMUtils.processValidationErrors(validationErrors, this.samRecordIndex, BAMFileReader.this.getValidationStringency()); } } if (eagerDecode && mNextRecord != null) { mNextRecord.eagerDecode(); } } catch (final IOException exc) { throw new RuntimeException(exc.getMessage(), exc); } } /** * Read the next record from the input stream. */ SAMRecord getNextRecord() throws IOException { final long startCoordinate = mCompressedInputStream.getFilePointer(); final SAMRecord next = bamRecordCodec.decode(); final long stopCoordinate = mCompressedInputStream.getFilePointer(); if(mReader != null && next != null) next.setFileSource(new SAMFileSource(mReader,new BAMFileSpan(new Chunk(startCoordinate,stopCoordinate)))); return next; } /** * @return The record that will be return by the next call to next() */ protected SAMRecord peek() { return mNextRecord; } } /** * Prepare to iterate through SAMRecords in the given reference that start exactly at the given start coordinate. * @param referenceIndex Desired reference sequence. * @param start 1-based alignment start. */ private CloseableIterator<SAMRecord> createStartingAtIndexIterator(final int referenceIndex, final int start) { // Hit the index to determine the chunk boundaries for the required data. final BAMIndex fileIndex = getIndex(); final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping(referenceIndex, start, 0); final long[] filePointers = fileSpan != null ? fileSpan.toCoordinateArray() : null; // Create an iterator over the above chunk boundaries. final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); // Add some preprocessing filters for edge-case reads that don't fit into this // query type. return new BAMQueryFilteringIterator(iterator,new BAMStartingAtIteratorFilter(referenceIndex,start)); } /** * @throws java.lang.IllegalArgumentException if the intervals are not optimized * @see QueryInterval#optimizeIntervals(QueryInterval[]) */ private void assertIntervalsOptimized(final QueryInterval[] intervals) { if (intervals.length == 0) return; for (int i = 1; i < intervals.length; ++i) { final QueryInterval prev = intervals[i-1]; final QueryInterval thisInterval = intervals[i]; if (prev.compareTo(thisInterval) >= 0) { throw new IllegalArgumentException(String.format("List of intervals is not sorted: %s >= %s", prev, thisInterval)); } if (prev.overlaps(thisInterval)) { throw new IllegalArgumentException(String.format("List of intervals is not optimized: %s intersects %s", prev, thisInterval)); } if (prev.abuts(thisInterval)) { throw new IllegalArgumentException(String.format("List of intervals is not optimized: %s abuts %s", prev, thisInterval)); } } } private CloseableIterator<SAMRecord> createIndexIterator(final QueryInterval[] intervals, final boolean contained) { assertIntervalsOptimized(intervals); // Hit the index to determine the chunk boundaries for the required data. final BAMFileSpan[] inputSpans = new BAMFileSpan[intervals.length]; final BAMIndex fileIndex = getIndex(); for (int i = 0; i < intervals.length; ++i) { final QueryInterval interval = intervals[i]; final BAMFileSpan span = fileIndex.getSpanOverlapping(interval.referenceIndex, interval.start, interval.end); inputSpans[i] = span; } final long[] filePointers; if (inputSpans.length > 0) { filePointers = BAMFileSpan.merge(inputSpans).toCoordinateArray(); } else { filePointers = null; } // Create an iterator over the above chunk boundaries. final BAMFileIndexIterator iterator = new BAMFileIndexIterator(filePointers); // Add some preprocessing filters for edge-case reads that don't fit into this // query type. return new BAMQueryFilteringIterator(iterator, new BAMQueryMultipleIntervalsIteratorFilter(intervals, contained)); } /** * Iterate over the SAMRecords defined by the sections of the file described in the ctor argument. */ private class BAMFileIndexIterator extends BAMFileIterator { private long[] mFilePointers = null; private int mFilePointerIndex = 0; private long mFilePointerLimit = -1; /** * Prepare to iterate through SAMRecords stored in the specified compressed blocks at the given offset. * @param filePointers the block / offset combination, stored in chunk format. */ BAMFileIndexIterator(final long[] filePointers) { super(false); // delay advance() until after construction mFilePointers = filePointers; advance(); } SAMRecord getNextRecord() throws IOException { // Advance to next file block if necessary while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) { if (mFilePointers == null || mFilePointerIndex >= mFilePointers.length) { return null; } final long startOffset = mFilePointers[mFilePointerIndex++]; final long endOffset = mFilePointers[mFilePointerIndex++]; mCompressedInputStream.seek(startOffset); mFilePointerLimit = endOffset; } // Pull next record from stream return super.getNextRecord(); } } /** * Pull SAMRecords from a coordinate-sorted iterator, and filter out any that do not match the filter. */ public class BAMQueryFilteringIterator extends AbstractBamIterator { /** * The wrapped iterator. */ protected final CloseableIterator<SAMRecord> wrappedIterator; /** * The next record to be returned. Will be null if no such record exists. */ protected SAMRecord mNextRecord; private final BAMIteratorFilter iteratorFilter; public BAMQueryFilteringIterator(final CloseableIterator<SAMRecord> iterator, final BAMIteratorFilter iteratorFilter) { this.wrappedIterator = iterator; this.iteratorFilter = iteratorFilter; mNextRecord = advance(); } /** * Returns true if a next element exists; false otherwise. */ public boolean hasNext() { assertOpen(); return mNextRecord != null; } /** * Gets the next record from the given iterator. * @return The next SAM record in the iterator. */ public SAMRecord next() { if(!hasNext()) throw new NoSuchElementException("BAMQueryFilteringIterator: no next element available"); final SAMRecord currentRead = mNextRecord; mNextRecord = advance(); return currentRead; } SAMRecord advance() { while (true) { // Pull next record from stream if(!wrappedIterator.hasNext()) return null; final SAMRecord record = wrappedIterator.next(); switch (iteratorFilter.compareToFilter(record)) { case MATCHES_FILTER: return record; case STOP_ITERATION: return null; case CONTINUE_ITERATION: break; // keep looping default: throw new SAMException("Unexpected return from compareToFilter"); } } } } interface BAMIteratorFilter { /** * Determine if given record passes the filter, and if it does not, whether iteration should continue * or if this record is beyond the region(s) of interest. */ FilteringIteratorState compareToFilter(final SAMRecord record); } /** * A decorating iterator that filters out records that do not match the given reference and start position. */ private class BAMStartingAtIteratorFilter implements BAMIteratorFilter { private final int mReferenceIndex; private final int mRegionStart; public BAMStartingAtIteratorFilter(final int referenceIndex, final int start) { mReferenceIndex = referenceIndex; mRegionStart = start; } /** * * @return MATCHES_FILTER if this record matches the filter; * CONTINUE_ITERATION if does not match filter but iteration should continue; * STOP_ITERATION if does not match filter and iteration should end. */ @Override public FilteringIteratorState compareToFilter(final SAMRecord record) { // If beyond the end of this reference sequence, end iteration final int referenceIndex = record.getReferenceIndex(); if (referenceIndex < 0 || referenceIndex > mReferenceIndex) { return FilteringIteratorState.STOP_ITERATION; } else if (referenceIndex < mReferenceIndex) { // If before this reference sequence, continue return FilteringIteratorState.CONTINUE_ITERATION; } final int alignmentStart = record.getAlignmentStart(); if (alignmentStart > mRegionStart) { // If scanned beyond target region, end iteration return FilteringIteratorState.STOP_ITERATION; } else if (alignmentStart == mRegionStart) { return FilteringIteratorState.MATCHES_FILTER; } else { return FilteringIteratorState.CONTINUE_ITERATION; } } } private class BAMFileIndexUnmappedIterator extends BAMFileIterator { private BAMFileIndexUnmappedIterator() { while (this.hasNext() && peek().getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { advance(); } } } /** * Filters out records that do not match any of the given intervals and query type. */ private class BAMQueryMultipleIntervalsIteratorFilter implements BAMIteratorFilter { final QueryInterval[] intervals; final boolean contained; int intervalIndex = 0; public BAMQueryMultipleIntervalsIteratorFilter(final QueryInterval[] intervals, final boolean contained) { this.contained = contained; this.intervals = intervals; } @Override public FilteringIteratorState compareToFilter(final SAMRecord record) { while (intervalIndex < intervals.length) { final IntervalComparison comparison = compareIntervalToRecord(intervals[intervalIndex], record); switch (comparison) { // Interval is before SAMRecord. Try next interval; case BEFORE: ++intervalIndex; break; // Interval is after SAMRecord. Keep scanning forward in SAMRecords case AFTER: return FilteringIteratorState.CONTINUE_ITERATION; // Found a good record case CONTAINED: return FilteringIteratorState.MATCHES_FILTER; // Either found a good record, or else keep scanning SAMRecords case OVERLAPPING: return (contained ? FilteringIteratorState.CONTINUE_ITERATION : FilteringIteratorState.MATCHES_FILTER); } } // Went past the last interval return FilteringIteratorState.STOP_ITERATION; } private IntervalComparison compareIntervalToRecord(final QueryInterval interval, final SAMRecord record) { // interval.end <= 0 implies the end of the reference sequence. final int intervalEnd = (interval.end <= 0? Integer.MAX_VALUE: interval.end); final int alignmentEnd; if (record.getReadUnmappedFlag() && record.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { // Unmapped read with coordinate of mate. alignmentEnd = record.getAlignmentStart(); } else { alignmentEnd = record.getAlignmentEnd(); } if (interval.referenceIndex < record.getReferenceIndex()) return IntervalComparison.BEFORE; else if (interval.referenceIndex > record.getReferenceIndex()) return IntervalComparison.AFTER; else if (intervalEnd < record.getAlignmentStart()) return IntervalComparison.BEFORE; else if (alignmentEnd < interval.start) return IntervalComparison.AFTER; else if (CoordMath.encloses(interval.start, intervalEnd, record.getAlignmentStart(), alignmentEnd)) { return IntervalComparison.CONTAINED; } else return IntervalComparison.OVERLAPPING; } } private enum IntervalComparison { BEFORE, AFTER, OVERLAPPING, CONTAINED } /** * Type returned by BAMIteratorFilter that tell BAMQueryFilteringIterator how to handle each SAMRecord. */ private enum FilteringIteratorState { MATCHES_FILTER, STOP_ITERATION, CONTINUE_ITERATION } }