/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.RuntimeIOException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collections; import java.util.List; /** * Provides basic, generic capabilities to be used reading BAM index files. Users can * subclass this class to create new BAM index functionality for adding querying facilities, * changing caching behavior, etc. * * Of particular note: the AbstractBAMFileIndex is, by design, the only class aware of the * details of the BAM index file format (other than the four classes representing the data, * BAMIndexContent, Bin, Chunk, LinearIndex, and the classes for building the BAM index). * Anyone wanting to implement a reader for a differing * or extended BAM index format should implement BAMIndex directly. */ public abstract class AbstractBAMFileIndex implements BAMIndex { private final IndexFileBuffer mIndexBuffer; private SAMSequenceDictionary mBamDictionary = null; final int [] sequenceIndexes; protected AbstractBAMFileIndex( final SeekableStream stream, final SAMSequenceDictionary dictionary) { mBamDictionary = dictionary; mIndexBuffer = new IndexStreamBuffer(stream); seek(4); sequenceIndexes = new int[readInteger() + 1]; Arrays.fill(sequenceIndexes, -1); } protected AbstractBAMFileIndex(final File file, final SAMSequenceDictionary dictionary) { this(file, dictionary, true); } protected AbstractBAMFileIndex(final File file, final SAMSequenceDictionary dictionary, final boolean useMemoryMapping) { mBamDictionary = dictionary; mIndexBuffer = (useMemoryMapping ? new MemoryMappedFileBuffer(file) : new RandomAccessFileBuffer(file)); // Verify the magic number. seek(0); final byte[] buffer = new byte[4]; readBytes(buffer); if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) { throw new RuntimeException("Invalid file header in BAM index " + file + ": " + new String(buffer)); } sequenceIndexes = new int[readInteger() + 1]; Arrays.fill(sequenceIndexes, -1); } /** * Close this index and release any associated resources. */ public void close() { mIndexBuffer.close(); } /** * Get the number of levels employed by this index. * @return Number of levels in this index. */ public static int getNumIndexLevels() { return GenomicIndexUtil.LEVEL_STARTS.length; } /** * Gets the first bin in the given level. * @param levelNumber Level number. 0-based. * @return The first bin in this level. */ public static int getFirstBinInLevel(final int levelNumber) { return GenomicIndexUtil.LEVEL_STARTS[levelNumber]; } /** * Gets the number of bins in the given level. * @param levelNumber Level number. 0-based. * @return The size (number of possible bins) of the given level. */ public int getLevelSize(final int levelNumber) { if(levelNumber == getNumIndexLevels()) return GenomicIndexUtil.MAX_BINS+1-GenomicIndexUtil.LEVEL_STARTS[levelNumber]; else return GenomicIndexUtil.LEVEL_STARTS[levelNumber+1]-GenomicIndexUtil.LEVEL_STARTS[levelNumber]; } /** * Gets the level associated with the given bin number. * @param bin The bin for which to determine the level. * @return the level associated with the given bin number. */ public int getLevelForBin(final Bin bin) { if(bin.getBinNumber() >= GenomicIndexUtil.MAX_BINS) throw new SAMException("Tried to get level for invalid bin."); for(int i = getNumIndexLevels()-1; i >= 0; i--) { if(bin.getBinNumber() >= GenomicIndexUtil.LEVEL_STARTS[i]) return i; } throw new SAMException("Unable to find correct bin for bin "+bin); } /** * Gets the first locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ public int getFirstLocusInBin(final Bin bin) { final int level = getLevelForBin(bin); final int levelStart = GenomicIndexUtil.LEVEL_STARTS[level]; final int levelSize = ((level==getNumIndexLevels()-1) ? GenomicIndexUtil.MAX_BINS-1 : GenomicIndexUtil.LEVEL_STARTS[level+1]) - levelStart; return (bin.getBinNumber() - levelStart)*(GenomicIndexUtil.BIN_GENOMIC_SPAN /levelSize)+1; } /** * Gets the last locus that this bin can index into. * @param bin The bin to test. * @return The last position that the given bin can represent. */ public int getLastLocusInBin(final Bin bin) { final int level = getLevelForBin(bin); final int levelStart = GenomicIndexUtil.LEVEL_STARTS[level]; final int levelSize = ((level==getNumIndexLevels()-1) ? GenomicIndexUtil.MAX_BINS-1 : GenomicIndexUtil.LEVEL_STARTS[level+1]) - levelStart; return (bin.getBinNumber()-levelStart+1)*(GenomicIndexUtil.BIN_GENOMIC_SPAN /levelSize); } public int getNumberOfReferences() { seek(4); return readInteger(); } /** * Use to get close to the unmapped reads at the end of a BAM file. * @return The file offset of the first record in the last linear bin, or -1 * if there are no elements in linear bins (i.e. no mapped reads). */ public long getStartOfLastLinearBin() { seek(4); final int sequenceCount = readInteger(); // Because no reads may align to the last sequence in the sequence dictionary, // grab the last element of the linear index for each sequence, and return // the last one from the last sequence that has one. long lastLinearIndexPointer = -1; for (int i = 0; i < sequenceCount; i++) { // System.out.println("# Sequence TID: " + i); final int nBins = readInteger(); // System.out.println("# nBins: " + nBins); for (int j1 = 0; j1 < nBins; j1++) { // Skip bin # skipBytes(4); final int nChunks = readInteger(); // Skip chunks skipBytes(16 * nChunks); } final int nLinearBins = readInteger(); if (nLinearBins > 0) { // Skip to last element of list of linear bins skipBytes(8 * (nLinearBins - 1)); lastLinearIndexPointer = readLong(); } } return lastLinearIndexPointer; } /** * Return meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records * * @param reference the reference of interest * @return meta data for the reference */ public BAMIndexMetaData getMetaData(final int reference) { seek(4); final List<Chunk> metaDataChunks = new ArrayList<Chunk>(); final int sequenceCount = readInteger(); if (reference >= sequenceCount) { return null; } skipToSequence(reference); final int binCount = readInteger(); for (int binNumber = 0; binNumber < binCount; binNumber++) { final int indexBin = readInteger(); final int nChunks = readInteger(); if (indexBin == GenomicIndexUtil.MAX_BINS) { for (int ci = 0; ci < nChunks; ci++) { final long chunkBegin = readLong(); final long chunkEnd = readLong(); metaDataChunks.add(new Chunk(chunkBegin, chunkEnd)); } } else { skipBytes(16 * nChunks); } } return new BAMIndexMetaData(metaDataChunks); } /** * Returns count of records unassociated with any reference. Call before the index file is closed * * @return meta data at the end of the bam index that indicates count of records holding no coordinates * or null if no meta data (old index format) */ public Long getNoCoordinateCount() { seek(4); final int sequenceCount = readInteger(); skipToSequence(sequenceCount); try { // in case of old index file without meta data return readLong(); } catch (final Exception e) { return null; } } protected BAMIndexContent query(final int referenceSequence, final int startPos, final int endPos) { seek(4); final List<Chunk> metaDataChunks = new ArrayList<Chunk>(); final int sequenceCount = readInteger(); if (referenceSequence >= sequenceCount) { return null; } final BitSet regionBins = GenomicIndexUtil.regionToBins(startPos, endPos); if (regionBins == null) { return null; } skipToSequence(referenceSequence); final int binCount = readInteger(); boolean metaDataSeen = false; final Bin[] bins = new Bin[getMaxBinNumberForReference(referenceSequence) +1]; for (int binNumber = 0; binNumber < binCount; binNumber++) { final int indexBin = readInteger(); final int nChunks = readInteger(); List<Chunk> chunks = null; // System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks); Chunk lastChunk = null; if (regionBins.get(indexBin)) { chunks = new ArrayList<Chunk>(nChunks); for (int ci = 0; ci < nChunks; ci++) { final long chunkBegin = readLong(); final long chunkEnd = readLong(); lastChunk = new Chunk(chunkBegin, chunkEnd); chunks.add(lastChunk); } } else if (indexBin == GenomicIndexUtil.MAX_BINS) { // meta data - build the bin so that the count of bins is correct; // but don't attach meta chunks to the bin, or normal queries will be off for (int ci = 0; ci < nChunks; ci++) { final long chunkBegin = readLong(); final long chunkEnd = readLong(); lastChunk = new Chunk(chunkBegin, chunkEnd); metaDataChunks.add(lastChunk); } metaDataSeen = true; continue; // don't create a Bin } else { skipBytes(16 * nChunks); chunks = Collections.emptyList(); } final Bin bin = new Bin(referenceSequence, indexBin); bin.setChunkList(chunks); bin.setLastChunk(lastChunk); bins[indexBin] = bin; } final int nLinearBins = readInteger(); final int regionLinearBinStart = LinearIndex.convertToLinearIndexOffset(startPos); final int regionLinearBinStop = endPos > 0 ? LinearIndex.convertToLinearIndexOffset(endPos) : nLinearBins-1; final int actualStop = Math.min(regionLinearBinStop, nLinearBins -1); long[] linearIndexEntries = new long[0]; if (regionLinearBinStart < nLinearBins) { linearIndexEntries = new long[actualStop-regionLinearBinStart+1]; skipBytes(8 * regionLinearBinStart); for(int linearBin = regionLinearBinStart; linearBin <= actualStop; linearBin++) linearIndexEntries[linearBin-regionLinearBinStart] = readLong(); } final LinearIndex linearIndex = new LinearIndex(referenceSequence,regionLinearBinStart,linearIndexEntries); return new BAMIndexContent(referenceSequence, bins, binCount - (metaDataSeen? 1 : 0), new BAMIndexMetaData(metaDataChunks), linearIndex); } /** * The maximum possible bin number for this reference sequence. * This is based on the maximum coordinate position of the reference * which is based on the size of the reference */ private int getMaxBinNumberForReference(final int reference) { try { final int sequenceLength = mBamDictionary.getSequence(reference).getSequenceLength(); return getMaxBinNumberForSequenceLength(sequenceLength); } catch (final Exception e) { return GenomicIndexUtil.MAX_BINS; } } /** * The maxiumum bin number for a reference sequence of a given length */ static int getMaxBinNumberForSequenceLength(final int sequenceLength) { return getFirstBinInLevel(getNumIndexLevels() - 1) + (sequenceLength >> 14); // return 4680 + (sequenceLength >> 14); // note 4680 = getFirstBinInLevel(getNumIndexLevels() - 1) } abstract protected BAMIndexContent getQueryResults(int reference); /** * Gets the possible number of bins for a given reference sequence. * @return How many bins could possibly be used according to this indexing scheme to index a single contig. */ protected int getMaxAddressibleGenomicLocation() { return GenomicIndexUtil.BIN_GENOMIC_SPAN; } /** * Get candidate bins for the specified region * @param startPos 1-based start of target region, inclusive. * @param endPos 1-based end of target region, inclusive. * @return bit set for each bin that may contain SAMRecords in the target region. */ protected BitSet regionToBins(final int startPos, final int endPos) { final int maxPos = 0x1FFFFFFF; final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; if (start > end) { return null; } int k; final BitSet bitSet = new BitSet(GenomicIndexUtil.MAX_BINS); bitSet.set(0); for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); return bitSet; } /** * @deprecated Invoke htsjdk.samtools.Chunk#optimizeChunkList(java.util.List<htsjdk.samtools.Chunk>, long) directly. */ protected List<Chunk> optimizeChunkList(final List<Chunk> chunks, final long minimumOffset) { return Chunk.optimizeChunkList(chunks, minimumOffset); } private void skipToSequence(final int sequenceIndex) { //Use sequence position cache if available if(sequenceIndexes[sequenceIndex] != -1){ seek(sequenceIndexes[sequenceIndex]); return; } for (int i = 0; i < sequenceIndex; i++) { // System.out.println("# Sequence TID: " + i); final int nBins = readInteger(); // System.out.println("# nBins: " + nBins); for (int j = 0; j < nBins; j++) { readInteger(); // bin final int nChunks = readInteger(); // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); skipBytes(16 * nChunks); } final int nLinearBins = readInteger(); // System.out.println("# nLinearBins: " + nLinearBins); skipBytes(8 * nLinearBins); } //Update sequence position cache sequenceIndexes[sequenceIndex] = position(); } private void readBytes(final byte[] bytes) { mIndexBuffer.readBytes(bytes); } private int readInteger() { return mIndexBuffer.readInteger(); } private long readLong() { return mIndexBuffer.readLong(); } private void skipBytes(final int count) { mIndexBuffer.skipBytes(count); } private void seek(final int position) { mIndexBuffer.seek(position); } private int position(){ return mIndexBuffer.position(); } private abstract static class IndexFileBuffer { abstract void readBytes(final byte[] bytes); abstract int readInteger(); abstract long readLong(); abstract void skipBytes(final int count); abstract void seek(final int position); abstract int position(); abstract void close(); } /** * Traditional implementation of BAM index file access using memory mapped files. */ private static class MemoryMappedFileBuffer extends IndexFileBuffer { private MappedByteBuffer mFileBuffer; MemoryMappedFileBuffer(final File file) { try { // Open the file stream. final FileInputStream fileStream = new FileInputStream(file); final FileChannel fileChannel = fileStream.getChannel(); mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size()); mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); fileChannel.close(); fileStream.close(); } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } } @Override void readBytes(final byte[] bytes) { mFileBuffer.get(bytes); } @Override int readInteger() { return mFileBuffer.getInt(); } @Override long readLong() { return mFileBuffer.getLong(); } @Override void skipBytes(final int count) { mFileBuffer.position(mFileBuffer.position() + count); } @Override void seek(final int position) { mFileBuffer.position(position); } @Override int position() { return mFileBuffer.position(); } @Override void close() { mFileBuffer = null; } } /** * Alternative implementation of BAM index file access using regular I/O instead of memory mapping. * * This implementation can be more scalable for certain applications that need to access large numbers of BAM files. * Java provides no way to explicitly release a memory mapping. Instead, you need to wait for the garbage collector * to finalize the MappedByteBuffer. Because of this, when accessing many BAM files or when querying many BAM files * sequentially, you cannot easily control the physical memory footprint of the java process. * This can limit scalability and can have bad interactions with load management software like LSF, forcing you * to reserve enough physical memory for a worst case scenario. * The use of regular I/O allows you to trade somewhat slower performance for a small, fixed memory footprint * if that is more suitable for your application. */ private static class RandomAccessFileBuffer extends IndexFileBuffer { private static final int PAGE_SIZE = 4 * 1024; private static final int PAGE_OFFSET_MASK = PAGE_SIZE-1; private static final int PAGE_MASK = ~PAGE_OFFSET_MASK; private static final int INVALID_PAGE = 1; private final File mFile; private RandomAccessFile mRandomAccessFile; private final int mFileLength; private int mFilePointer = 0; private int mCurrentPage = INVALID_PAGE; private final byte[] mBuffer = new byte[PAGE_SIZE]; RandomAccessFileBuffer(final File file) { mFile = file; try { mRandomAccessFile = new RandomAccessFile(file, "r"); final long fileLength = mRandomAccessFile.length(); if (fileLength > Integer.MAX_VALUE) { throw new RuntimeException("BAM index file " + mFile + " is too large: " + fileLength); } mFileLength = (int) fileLength; } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } } @Override void readBytes(final byte[] bytes) { int resultOffset = 0; int resultLength = bytes.length; if (mFilePointer + resultLength > mFileLength) { throw new RuntimeException("Attempt to read past end of BAM index file (file is truncated?): " + mFile); } while (resultLength > 0) { loadPage(mFilePointer); final int pageOffset = mFilePointer & PAGE_OFFSET_MASK; final int copyLength = Math.min(resultLength, PAGE_SIZE - pageOffset); System.arraycopy(mBuffer, pageOffset, bytes, resultOffset, copyLength); mFilePointer += copyLength; resultOffset += copyLength; resultLength -= copyLength; } } @Override int readInteger() { // This takes advantage of the fact that integers in BAM index files are always 4-byte aligned. loadPage(mFilePointer); final int pageOffset = mFilePointer & PAGE_OFFSET_MASK; mFilePointer += 4; return((mBuffer[pageOffset + 0] & 0xFF) | ((mBuffer[pageOffset + 1] & 0xFF) << 8) | ((mBuffer[pageOffset + 2] & 0xFF) << 16) | ((mBuffer[pageOffset + 3] & 0xFF) << 24)); } @Override long readLong() { // BAM index files are always 4-byte aligned, but not necessrily 8-byte aligned. // So, rather than fooling with complex page logic we simply read the long in two 4-byte chunks. final long lower = readInteger(); final long upper = readInteger(); return ((upper << 32) | (lower & 0xFFFFFFFFL)); } @Override void skipBytes(final int count) { mFilePointer += count; } @Override void seek(final int position) { mFilePointer = position; } @Override int position() { return mFilePointer; } @Override void close() { mFilePointer = 0; mCurrentPage = INVALID_PAGE; if (mRandomAccessFile != null) { try { mRandomAccessFile.close(); } catch (final IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } mRandomAccessFile = null; } } private void loadPage(final int filePosition) { final int page = filePosition & PAGE_MASK; if (page == mCurrentPage) { return; } try { mRandomAccessFile.seek(page); final int readLength = Math.min(mFileLength - page, PAGE_SIZE); mRandomAccessFile.readFully(mBuffer, 0, readLength); mCurrentPage = page; } catch (final IOException exc) { throw new RuntimeIOException("Exception reading BAM index file " + mFile + ": " + exc.getMessage(), exc); } } } static class IndexStreamBuffer extends IndexFileBuffer { private final SeekableStream in; private final ByteBuffer tmpBuf; /** Continually reads from the provided {@link SeekableStream} into the buffer until the specified number of bytes are read, or * until the stream is exhausted, throwing a {@link RuntimeIOException}. */ private static void readFully(final SeekableStream in, final byte[] buffer, final int offset, final int length) { int read = 0; while (read < length) { final int readThisLoop; try { readThisLoop = in.read(buffer, read, length - read); } catch (final IOException e) { throw new RuntimeIOException(e); } if (readThisLoop == -1) break; read += readThisLoop; } if (read != length) throw new RuntimeIOException("Expected to read " + length + " bytes, but expired stream after " + read + "."); } public IndexStreamBuffer(final SeekableStream s) { in = s; tmpBuf = ByteBuffer.allocate(8); // Enough to fit a long. tmpBuf.order(ByteOrder.LITTLE_ENDIAN); } @Override public void close() { try { in.close(); } catch (final IOException e) { throw new RuntimeIOException(e); } } @Override public void readBytes(final byte[] bytes) { readFully(in, bytes, 0, bytes.length); } @Override public void seek(final int position) { try { in.seek(position); } catch (final IOException e) { throw new RuntimeIOException(e); } } @Override public int readInteger() { readFully(in, tmpBuf.array(), 0, 4); return tmpBuf.getInt(0); } @Override public long readLong() { readFully(in, tmpBuf.array(), 0, 8); return tmpBuf.getLong(0); } @Override public void skipBytes(final int count) { try { for (int s = count; s > 0;) { final int skipped = (int)in.skip(s); if (skipped <= 0) throw new RuntimeIOException("Failed to skip " + s); s -= skipped; } } catch (final IOException e) { throw new RuntimeIOException(e); } } @Override public int position() { try { return (int) in.position(); } catch (final IOException e) { throw new RuntimeIOException(e); } } } }