package org.broad.igv.sam.lite; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.CloseableIterator; import htsjdk.tribble.util.LittleEndianInputStream; import org.apache.log4j.Logger; import org.broad.igv.feature.Strand; import org.broad.igv.feature.genome.Genome; import org.broad.igv.sam.*; import org.broad.igv.sam.ReadMate; import org.broad.igv.sam.reader.AlignmentReader; import org.broad.igv.util.ParsingUtils; import org.broad.igv.util.stream.IGVSeekableStreamFactory; import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.*; import java.util.zip.DataFormatException; /** * Created by jrobinso on 3/9/17. */ public class BAMReader implements AlignmentReader<Alignment> { private static Logger log = Logger.getLogger(BAMReader.class); private final String path; private final String indexPath; int BAM_MAGIC = 21840194; int BAI_MAGIC = 21578050; char[] SECRET_DECODER = {'=', 'A', 'C', 'x', 'G', 'x', 'x', 'x', 'T', 'x', 'x', 'x', 'x', 'x', 'x', 'N'}; char[] CIGAR_DECODER = {'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', '?', '?', '?', '?', '?', '?', '?'}; int PAIRED_FLAG = 0x1; int READ_STRAND_FLAG = 0x10; int MATE_STRAND_FLAG = 0x20; int MATE_IS_MAPPED_FLAG = 0x8; int FIRST_OF_PAIR_FLAG = 0x40; int SECOND_OF_PAIR_FLAG = 0x80; int NOT_PRIMARY_ALIGNMENT_FLAG = 0x100; int READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200; int DUPLICATE_READ_FLAG = 0x400; int SUPPLEMENTARY_FLAG = 0x800; int MAX_GZIP_BLOCK_SIZE = 65536; // APPARENTLY. Where is this documented??? int DEFAULT_SAMPLING_WINDOW_SIZE = 100; int DEFAULT_SAMPLING_DEPTH = 50; int MAXIMUM_SAMPLING_DEPTH = 2500; BAMIndex bamIndex = null; Genome genome; Map<String, Integer> chrToIndex; private String[] indexToChr; public BAMReader(String path) { this.path = path; this.indexPath = path + ".bai"; // this.config = config; // // this.filter = config.filter || new igv.BamFilter(); // // this.bamPath = config.url; // // Todo - deal with Picard convention. WHY DOES THERE HAVE TO BE 2? // this.baiPath = config.indexURL || this.bamPath + ".bai"; // If there is an indexURL provided, use it! // this.headPath = config.headURL || this.bamPath; // // // this.samplingWindowSize = config.samplingWindowSize == = undefined ? DEFAULT_SAMPLING_WINDOW_SIZE : config.samplingWindowSize; // this.samplingDepth = config.samplingDepth == = undefined ? DEFAULT_SAMPLING_DEPTH : config.samplingDepth; // if (this.samplingDepth > MAXIMUM_SAMPLING_DEPTH) { // igv.log("Warning: attempt to set sampling depth > maximum value of 2500"); // this.samplingDepth = MAXIMUM_SAMPLING_DEPTH; // } // // if (config.viewAsPairs) { // this.pairsSupported = true; // } else { // this.pairsSupported = config.pairsSupported == = undefined ? true : config.pairsSupported; // } try { bamIndex = BAMIndex.loadIndex(this.indexPath, null); readHeader(); } catch (IOException e) { e.printStackTrace(); } } @Override public void close() throws IOException { } @Override public List<String> getSequenceNames() throws IOException { return Arrays.asList(indexToChr); } @Override public SAMFileHeader getFileHeader() { return null; } @Override public Set<String> getPlatforms() { return null; } @Override public CloseableIterator<Alignment> iterator() { return null; } @Override public boolean hasIndex() { return bamIndex != null; } @Override public CloseableIterator<Alignment> query(String chr, int start, int end, boolean contained) throws IOException { return new CIterator(chr, start, end); } class CIterator implements CloseableIterator<Alignment> { String chr; Integer chrId; int start; int end; Iterator<BAMIndex.Chunk> chunks; Iterator<Alignment> currentChunkAlignments; Alignment nextAlignment; public CIterator(String chr, int start, int end) { this.chr = chr; this.start = start; this.end = end; init(); } private void init() { Map<String, Integer> chrToIndex = BAMReader.this.chrToIndex; chrId = chrToIndex.get(chr); if (chrId == null) { // TODO } else { chunks = bamIndex.chunksForRange(chrId, start, end).iterator(); } advance(); } @Override public void close() { } @Override public boolean hasNext() { return nextAlignment != null; } @Override public Alignment next() { Alignment tmp = nextAlignment; advance(); return tmp; } void advance() { nextAlignment = null; try { while (nextAlignment == null) { if (currentChunkAlignments != null && currentChunkAlignments.hasNext()) { nextAlignment = currentChunkAlignments.next(); } else { if (chunks.hasNext()) { BAMIndex.Chunk c = chunks.next(); currentChunkAlignments = readAlignments(c, chrId, start, end).iterator(); } else { break; } } } } catch (IOException e) { e.printStackTrace(); nextAlignment = null; } } } public List<Alignment> readAlignments(BAMIndex.Chunk c, int chrId, int start, int end) throws IOException { List<Alignment> alignmentContainer = new ArrayList<>(10000); long fetchMin = c.start.block; long fetchMax = c.end.block + 65000; // Make sure we get the whole block. SeekableStream ss = IGVSeekableStreamFactory.getInstance().getStreamFor(this.path); ss.seek(fetchMin); byte[] buffer = new byte[(int) (fetchMax - fetchMin + 1)]; try { ss.readFully(buffer); } catch (EOFException e) { // Can happen with small files } byte[] unc = BGUnzip.blockUnzip(buffer); decodeBamRecords(unc, c.start.offset, alignmentContainer, start, end, chrId); //, self.filter); return alignmentContainer; } public List<Alignment> readAlignments(String chr, int bpStart, int bpEnd) throws IOException { List<Alignment> alignmentContainer = new ArrayList<>(10000); if (chrToIndex == null) { readHeader(); } Map<String, Integer> chrToIndex = this.chrToIndex; Integer chrId = chrToIndex.get(chr); if (chrId == null) { return alignmentContainer; } else { List<BAMIndex.Chunk> chunks = bamIndex.chunksForRange(chrId, bpStart, bpEnd); if (chunks == null || chunks.isEmpty()) { return alignmentContainer; } for (BAMIndex.Chunk c : chunks) { long fetchMin = c.start.block; long fetchMax = c.end.block + 65000; // Make sure we get the whole block. SeekableStream ss = IGVSeekableStreamFactory.getInstance().getStreamFor(this.path); ss.seek(fetchMin); byte[] buffer = new byte[(int) (fetchMax - fetchMin + 1)]; try { ss.readFully(buffer); } catch (EOFException e) { // Can happen with small files } byte[] unc = BGUnzip.blockUnzip(buffer); decodeBamRecords(unc, c.start.offset, alignmentContainer, bpStart, bpEnd, chrId); //, self.filter); } return alignmentContainer; } } void decodeBamRecords(byte[] ba, int offset, List<Alignment> alignmentContainer, int min, int max, int chrId) { //, filter){ while (true) { if (offset >= ba.length) { return; } int blockSize = readInt(ba, offset); int blockEnd = offset + blockSize + 4; if (blockEnd > ba.length) { return; } int refID = readInt(ba, offset + 4); int pos = readInt(ba, offset + 8); if (refID < 0) { return; // unmapped reads } else if (refID > chrId || pos > max) { return; // off right edge, we're done } else if (refID < chrId) { continue; // to left of start, not sure this is possible } int bmn = readInt(ba, offset + 12); int bin = (bmn & 0xffff0000) >> 16; int mq = (bmn & 0xff00) >> 8; int nl = bmn & 0xff; int flag_nc = readInt(ba, offset + 16); int flag = (flag_nc & 0xffff0000) >> 16; int nc = flag_nc & 0xffff; int lseq = readInt(ba, offset + 20); int mateRefID = readInt(ba, offset + 24); int matePos = readInt(ba, offset + 28); byte[] readNameBytes = Arrays.copyOfRange(ba, offset + 36, offset + 36 + nl); String readName = new String(readNameBytes); int p = offset + 36 + nl; byte[] cigarBytes = Arrays.copyOfRange(ba, p, p + 4 * nc); p += 4 * nc; CigarOperator[] cigarArray = new CigarOperator[nc]; int lengthOnRef = 0; for (int c = 0; c < nc; ++c) { int cigop = readInt(cigarBytes, c); int opLen = (cigop >> 4); char opLtr = CIGAR_DECODER[cigop & 0xf]; if (opLtr == 'M' || opLtr == 'X' || opLtr == 'D' || opLtr == 'N' || opLtr == '=') lengthOnRef += opLen; cigarArray[c] = new CigarOperator(opLen, opLtr); } if (pos + lengthOnRef < min) { offset = blockEnd; continue; // Record out-of-range "to the left", skip to next one } int seqSize = (lseq + 1) >> 1; byte[] seqBytes = Arrays.copyOfRange(ba, p, p + seqSize); byte[] sequence = new byte[lseq]; for (int j = 0; j < seqSize; ++j) { byte sb = seqBytes[j]; sequence[2 * j] = (byte) (SECRET_DECODER[(sb & 0xf0) >> 4]); if ((2 * j + 1) < lseq) sequence[2 * j + 1] += SECRET_DECODER[(sb & 0x0f)]; } p += seqSize; byte[] qualities; if (lseq == 1 && sequence[0] == '*') { qualities = new byte[]{Byte.MAX_VALUE}; // TODO == how to represent this? } else { qualities = Arrays.copyOfRange(ba, p, p + lseq); } p += lseq; ReadMate mate = null; boolean isPaired = (flag & PAIRED_FLAG) != 0; if (isPaired) { boolean mateIsMapped = (flag & MATE_IS_MAPPED_FLAG) != 0; String mateChr = mateRefID > 0 ? this.indexToChr[mateRefID] : ""; boolean mateIsNegativeStrand = ((flag & MATE_STRAND_FLAG) != 0); mate = new ReadMate(mateChr, matePos, mateIsNegativeStrand, mateIsMapped); } byte[] tagBytes = Arrays.copyOfRange(ba, p, blockEnd); if (pos + lengthOnRef >= min && pos <= max) { // && pass filter BAMAlignment alignment = new BAMAlignment(); alignment.start = pos; alignment.flags = flag; alignment.fragmentLength = readInt(ba, offset + 32); alignment.cigarBytes = cigarBytes; alignment.lengthOnRef = lengthOnRef; alignment.sequence = sequence; alignment.mq = mq; alignment.readName = readName; alignment.chr = this.indexToChr[refID]; alignment.qualities = qualities; alignment.mate = mate; alignment.tagBytes = tagBytes; // Decode these on demand makeBlocks(cigarArray, alignment); alignmentContainer.add(alignment); } offset = blockEnd; } } void readHeader() throws IOException { int len = (int) bamIndex.firstAlignmentBlock + MAX_GZIP_BLOCK_SIZE; // Insure we get the complete compressed block containing the header //LittleEndianInputStream parser = new LittleEndianInputStream(new BufferedInputStream(ParsingUtils.openInputStream(indexURL))); SeekableStream ss = IGVSeekableStreamFactory.getInstance().getStreamFor(this.path); byte[] buffer = new byte[len]; try { ss.readFully(buffer); } catch (EOFException e) { // This can happen with small files } byte[] uncba = BGUnzip.blockUnzip(buffer); int magic = readInt(uncba, 0); int samHeaderLen = readInt(uncba, 4); String samHeader = new String(uncba, 8, samHeaderLen); int nRef = readInt(uncba, samHeaderLen + 8); int p = samHeaderLen + 12; Map<String, Integer> chrToIndex = new HashMap<>(); String[] indexToChr = new String[nRef]; for (int i = 0; i < nRef; ++i) { int lName = readInt(uncba, p); String name = new String(uncba, p + 4, lName - 1); int lRef = readInt(uncba, p + lName + 4); if (genome != null) { name = genome.getCanonicalChrName(name); } chrToIndex.put(name, i); indexToChr[i] = name; p = p + 8 + lName; } this.chrToIndex = chrToIndex; this.indexToChr = indexToChr; } /** * Split the alignment record into blocks as specified in the cigarArray. Each aligned block contains * its portion of the read sequence and base quality strings. A read sequence or base quality string * of "*" indicates the value is not recorded. In all other cases the length of the block sequence (block.seq) * and quality string (block.qual) must == the block length. * <p> * NOTE: Insertions are not yet treated // TODO * * @returns array of blocks */ void makeBlocks(CigarOperator[] cigarArray, BAMAlignment alignment) { List<AlignmentBlock> blocks = new ArrayList<>(); List<AlignmentBlock> insertions = null; int seqOffset = 0; int pos = alignment.start; int len = cigarArray.length; byte[] blockSeq; char gapType; byte[] blockQuals; // gapType, // minQ = 5, //prefs.getAsInt(PreferenceManager.SAM_BASE_QUALITY_MIN) // maxQ = 20; //prefs.getAsInt(PreferenceManager.SAM_BASE_QUALITY_MAX) for (int i = 0; i < len; i++) { CigarOperator c = cigarArray[i]; switch (c.letter) { case 'H': break; // ignore hard clips case 'P': break; // ignore pads case 'S': seqOffset += c.length; gapType = 'S'; break; // soft clip read bases case 'N': pos += c.length; gapType = 'N'; break; // reference skip case 'D': pos += c.length; gapType = 'D'; break; case 'I': blockSeq = alignment.sequence.length == 1 ? alignment.sequence : Arrays.copyOfRange(alignment.sequence, seqOffset, seqOffset + c.length); blockQuals = alignment.qualities == null ? null : Arrays.copyOfRange(alignment.qualities, seqOffset, seqOffset + c.length); if (insertions == null) { insertions = new ArrayList<>(); } insertions.add(new AlignmentBlockImpl(pos, blockSeq, blockQuals)); seqOffset += c.length; break; case 'M': case '=': case 'X': blockSeq = alignment.sequence.length == 1 ? alignment.sequence : Arrays.copyOfRange(alignment.sequence, seqOffset, seqOffset + c.length); blockQuals = alignment.qualities == null ? null : Arrays.copyOfRange(alignment.qualities, seqOffset, seqOffset + c.length); blocks.add(new AlignmentBlockImpl(pos, blockSeq, blockQuals)); seqOffset += c.length; pos += c.length; break; default: log.error("Error processing cigar element: " + c.length + c.letter); } } alignment.alignmentBlocks = blocks.toArray(new AlignmentBlockImpl[]{}); if (insertions != null) { alignment.insertions = insertions.toArray(new AlignmentBlockImpl[]{}); } } public String readString(ByteBuffer ba) throws IOException { ByteArrayOutputStream bis = new ByteArrayOutputStream(100); byte b; while ((b = ba.get()) != 0) { if (b < 0) { throw new EOFException(); } bis.write(b); } return new String(bis.toByteArray()); } public static int readInt(byte[] ba, int offset) { return (ba[offset + 3] << 24) + (ba[offset + 2] << 24 >>> 8) + (ba[offset + 1] << 24 >>> 16) + (ba[offset] << 24 >>> 24); } }