package org.broad.igv.sam.lite;
import htsjdk.tribble.util.LittleEndianInputStream;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.util.ParsingUtils;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.*;
/**
* Created by jrobinso on 3/9/17.
*/
public class BAMIndex {
// Magic number of a .bai index: the bytes "BAI\1" read as a little-endian int (0x01494142).
static int BAI_MAGIC = 21578050;
// Magic number of a tabix index: the bytes "TBI\1" read as a little-endian int (0x01494254).
static int TABIX_MAGIC = 21578324;
// A BGZF virtual file pointer packs a 48-bit compressed-block address in the high bits
// and a 16-bit offset within the uncompressed block in the low bits.
private static final int SHIFT_AMOUNT = 16;
private static final int OFFSET_MASK = 0xffff;
private static final long ADDRESS_MASK = 0xFFFFFFFFFFFFL;
// Smallest compressed-block address referenced by any chunk (Long.MAX_VALUE if the index has no chunks).
long firstAlignmentBlock;
// Largest compressed-block address referenced by any chunk.
long lastAlignmentBlock;
// Per-reference (sequence dictionary index -> bin index + linear index).
Map<Integer, RefIndexPair> refIndexes;
Map<String, Integer> sequenceIndexMap; // For tabix use only
// True for tabix indexes; currently always false (tabix support not enabled).
boolean tabix;
/**
 * Build an index from already-parsed data.
 *
 * @param refIndexes       per-reference bin and linear indexes, keyed by sequence dictionary index
 * @param blockMin         smallest compressed-block address referenced by any chunk
 * @param blockMax         largest compressed-block address referenced by any chunk
 * @param sequenceIndexMap sequence name -> reference id; non-null only for tabix indexes
 */
public BAMIndex(Map<Integer, RefIndexPair> refIndexes, long blockMin, long blockMax, Map<String, Integer> sequenceIndexMap) {
    this.refIndexes = refIndexes;
    this.sequenceIndexMap = sequenceIndexMap;
    this.firstAlignmentBlock = blockMin;
    this.lastAlignmentBlock = blockMax;
    this.tabix = false;
}
/**
 * Fetch the file chunks that may contain alignments overlapping a genomic range.
 * Public so it can be unit-tested.
 *
 * @param refId the sequence dictionary index of the chromosome
 * @param min   genomic start position
 * @param max   genomic end position
 * @return chunks, each spanning {start: {block, offset}, end: {block, offset}};
 *         empty if there is no index for {@code refId}
 */
public List<Chunk> chunksForRange(int refId, int min, int max) {
    RefIndexPair ba = this.refIndexes.get(refId);
    if (ba == null) {
        // No index for this reference => no alignments.  emptyList() avoids the
        // unchecked raw-type warning of Collections.EMPTY_LIST.
        return Collections.emptyList();
    }

    // Collect chunks from every bin that overlaps [min, max].  Leaf bins (>= 4681) are not pruned.
    List<Integer> overlappingBins = reg2bins(min, max);
    List<Chunk> chunks = new ArrayList<>();
    for (Integer bin : overlappingBins) {
        ArrayList<Chunk> binChunks = ba.binIndex.get(bin);
        if (binChunks != null) {
            chunks.addAll(binChunks);
        }
    }

    // Use the linear index to find the minimum file position of chunks that could
    // contain alignments in the region.  Each linear-index slot covers a 16-kb window.
    VPointer lowest = null;
    int nintv = ba.linearIndex.size();
    if (nintv > 0) {
        // Clamp to the valid window range; an unclamped (min >> 14) could exceed nintv - 1,
        // and with an empty/short index the old code indexed at -1 and threw.
        int minLin = Math.min(Math.max(min >> 14, 0), nintv - 1);
        int maxLin = Math.min(max >> 14, nintv - 1);
        for (int i = minLin; i <= maxLin; ++i) {
            VPointer vp = ba.linearIndex.get(i);  // entries may be null
            if (vp != null) {
                // NOTE(review): linear-index values are believed to be non-decreasing, so the
                // first non-null entry should already be the minimum — kept the scan to be safe.
                if (lowest == null || vp.isLessThan(lowest)) {
                    lowest = vp;
                }
            }
        }
    }

    return optimizeChunks(chunks, lowest);
}
/**
 * Sort chunks by start position, drop chunks that end at or before the linear-index
 * lower bound, and merge chunks whose compressed blocks lie within ~65 kb of each other.
 *
 * @param chunks candidate chunks (may be mutated: merged chunks have their end pointer advanced)
 * @param lowest minimum virtual file pointer from the linear index, or null for no lower bound
 * @return the filtered, merged chunk list
 */
List<Chunk> optimizeChunks(List<Chunk> chunks, VPointer lowest) {
    if (chunks.isEmpty()) return chunks;

    // Raw ArrayList replaced with a typed list.
    List<Chunk> mergedChunks = new ArrayList<>();

    // Order by start pointer: compressed-block address first, then within-block offset.
    chunks.sort((c0, c1) -> {
        int cmp = Long.compare(c0.start.block, c1.start.block);
        return cmp != 0 ? cmp : Integer.compare(c0.start.offset, c1.start.offset);
    });

    Chunk lastChunk = null;
    for (Chunk chunk : chunks) {
        // A null lower bound keeps every chunk; the old code dereferenced lowest
        // unconditionally and threw NPE when the linear index yielded no pointer.
        if (lowest == null || chunk.end.isGreaterThan(lowest)) {
            if (lastChunk == null) {
                mergedChunks.add(chunk);
                lastChunk = chunk;
            } else if ((chunk.start.block - lastChunk.end.block) < 65000) {
                // Merge chunks that are within 65 kb of each other.
                if (chunk.end.isGreaterThan(lastChunk.end)) {
                    lastChunk.end = chunk.end;
                }
            } else {
                mergedChunks.add(chunk);
                lastChunk = chunk;
            }
        }
    }
    return mergedChunks;
}
/**
 * Calculate the list of bins that overlap the region [beg, end] under the
 * standard BAM/tabix 5-level binning scheme (bin 0 always included).
 */
List<Integer> reg2bins(int beg, int end) {
    List<Integer> bins = new ArrayList<>();
    if (end >= 1 << 29) end = 1 << 29;
    --end;  // convert to an inclusive end coordinate
    bins.add(0);
    // Per-level (first bin number, coordinate shift) pairs, coarsest to finest.
    final int[] offsets = {1, 9, 73, 585, 4681};
    final int[] shifts = {26, 23, 20, 17, 14};
    for (int level = 0; level < offsets.length; level++) {
        int first = offsets[level] + (beg >> shifts[level]);
        int last = offsets[level] + (end >> shifts[level]);
        for (int k = first; k <= last; ++k) {
            bins.add(k);
        }
    }
    return bins;
}
/**
 * Read an 8-byte BGZF virtual file pointer from the stream: the high 48 bits are the
 * compressed-block address, the low 16 bits are the offset within the uncompressed block.
 */
static VPointer readVPointer(LittleEndianInputStream is) throws IOException {
    final long raw = is.readLong();
    final int withinBlockOffset = (int) (raw & OFFSET_MASK);
    final long blockAddress = (raw >> SHIFT_AMOUNT) & ADDRESS_MASK;
    return new VPointer(blockAddress, withinBlockOffset);
}
/**
 * A BGZF virtual file pointer: the address of a compressed block plus an offset
 * within that block's uncompressed content.
 */
public static class VPointer {
    public long block;   // file offset of the compressed BGZF block
    public int offset;   // offset within the uncompressed block content

    public VPointer(long block, int offset) {
        this.block = block;
        this.offset = offset;
    }

    /** True if this pointer addresses an earlier file position than {@code vp}. */
    boolean isLessThan(VPointer vp) {
        if (this.block != vp.block) {
            return this.block < vp.block;
        }
        return this.offset < vp.offset;
    }

    /** True if this pointer addresses a later file position than {@code vp}. */
    boolean isGreaterThan(VPointer vp) {
        if (this.block != vp.block) {
            return this.block > vp.block;
        }
        return this.offset > vp.offset;
    }
}
/**
 * A span of the BAM file, [start, end], expressed as virtual file pointers.
 */
public static class Chunk {
    public VPointer start;
    public VPointer end;

    public Chunk(VPointer start, VPointer end) {
        this.start = start;
        this.end = end;
    }
}
/**
 * The index data for one reference sequence: the bin index (bin number -> chunks)
 * and the linear index (one entry per 16-kb window; entries may be null).
 */
public static class RefIndexPair {
    public Map<Integer, ArrayList<Chunk>> binIndex;
    public ArrayList<VPointer> linearIndex;

    public RefIndexPair(Map<Integer, ArrayList<Chunk>> binIndex, ArrayList<VPointer> linearIndex) {
        this.linearIndex = linearIndex;
        this.binIndex = binIndex;
    }
}
/**
 * Load and parse a .bai index from a URL or file path.
 *
 * @param indexURL location of the index, opened via {@link ParsingUtils#openInputStream}
 * @param genome   used to translate sequence names to canonical chromosome names
 *                 (tabix path only); may be null
 * @return the parsed index
 * @throws IOException      on read failure
 * @throws RuntimeException if the magic number does not identify a bai (or tabix) file
 */
public static BAMIndex loadIndex(String indexURL,
                                 Genome genome) throws IOException {

    boolean tabix = false;  // tabix support not enabled; retained to mark the tabix code path

    long blockMin = Long.MAX_VALUE;  // stays MAX_VALUE if the index contains no chunks
    long blockMax = 0;
    Map<Integer, RefIndexPair> refIndexes = new HashMap<>();
    Map<String, Integer> sequenceIndexMap = null;

    // Close the stream in all cases — the original code leaked it on every call.
    BufferedInputStream bis = new BufferedInputStream(ParsingUtils.openInputStream(indexURL));
    try {
        // NOTE: if we extend support to tabix we must gunzip the stream first.
        LittleEndianInputStream parser = new LittleEndianInputStream(bis);

        int magic = parser.readInt();
        if (magic != BAI_MAGIC && !(tabix && magic == TABIX_MAGIC)) {
            throw new RuntimeException(indexURL + " is not a " + (tabix ? "tabix" : "bai") + " file");
        }

        int nref = parser.readInt();

        if (tabix) {
            // Tabix header parameters aren't used, but they must be read to advance the stream.
            parser.readInt();   // format
            parser.readInt();   // col_seq
            parser.readInt();   // col_beg
            parser.readInt();   // col_end
            parser.readInt();   // meta
            parser.readInt();   // skip
            parser.readInt();   // l_nm
            sequenceIndexMap = new HashMap<>();
            for (int i = 0; i < nref; i++) {
                String seqName = parser.readString();
                // Translate to "official" chr name.
                if (genome != null) {
                    seqName = genome.getCanonicalChrName(seqName);
                }
                sequenceIndexMap.put(seqName, i);
            }
        }

        for (int ref = 0; ref < nref; ref++) {
            Map<Integer, ArrayList<Chunk>> binIndex = new HashMap<>();
            ArrayList<VPointer> linearIndex = new ArrayList<>();

            int nbin = parser.readInt();
            for (int b = 0; b < nbin; b++) {
                int binNumber = parser.readInt();
                if (binNumber == 37450) {
                    // Bin 37450 is the metadata pseudo-bin; its payload is unused but
                    // the bytes must be consumed to stay aligned with the format.
                    parser.readInt();        // # of chunks for this bin (always 2)
                    readVPointer(parser);    // unmapped beg
                    readVPointer(parser);    // unmapped end
                    parser.readLong();       // n_mapped
                    parser.readLong();       // n_unmapped
                } else {
                    ArrayList<Chunk> chunks = new ArrayList<>();
                    binIndex.put(binNumber, chunks);
                    int nchnk = parser.readInt();  // # of chunks for this bin
                    for (int i = 0; i < nchnk; i++) {
                        VPointer cs = readVPointer(parser);  // chunk_beg
                        VPointer ce = readVPointer(parser);  // chunk_end
                        if (cs.block < blockMin) {
                            blockMin = cs.block;  // block containing the first alignment
                        }
                        if (ce.block > blockMax) {
                            blockMax = ce.block;
                        }
                        chunks.add(new Chunk(cs, ce));
                    }
                }
            }

            int nintv = parser.readInt();
            for (int i = 0; i < nintv; i++) {
                linearIndex.add(readVPointer(parser));  // entries might be null
            }

            // References with no bins contribute nothing and are omitted.
            if (nbin > 0) {
                refIndexes.put(ref, new RefIndexPair(binIndex, linearIndex));
            }
        }
    } finally {
        bis.close();
    }

    return new BAMIndex(refIndexes, blockMin, blockMax, sequenceIndexMap);
}
}