BAMIndex.java example

Explorer
igv-master
- src
- test
  - src
package org.broad.igv.sam.lite;

import htsjdk.tribble.util.LittleEndianInputStream;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.util.ParsingUtils;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.*;


/**
 * Created by jrobinso on 3/9/17.
 */
public class BAMIndex {


    static int BAI_MAGIC = 21578050;
    static int TABIX_MAGIC = 21578324;
    private static final int SHIFT_AMOUNT = 16;
    private static final int OFFSET_MASK = 0xffff;
    private static final long ADDRESS_MASK = 0xFFFFFFFFFFFFL;


    long firstAlignmentBlock;
    long lastAlignmentBlock;
    Map<Integer, RefIndexPair> refIndexes;
    Map<String, Integer> sequenceIndexMap;   // For tabix use only
    boolean tabix;


    public BAMIndex(Map<Integer, RefIndexPair> refIndexes, long blockMin, long blockMax, Map<String, Integer> sequenceIndexMap) {
        this.firstAlignmentBlock = blockMin;
        this.lastAlignmentBlock = blockMax;
        this.refIndexes = refIndexes;
        this.sequenceIndexMap = sequenceIndexMap;
        this.tabix = false;
    }


    /**
     * Fetch blocks for a particular genomic range.  This method is public so it can be unit-tested.
     *
     * @param refId  the sequence dictionary index of the chromosome
     * @param min  genomic start position
     * @param max  genomic end position
     * @return an array of chunks {minv: {filePointer, offset}, {maxv: {filePointer, offset}}
     */

    public List<Chunk> chunksForRange(int refId, int min, int max) {

        RefIndexPair ba = this.refIndexes.get(refId);
        if (ba == null) {
            return Collections.EMPTY_LIST;
        } else {

            List<Integer> overlappingBins = reg2bins(min, max);        // List of bin #s that overlap min, max
            List<Chunk> chunks = new ArrayList<>();

            // Find chunks in overlapping bins.  Leaf bins (< 4681) are not pruned

            for(Integer bin : overlappingBins) {
                ArrayList<Chunk> binChunks = ba.binIndex.get(bin);
                if (binChunks != null) {
                    chunks.addAll(binChunks);
                }
            }

            // Use the linear index to find minimum file position of chunks that could contain alignments in the region
            int nintv = ba.linearIndex.size();
            VPointer lowest = null;
            int minLin = Math.min(min >> 14, nintv - 1);
            int maxLin = Math.min(max >> 14, nintv - 1);
            for (int i = minLin; i <= maxLin; ++i) {
                VPointer vp = ba.linearIndex.get(i);
                if (vp != null) {
                    // todo -- I think, but am not sure, that the values in the linear index have to be in increasing order.  So the first non-null should be minimum
                    if (lowest == null || vp.isLessThan(lowest)) {
                        lowest = vp;
                    }
                }
            }

            return optimizeChunks(chunks, lowest);
        }

    }

    ;

    List<Chunk> optimizeChunks(List<Chunk> chunks, VPointer lowest) {

        ArrayList mergedChunks = new ArrayList<>();
        Chunk lastChunk = null;

        if (chunks.size() ==  0) return chunks;

        chunks.sort((c0, c1) -> {
            long dif = c0.start.block - c1.start.block;
            if (dif > 0) return 1;
            else if (dif < 0) return -1;
            else {
                return c0.start.offset - c1.start.offset;
            }
        });

        for(Chunk chunk : chunks) {

            if (chunk.end.isGreaterThan(lowest)) {
                if (lastChunk ==  null) {
                    mergedChunks.add(chunk);
                    lastChunk = chunk;
                } else {
                    if ((chunk.start.block - lastChunk.end.block) < 65000) { // Merge chunks that are withing 65k of each other
                        if (chunk.end.isGreaterThan(lastChunk.end)) {
                            lastChunk.end = chunk.end;
                        }
                    } else {
                        mergedChunks.add(chunk);
                        lastChunk = chunk;
                    }
                }
            }
        }

        return mergedChunks;
    }

    /**
     * Calculate the list of bins that overlap with region [beg, end]
     */
    List<Integer> reg2bins(int beg, int end) {

        int k;

        List<Integer> list = new ArrayList<>();

        if (end >= 1 << 29) end = 1 << 29;
        --end;

        list.add(0);
        for (k = 1 + (beg >> 26); k <= 1 + (end >> 26); ++k) list.add(k);
        for (k = 9 + (beg >> 23); k <= 9 + (end >> 23); ++k) list.add(k);
        for (k = 73 + (beg >> 20); k <= 73 + (end >> 20); ++k) list.add(k);
        for (k = 585 + (beg >> 17); k <= 585 + (end >> 17); ++k) list.add(k);
        for (k = 4681 + (beg >> 14); k <= 4681 + (end >> 14); ++k) list.add(k);
        return list;
    }


    static VPointer readVPointer(LittleEndianInputStream is) throws IOException {

        long vp = is.readLong();
        long block =  (vp >> SHIFT_AMOUNT) & ADDRESS_MASK;
        int offset =  (int) (vp & OFFSET_MASK);

        return new VPointer(block, offset);

    }

    public static class VPointer {
        public long block;
        public int offset;

        public VPointer(long block, int offset) {
            this.block = block;
            this.offset = offset;
        }

        boolean isLessThan(VPointer vp) {
            return this.block < vp.block ||
                    (this.block == vp.block && this.offset < vp.offset);
        }

        boolean isGreaterThan (VPointer vp) {
            return this.block > vp.block ||
                    (this.block == vp.block && this.offset > vp.offset);
        }

    }

    public static class Chunk {

        public VPointer start;
        public VPointer end;

        public Chunk(VPointer start, VPointer end) {
            this.end = end;
            this.start = start;
        }
    }

    public static class RefIndexPair {

        public Map<Integer, ArrayList<Chunk>> binIndex;
        public ArrayList<VPointer> linearIndex;

        public RefIndexPair(Map<Integer, ArrayList<Chunk>> binIndex, ArrayList<VPointer> linearIndex) {
            this.binIndex = binIndex;
            this.linearIndex = linearIndex;
        }
    }


    public static BAMIndex loadIndex(String indexURL,
                              Genome genome) throws IOException {


        boolean tabix = false;

        LittleEndianInputStream parser = new LittleEndianInputStream(new BufferedInputStream(ParsingUtils.openInputStream(indexURL)));

        // NOTE:  if we extend support to tabix must gunzip the file
//                if (tabix) {
//                    var inflate = new Zlib.Gunzip(new Uint8Array(arrayBuffer));
//                    arrayBuffer = inflate.decompress().buffer;
//                }

        long blockMin = Long.MAX_VALUE;
        long blockMax = 0;

        Map<Integer, RefIndexPair> refIndexes = new HashMap<>();
        Map<String, Integer> sequenceIndexMap = null;

        int magic = parser.readInt();

        if (magic == BAI_MAGIC || (tabix && magic == TABIX_MAGIC)) {

            int nref = parser.readInt();

            if (tabix) {
                // Tabix header parameters aren't used, but they must be read to advance the pointer
                int format = parser.readInt();
                int col_seq = parser.readInt();
                int col_beg = parser.readInt();
                int col_end = parser.readInt();
                int meta = parser.readInt();
                int skip = parser.readInt();
                int l_nm = parser.readInt();

                sequenceIndexMap = new HashMap<>();

                for (int i = 0; i < nref; i++) {

                    String seq_name = parser.readString();

                    // Translate to "official" chr name.
                    if (genome != null) {
                        seq_name = genome.getCanonicalChrName(seq_name);
                    }

                    sequenceIndexMap.put(seq_name, i);
                }
            }

            for (int ref = 0; ref < nref; ref++) {

                Map<Integer, ArrayList<Chunk>> binIndex = new HashMap<>();
                ArrayList<VPointer> linearIndex = new ArrayList<>();

                int nbin = parser.readInt();

                for (int b = 0; b < nbin; b++) {

                    int binNumber = parser.readInt();

                    if (binNumber == 37450) {
                        // This is a psuedo bin, not used but we have to consume the bytes
                        int nchnk = parser.readInt(); // # of chunks for this bin
                        readVPointer(parser);   // unmapped beg
                        readVPointer(parser);  // unmapped end
                        long n_maped = parser.readLong();
                        long nUnmapped = parser.readLong();

                    } else {

                        ArrayList<Chunk> chunks = new ArrayList<>();
                        binIndex.put(binNumber, chunks);
                        int nchnk = parser.readInt(); // # of chunks for this bin

                        for (int i = 0; i < nchnk; i++) {
                            VPointer cs = readVPointer(parser);      //chunk_beg
                            VPointer ce = readVPointer(parser);     //chunk_end


                            if (cs.block < blockMin) {
                                blockMin = cs.block;    // Block containing first alignment
                            }
                            if (ce.block > blockMax) {
                                blockMax = ce.block;
                            }
                            chunks.add(new Chunk(cs, ce));
                        }
                    }
                }


                int nintv = parser.readInt();
                for (int i = 0; i < nintv; i++) {
                    VPointer cs = readVPointer(parser);
                    linearIndex.add(cs);   // Might be null
                }

                if (nbin > 0) {
                    refIndexes.put(ref, new RefIndexPair(binIndex, linearIndex));
                }
            }


        } else {
            throw new RuntimeException(indexURL + " is not a " + (tabix ? "tabix" : "bai") + " file");
        }

        return new BAMIndex(refIndexes, blockMin, blockMax, sequenceIndexMap);

    }

}