package com.illumina.basespace.igv.vcf; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; import java.util.HashMap; import net.sf.samtools.seekablestream.SeekableStream; import net.sf.samtools.util.BlockCompressedInputStream; import com.illumina.basespace.igv.io.BaseSpaceSeekableFileStream; import com.illumina.basespace.igv.vcf.VCFLocatorFactory.VCFTrackLoader; public class BasespaceTabixReader { protected VCFTrackLoader locator; BlockCompressedInputStream blockCompressInputStream; private int mPreset; private int mSc; private int mBc; private int mEc; private int mMeta; private int mSkip; private String[] mSeq; public HashMap<String, Integer> mChr2tid; private static int MAX_BIN = 37450; private static int TAD_MIN_CHUNK_GAP = 32768; private static int TAD_LIDX_SHIFT = 14; protected class TPair64 implements Comparable<TPair64> { long u, v; public TPair64(final long _u, final long _v) { u = _u; v = _v; } public TPair64(final TPair64 p) { u = p.u; v = p.v; } public int compareTo(final TPair64 p) { return u == p.u ? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0)) ? -1 : 1; // unsigned // 64-bit // comparison } } protected class TIndex { HashMap<Integer, TPair64[]> b; // binning index long[] l; // linear index } protected TIndex[] mIndex; private class TIntv { int tid, beg, end; } private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison return (u < v) ^ (u < 0) ^ (v < 0); } /** * * @param fn * File name of the data file */ public BasespaceTabixReader(VCFTrackLoader locator) throws IOException { this.locator = locator; blockCompressInputStream = new BlockCompressedInputStream(new BaseSpaceSeekableFileStream(locator, locator.getFile())); readIndex(); } private static int reg2bins(final int beg, final int _end, final int[] list) { int i = 0, k, end = _end; if (beg >= end) return 0; if (end >= 1 << 29) end = 1 << 29; --end; list[i++] = 0; for (k = 1 + (beg >> 26); k <= 1 + (end >> 26); ++k) list[i++] = k; for (k = 9 + (beg >> 23); k <= 9 + (end >> 23); ++k) list[i++] = k; for (k = 73 + (beg >> 20); k <= 73 + (end >> 20); ++k) list[i++] = k; for (k = 585 + (beg >> 17); k <= 585 + (end >> 17); ++k) list[i++] = k; for (k = 4681 + (beg >> 14); k <= 4681 + (end >> 14); ++k) list[i++] = k; return i; } public static int readInt(final InputStream is) throws IOException { byte[] buf = new byte[4]; is.read(buf); return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt(); } public static long readLong(final InputStream is) throws IOException { byte[] buf = new byte[8]; is.read(buf); return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong(); } public static String readLine(final InputStream is) throws IOException { StringBuffer buf = new StringBuffer(); int c; while ((c = is.read()) >= 0 && c != '\n') buf.append((char) c); if (c < 0) return null; return buf.toString(); } /** * * Read the Tabix index from a file * * * * @param fp * File pointer */ public void readIndex(SeekableStream fp) throws IOException { if (fp == null) return; BlockCompressedInputStream is = new BlockCompressedInputStream(fp); byte[] buf = new byte[4]; is.read(buf, 0, 4); // read "TBI\1" mSeq = new String[readInt(is)]; // # sequences mChr2tid = new HashMap<String, Integer>(); mPreset = readInt(is); mSc = readInt(is); mBc = readInt(is); mEc = readInt(is); mMeta = readInt(is); mSkip = readInt(is); // read sequence dictionary int i, j, k, l = readInt(is); buf = new byte[l]; is.read(buf); for (i = j = k = 0; i < buf.length; ++i) { if (buf[i] == 0) { byte[] b = new byte[i - j]; System.arraycopy(buf, j, b, 0, b.length); String s = new String(b); mChr2tid.put(s, k); mSeq[k++] = s; j = i + 1; } } // read the index mIndex = new TIndex[mSeq.length]; for (i = 0; i < mSeq.length; ++i) { // the binning index int n_bin = readInt(is); mIndex[i] = new TIndex(); mIndex[i].b = new HashMap<Integer, TPair64[]>(); for (j = 0; j < n_bin; ++j) { int bin = readInt(is); TPair64[] chunks = new TPair64[readInt(is)]; for (k = 0; k < chunks.length; ++k) { long u = readLong(is); long v = readLong(is); chunks[k] = new TPair64(u, v); // in C, this is inefficient } mIndex[i].b.put(bin, chunks); } // the linear index mIndex[i].l = new long[readInt(is)]; for (k = 0; k < mIndex[i].l.length; ++k) mIndex[i].l[k] = readLong(is); } // close is.close(); } /** * * Read the Tabix index from the default file. */ public void readIndex() throws IOException { readIndex(new BaseSpaceSeekableFileStream(locator, locator.getIndexFile())); } /** * * Read one line from the data file. */ public String readLine() throws IOException { return readLine(blockCompressInputStream); } private int chr2tid(final String chr) { if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr); else return -1; } /** * * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000" * * * * @param reg * Region string * * @return An array where the three elements are sequence_id, * * region_begin and region_end. On failure, sequence_id==-1. */ public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -. String chr; int colon, hyphen; int[] ret = new int[3]; colon = reg.indexOf(':'); hyphen = reg.indexOf('-'); chr = colon >= 0 ? reg.substring(0, colon) : reg; ret[1] = colon >= 0 ? Integer.parseInt(reg.substring(colon + 1, hyphen >= 0 ? hyphen : reg.length())) - 1 : 0; ret[2] = hyphen >= 0 ? Integer.parseInt(reg.substring(hyphen + 1)) : 0x7fffffff; ret[0] = chr2tid(chr); return ret; } private TIntv getIntv(final String s) { TIntv intv = new TIntv(); int col = 0, end = 0, beg = 0; while ((end = s.indexOf('\t', beg)) >= 0 || end == -1) { ++col; if (col == mSc) { intv.tid = chr2tid(end != -1 ? s.substring(beg, end) : s.substring(beg)); } else if (col == mBc) { intv.beg = intv.end = Integer.parseInt(end != -1 ? s.substring(beg, end) : s.substring(beg)); if ((mPreset & 0x10000) != 0) ++intv.end; else --intv.beg; if (intv.beg < 0) intv.beg = 0; if (intv.end < 1) intv.end = 1; } else { // FIXME: SAM supports are not tested yet if ((mPreset & 0xffff) == 0) { // generic if (col == mEc) intv.end = Integer.parseInt(end != -1 ? s.substring(beg, end) : s.substring(beg)); } else if ((mPreset & 0xffff) == 1) { // SAM if (col == 6) { // CIGAR int l = 0, i, j; String cigar = s.substring(beg, end); for (i = j = 0; i < cigar.length(); ++i) { if (cigar.charAt(i) > '9') { int op = cigar.charAt(i); if (op == 'M' || op == 'D' || op == 'N') l += Integer.parseInt(cigar.substring(j, i)); j = i + 1; } } intv.end = intv.beg + l; } } else if ((mPreset & 0xffff) == 2) { // VCF String alt; alt = end >= 0 ? s.substring(beg, end) : s.substring(beg); if (col == 4) { // REF if (alt.length() > 0) intv.end = intv.beg + alt.length(); } else if (col == 8) { // INFO int e_off = -1, i = alt.indexOf("END="); if (i == 0) e_off = 4; else if (i > 0) { i = alt.indexOf(";END="); if (i >= 0) e_off = i + 5; } if (e_off > 0) { i = alt.indexOf(";", e_off); intv.end = Integer.parseInt(i > e_off ? alt.substring(e_off, i) : alt.substring(e_off)); } } } } if (end == -1) break; beg = end + 1; } return intv; } public class Iterator { private int i, n_seeks; private int tid, beg, end; private TPair64[] off; private long curr_off; private boolean iseof; public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) { i = -1; n_seeks = 0; curr_off = 0; iseof = false; off = _off; tid = _tid; beg = _beg; end = _end; } public String next() throws IOException { if (iseof) return null; for (;;) { if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk if (i == off.length - 1) break; // no more chunks if (i >= 0) assert (curr_off == off[i].v); // otherwise bug if (i < 0 || off[i].v != off[i + 1].u) { // not adjacent chunks; then seek blockCompressInputStream.seek(off[i + 1].u); curr_off = blockCompressInputStream.getFilePointer(); ++n_seeks; } ++i; } String s; if ((s = readLine(blockCompressInputStream)) != null) { TIntv intv; char[] str = s.toCharArray(); curr_off = blockCompressInputStream.getFilePointer(); if (str.length == 0 || str[0] == mMeta) continue; intv = getIntv(s); if (intv.tid != tid || intv.beg >= end) break; // no need to // proceed else if (intv.end > beg && intv.beg < end) return s; // overlap; // return } else break; // end of file } iseof = true; return null; } } /** * * Return * * @param tid * Sequence id * * @param beg * beginning of interval, genomic coords * * @param end * end of interval, genomic coords * * @return an iterator over the lines within the specified interval */ public Iterator query(final int tid, final int beg, final int end) { TPair64[] off, chunks; long min_off; TIndex idx = mIndex[tid]; int[] bins = new int[MAX_BIN]; int i, l, n_off, n_bins = reg2bins(beg, end, bins); if (idx.l.length > 0) min_off = (beg >> TAD_LIDX_SHIFT >= idx.l.length) ? idx.l[idx.l.length - 1] : idx.l[beg >> TAD_LIDX_SHIFT]; else min_off = 0; for (i = n_off = 0; i < n_bins; ++i) { if ((chunks = idx.b.get(bins[i])) != null) n_off += chunks.length; } if (n_off == 0) return null; off = new TPair64[n_off]; for (i = n_off = 0; i < n_bins; ++i) if ((chunks = idx.b.get(bins[i])) != null) for (int j = 0; j < chunks.length; ++j) if (less64(min_off, chunks[j].v)) off[n_off++] = new TPair64(chunks[j]); Arrays.sort(off, 0, n_off); // resolve completely contained adjacent blocks for (i = 1, l = 0; i < n_off; ++i) { if (less64(off[l].v, off[i].v)) { ++l; off[l].u = off[i].u; off[l].v = off[i].v; } } n_off = l + 1; // resolve overlaps between adjacent blocks; this may happen due to the // merge in indexing for (i = 1; i < n_off; ++i) if (!less64(off[i - 1].v, off[i].u)) off[i - 1].v = off[i].u; // merge adjacent blocks for (i = 1, l = 0; i < n_off; ++i) { if (off[l].v >> 16 == off[i].u >> 16) off[l].v = off[i].v; else { ++l; off[l].u = off[i].u; off[l].v = off[i].v; } } n_off = l + 1; // return TPair64[] ret = new TPair64[n_off]; for (i = 0; i < n_off; ++i) { if (off[i] != null) ret[i] = new TPair64(off[i].u, off[i].v); // in // C, // this // is // inefficient } if (ret.length == 0 || (ret.length == 1 && ret[0] == null)) return null; return new BasespaceTabixReader.Iterator(tid, beg, end, ret); } /** * * * * @see #parseReg(String) * * @param reg * A region string of the form acceptable by * {@link #parseReg(String)} * * @return */ public Iterator query(final String reg) { int[] x = parseReg(reg); return query(x[0], x[1], x[2]); } // ADDED BY JTR public void close() { if (blockCompressInputStream != null) { try { blockCompressInputStream.close(); } catch (IOException e) { } } } }