/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.tribble.index.tabix;
import htsjdk.samtools.Bin;
import htsjdk.samtools.BinningIndexContent;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.LinearIndex;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.StringUtil;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.index.Block;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.util.LittleEndianInputStream;
import htsjdk.tribble.util.LittleEndianOutputStream;
import htsjdk.tribble.util.TabixUtils;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * This class represents a Tabix index that has been built in memory or read from a file. It can be queried or
 * written to a file.
 */
public class TabixIndex implements Index {
    private static final byte[] MAGIC = {'T', 'B', 'I', 1};

    /** The bytes of MAGIC interpreted as a little-endian int, i.e. the value compared against readInt() below. */
    public static final int MAGIC_NUMBER = ByteBuffer.wrap(MAGIC).order(ByteOrder.LITTLE_ENDIAN).getInt();

    private final TabixFormat formatSpec;
    private final List<String> sequenceNames;
    private final BinningIndexContent[] indices;

    /**
     * @param formatSpec Information about how to interpret the file being indexed. Unused by this class other than
     *                   written to an output file. A clone is stored.
     * @param sequenceNames Sequences in the file being indexed, in the order they appear in the file. A copy is stored.
     * @param indices One for each element of sequenceNames. The array is copied (shallowly); elements may be null.
     * @throws IllegalArgumentException if sequenceNames and indices differ in length.
     */
    public TabixIndex(final TabixFormat formatSpec, final List<String> sequenceNames, final BinningIndexContent[] indices) {
        if (sequenceNames.size() != indices.length) {
            throw new IllegalArgumentException("sequenceNames.size() != indices.length");
        }
        this.formatSpec = formatSpec.clone();
        this.sequenceNames = Collections.unmodifiableList(new ArrayList<String>(sequenceNames));
        // Copy the array so that later changes to the caller's array cannot alter this index.
        this.indices = indices.clone();
    }

    /**
     * @param inputStream This is expected to be buffered and be gzip-decompressing as appropriate. Caller
     *                    should close input stream after ctor returns.
     */
    public TabixIndex(final InputStream inputStream) throws IOException {
        this(inputStream, false);
    }

    /**
     * Convenient ctor that opens the file, wraps it with a BGZF reader, and closes it after reading the index.
     */
    public TabixIndex(final File tabixFile) throws IOException {
        this(new BlockCompressedInputStream(tabixFile), true);
    }

    /**
     * Parses a complete index from the given stream.
     *
     * @param inputStream Source of the (already decompressed) index bytes.
     * @param closeInputStream If true the stream is closed before this ctor exits, whether or not parsing succeeded.
     * @throws IOException on read failure, including EOFException if the header is truncated.
     * @throws TribbleException if the magic number or header contents are malformed.
     */
    private TabixIndex(final InputStream inputStream, final boolean closeInputStream) throws IOException {
        final LittleEndianInputStream dis = new LittleEndianInputStream(inputStream);
        try {
            final int magic = dis.readInt();
            if (magic != MAGIC_NUMBER) {
                // Report the value that was actually found, not the one that was expected.
                throw new TribbleException(String.format("Unexpected magic number 0x%x", magic));
            }
            final int numSequences = requireNonNegative(dis.readInt(), "sequence count");

            final TabixFormat format = new TabixFormat();
            format.flags = dis.readInt();
            format.sequenceColumn = dis.readInt();
            format.startPositionColumn = dis.readInt();
            format.endPositionColumn = dis.readInt();
            format.metaCharacter = (char) dis.readInt();
            format.numHeaderLinesToSkip = dis.readInt();

            final int nameBlockSize = requireNonNegative(dis.readInt(), "sequence name block size");
            final byte[] nameBlock = new byte[nameBlockSize];
            readFully(dis, nameBlock);
            final List<String> names = parseSequenceNames(nameBlock, numSequences);

            final BinningIndexContent[] loaded = new BinningIndexContent[numSequences];
            for (int i = 0; i < numSequences; ++i) {
                loaded[i] = loadSequence(i, dis);
            }

            this.formatSpec = format;
            this.sequenceNames = Collections.unmodifiableList(names);
            this.indices = loaded;
        } finally {
            // Close on failure as well as success so a file opened by the File ctor is never leaked.
            if (closeInputStream) {
                CloserUtil.close(dis);
            }
        }
    }

    /**
     * Fills the whole buffer from the stream. A single read() call may legitimately return fewer bytes than
     * requested, so reading is repeated until the buffer is full or the stream ends.
     *
     * @throws EOFException if the stream ends before the buffer is full.
     */
    private static void readFully(final LittleEndianInputStream dis, final byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            final int count = dis.read(buffer, filled, buffer.length - filled);
            if (count < 0) {
                throw new EOFException("Premature end of file reading Tabix header");
            }
            filled += count;
        }
    }

    /**
     * Splits the header name block into exactly numSequences NUL-terminated names.
     *
     * @throws TribbleException if the block ends before all names are terminated, or has bytes left over.
     */
    private static List<String> parseSequenceNames(final byte[] nameBlock, final int numSequences) {
        final List<String> names = new ArrayList<String>(numSequences);
        int nameStart = 0;
        for (int i = 0; i < numSequences; ++i) {
            int nameEnd = nameStart;
            while (nameEnd < nameBlock.length && nameBlock[nameEnd] != '\0') {
                ++nameEnd;
            }
            if (nameEnd >= nameBlock.length) {
                throw new TribbleException("Tabix header format exception. Sequence name block is shorter than expected");
            }
            names.add(StringUtil.bytesToString(nameBlock, nameStart, nameEnd - nameStart));
            nameStart = nameEnd + 1;
        }
        if (nameStart != nameBlock.length) {
            throw new TribbleException("Tabix header format exception. Sequence name block is longer than expected");
        }
        return names;
    }

    /**
     * Rejects negative counts/sizes read from the file before they are used to size arrays or lists.
     *
     * @return value, unchanged, when it is not negative.
     * @throws TribbleException if value is negative.
     */
    private static int requireNonNegative(final int value, final String what) {
        if (value < 0) {
            throw new TribbleException("Tabix index format exception. Negative " + what + ": " + value);
        }
        return value;
    }

    /**
     *
     * @param chr the chromosome
     * @param start the start position, one-based, inclusive.
     * @param end the end position, one-based, inclusive.
     * @return List of regions of file that are candidates for the given query. Empty if the chromosome is unknown
     * or has no index content.
     *
     * TODO: This method has not yet been tested, since the primary task is index writing.
     */
    @Override
    public List<Block> getBlocks(final String chr, final int start, final int end) {
        final int sequenceIndex = sequenceNames.indexOf(chr);
        if (sequenceIndex == -1 || indices[sequenceIndex] == null) {
            return Collections.emptyList();
        }
        final List<Chunk> chunks = indices[sequenceIndex].getChunksOverlapping(start, end);
        final List<Block> blocks = new ArrayList<Block>(chunks.size());
        for (final Chunk chunk : chunks) {
            // Block takes a start and a size, so convert the chunk's end into a length.
            final long chunkStart = chunk.getChunkStart();
            blocks.add(new Block(chunkStart, chunk.getChunkEnd() - chunkStart));
        }
        return blocks;
    }

    /** @return always true. */
    @Override
    public boolean isCurrentVersion() {
        return true;
    }

    /** @return the unmodifiable list of sequence names, in index order. */
    @Override
    public List<String> getSequenceNames() {
        return sequenceNames;
    }

    @Override
    public boolean containsChromosome(final String chr) {
        return sequenceNames.contains(chr);
    }

    /**
     * No arbitrary properties in Tabix.
     *
     * @return always null.
     */
    @Override
    public Map<String, String> getProperties() {
        return null;
    }

    /**
     * Since this class carries no properties, this performs the same comparison as {@link #equals(Object)}.
     */
    @Override
    public boolean equalsIgnoreProperties(final Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        return hasSameContent((TabixIndex) o);
    }

    /** @return the format specification held by this index (the internal instance, not a copy). */
    public TabixFormat getFormatSpec() {
        return formatSpec;
    }

    /**
     * Writes the index with BGZF.
     * @param tabixFile Where to write the index.
     * @throws TribbleException wrapping any IOException raised while writing or closing.
     */
    public void write(final File tabixFile) {
        final LittleEndianOutputStream los = new LittleEndianOutputStream(new BlockCompressedOutputStream(tabixFile));
        boolean closeAttempted = false;
        try {
            write(los);
            closeAttempted = true;
            los.close();
        } catch (final IOException e) {
            throw new TribbleException("Exception writing " + tabixFile.getAbsolutePath(), e);
        } finally {
            // If writing failed before close() was reached, still release the stream; never close twice.
            if (!closeAttempted) {
                CloserUtil.close(los);
            }
        }
    }

    /**
     * Writes to a file with appropriate name and directory based on feature file.
     * Does nothing if featureFile is not an existing regular file.
     * @param featureFile File being indexed.
     */
    @Override
    public void writeBasedOnFeatureFile(final File featureFile) throws IOException {
        if (!featureFile.isFile()) {
            return;
        }
        write(new File(featureFile.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION));
    }

    /**
     *
     * @param los It is assumed that caller has done appropriate buffering and BlockCompressedOutputStream wrapping.
     *            Caller should close output stream after invoking this method.
     * @throws IOException
     */
    @Override
    public void write(final LittleEndianOutputStream los) throws IOException {
        // Fixed-size header: magic, sequence count, then the six format fields.
        los.writeInt(MAGIC_NUMBER);
        los.writeInt(sequenceNames.size());
        los.writeInt(formatSpec.flags);
        los.writeInt(formatSpec.sequenceColumn);
        los.writeInt(formatSpec.startPositionColumn);
        los.writeInt(formatSpec.endPositionColumn);
        los.writeInt(formatSpec.metaCharacter);
        los.writeInt(formatSpec.numHeaderLinesToSkip);

        // Name block size = total name lengths plus one null terminator per name.
        int nameBlockSize = sequenceNames.size();
        for (final String sequenceName : sequenceNames) {
            nameBlockSize += sequenceName.length();
        }
        los.writeInt(nameBlockSize);
        for (final String sequenceName : sequenceNames) {
            los.write(StringUtil.stringToBytes(sequenceName));
            los.write(0);
        }

        for (final BinningIndexContent index : indices) {
            writeSequence(index, los);
        }
    }

    /**
     * Writes one sequence's bins followed by its linear index; a null entry is written as a single zero bin count.
     */
    private void writeSequence(final BinningIndexContent indexContent, final LittleEndianOutputStream los) throws IOException {
        if (indexContent == null) {
            los.writeInt(0);
            return;
        }
        final BinningIndexContent.BinList binList = indexContent.getBins();
        los.writeInt(binList.numberOfNonNullBins);
        for (final Bin bin : binList) {
            writeBin(bin, los);
        }
        writeLinearIndex(indexContent.getLinearIndex(), los);
    }

    /**
     * Writes the entry count followed by each linear index entry.
     *
     * @throws IllegalArgumentException if the linear index does not start at zero.
     */
    private void writeLinearIndex(final LinearIndex linearIndex, final LittleEndianOutputStream los) throws IOException {
        if (linearIndex.getIndexStart() != 0) {
            // This could be handled by writing zeroes, but it is not expected so just fail.
            throw new IllegalArgumentException("Non-zero linear index start");
        }
        final long[] entries = linearIndex.getIndexEntries();
        los.writeInt(entries.length);
        for (final long entry : entries) {
            los.writeLong(entry);
        }
    }

    /** Writes the bin number, the chunk count, and then the start/end pair of every chunk. */
    private void writeBin(final Bin bin, final LittleEndianOutputStream los) throws IOException {
        los.writeInt(bin.getBinNumber());
        final List<Chunk> chunkList = bin.getChunkList();
        los.writeInt(chunkList.size());
        for (final Chunk chunk : chunkList) {
            los.writeLong(chunk.getChunkStart());
            los.writeLong(chunk.getChunkEnd());
        }
    }

    /**
     * Although this is probably identical to BAM index reading code, code does not exist there to load directly
     * into a BinningIndexContent object, so that is implemented here.
     * @param referenceSequenceIndex Merely for setting in the returned object, not for seeking into the file.
     * @param dis This method assumes that the current position is at the start of the reference.
     * @return the loaded content, or null if the sequence has a bin count of zero.
     * @throws TribbleException if a count is negative or a bin number appears more than once.
     */
    private static BinningIndexContent loadSequence(final int referenceSequenceIndex, final LittleEndianInputStream dis) throws IOException {
        final int numBins = requireNonNegative(dis.readInt(), "bin count");
        if (numBins == 0) {
            // NOTE(review): nothing further is read for an empty sequence, mirroring writeSequence which emits a
            // single zero for a null entry; confirm against the Tabix specification.
            return null;
        }
        // File is not sparse, but array being produced is sparse, so grow array with nulls as appropriate
        // so that bin number == index into array.
        final ArrayList<Bin> bins = new ArrayList<Bin>();
        for (int i = 0; i < numBins; ++i) {
            final Bin bin = loadBin(referenceSequenceIndex, dis);
            final int binNumber = bin.getBinNumber();
            if (binNumber < bins.size()) {
                if (bins.get(binNumber) != null) {
                    throw new TribbleException("Bin " + binNumber + " appears more than once in file");
                }
                bins.set(binNumber, bin);
            } else {
                // Grow bins array as needed.
                bins.ensureCapacity(binNumber + 1);
                while (bins.size() < binNumber) {
                    bins.add(null);
                }
                bins.add(bin);
            }
        }
        final LinearIndex linearIndex = loadLinearIndex(referenceSequenceIndex, dis);
        // Every bin read from the file is non-null, so numBins is the non-null bin count.
        final BinningIndexContent.BinList binList =
                new BinningIndexContent.BinList(bins.toArray(new Bin[bins.size()]), numBins);
        return new BinningIndexContent(referenceSequenceIndex, binList, linearIndex);
    }

    /** Reads an entry count followed by that many long entries, producing a linear index starting at zero. */
    private static LinearIndex loadLinearIndex(final int referenceSequenceIndex, final LittleEndianInputStream dis) throws IOException {
        final int numElements = requireNonNegative(dis.readInt(), "linear index size");
        final long[] elements = new long[numElements];
        for (int i = 0; i < numElements; ++i) {
            elements[i] = dis.readLong();
        }
        return new LinearIndex(referenceSequenceIndex, 0, elements);
    }

    /** Reads a bin number, a chunk count, and that many chunks. Never returns null. */
    private static Bin loadBin(final int referenceSequenceIndex, final LittleEndianInputStream dis) throws IOException {
        final int binNumber = requireNonNegative(dis.readInt(), "bin number");
        final int numChunks = requireNonNegative(dis.readInt(), "chunk count");
        final List<Chunk> chunkList = new ArrayList<Chunk>(numChunks);
        for (int i = 0; i < numChunks; ++i) {
            chunkList.add(loadChunk(dis));
        }
        final Bin bin = new Bin(referenceSequenceIndex, binNumber);
        bin.setChunkList(chunkList);
        return bin;
    }

    /** Reads a chunk as two consecutive longs: start, then end. */
    private static Chunk loadChunk(final LittleEndianInputStream dis) throws IOException {
        final long start = dis.readLong();
        final long end = dis.readLong();
        return new Chunk(start, end);
    }

    /** Field-by-field comparison shared by equals() and equalsIgnoreProperties(). */
    private boolean hasSameContent(final TabixIndex other) {
        return formatSpec.equals(other.formatSpec)
                && Arrays.equals(indices, other.indices)
                && sequenceNames.equals(other.sequenceNames);
    }

    @Override
    public boolean equals(final Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        return hasSameContent((TabixIndex) o);
    }

    @Override
    public int hashCode() {
        int result = formatSpec.hashCode();
        result = 31 * result + sequenceNames.hashCode();
        result = 31 * result + Arrays.hashCode(indices);
        return result;
    }
}