/*
* The MIT License
*
* Copyright (c) 2013 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.tribble;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.tribble.index.Block;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.index.IndexFactory;
import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.tribble.util.ParsingUtils;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPInputStream;
/**
* A reader for text feature files (i.e. not tabix files). This includes tribble-indexed and non-indexed files. If
* the file is indexed, both the iterator() and query() methods are supported; without an index only iterator() is.
* <p/>
* Note: Non-indexed files can be gzipped, but not bgzipped.
*
* @author Jim Robinson
* @since 2/11/12
*/
public class TribbleIndexedFeatureReader<T extends Feature, SOURCE> extends AbstractFeatureReader<T, SOURCE> {
/**
* The tribble index for {@link #path}. Null until an index has been loaded or supplied, and stays null
* when no index could be found.
*/
private Index index;
/**
* Is the path pointing to our source data a regular file? Assigned once in the constructor from
* SeekableStreamFactory.isFilePath(path) and used to decide whether queries share one stream.
*/
private final boolean pathIsRegularFile;
/**
* A potentially reusable seekable stream for queries over regular files. Created on first use and
* closed by close(); null when no shared stream has been opened.
*/
private SeekableStream seekableStream = null;
/**
* The index is loaded lazily, but it might not exist at all. This flag is cleared after an index
* lookup has completed (or an index was supplied directly) so that a missing index is not looked up
* again on every hasIndex() call.
*/
private boolean needCheckForIndex = true;
/**
* @param featurePath - path to the feature file, can be a local file path, http url, or ftp url
* @param codec - codec to decode the features
* @param requireIndex - true if the reader will be queries for specific ranges. An index (idx) file must exist
* @throws IOException
*/
public TribbleIndexedFeatureReader(final String featurePath, final FeatureCodec<T, SOURCE> codec, final boolean requireIndex) throws IOException {
super(featurePath, codec);
if (requireIndex) {
this.loadIndex();
if(!this.hasIndex()){
throw new TribbleException("An index is required, but none found.");
}
}
// does path point to a regular file?
this.pathIsRegularFile = SeekableStreamFactory.isFilePath(path);
readHeader();
}
/**
* @param featureFile - path to the feature file, can be a local file path, http url, or ftp url
* @param indexFile - path to the index file
* @param codec - codec to decode the features
* @param requireIndex - true if the reader will be queries for specific ranges. An index (idx) file must exist
* @throws IOException
*/
public TribbleIndexedFeatureReader(final String featureFile, final String indexFile, final FeatureCodec<T, SOURCE> codec, final boolean requireIndex) throws IOException {
this(featureFile, codec, false); // required to read the header
if (indexFile != null && ParsingUtils.resourceExists(indexFile)) {
index = IndexFactory.loadIndex(indexFile);
this.needCheckForIndex = false;
} else {
if (requireIndex) {
this.loadIndex();
if(!this.hasIndex()){
throw new TribbleException("An index is required, but none found.");
}
}
}
}
/**
* Creates a reader over the given feature file that uses an already-loaded index.
* <p/>
* No index lookup is performed afterwards: if the supplied index is null, hasIndex() reports false.
*
* @param featureFile - path to the feature file, can be a local file path, http url, or ftp url
* @param codec - codec to decode the features
* @param index - a tribble Index object
* @throws IOException if an I/O error occurs
*/
public TribbleIndexedFeatureReader(final String featureFile, final FeatureCodec<T, SOURCE> codec, final Index index) throws IOException {
this(featureFile, codec, false); // required to read the header
this.index = index;
// an index was supplied by the caller, so never go looking for one on disk
this.needCheckForIndex = false;
}
/**
 * Attempts to load the index for the specified {@link #path}.
 * <p/>
 * The default index location is tried first, then the same location with a ".gz" suffix. If
 * neither exists the index is left untouched. In every non-exceptional case the reader is marked
 * as having completed its index lookup, so the lookup is not repeated.
 *
 * @throws IOException if an I/O error occurs
 */
private void loadIndex() throws IOException {
    final String plainIndexPath = Tribble.indexFile(this.path);

    if (ParsingUtils.resourceExists(plainIndexPath)) {
        index = IndexFactory.loadIndex(plainIndexPath);
    } else {
        // The index itself may be gzipped.
        final String gzippedIndexPath = ParsingUtils.appendToPath(plainIndexPath, ".gz");
        if (ParsingUtils.resourceExists(gzippedIndexPath)) {
            index = IndexFactory.loadIndex(gzippedIndexPath);
        }
    }

    needCheckForIndex = false;
}
/**
 * Returns a seekable stream suitable for reading from the current feature path.
 * <p/>
 * When reuseStreamInQuery() is true, a single SeekableStream is created on first use and the same
 * instance is handed out on every later call. When it is false, each call opens a new stream that
 * the caller should close after use.
 *
 * @return a SeekableStream
 * @throws IOException if the stream cannot be opened
 */
private SeekableStream getSeekableStream() throws IOException {
    if (!reuseStreamInQuery()) {
        // No sharing: every request gets its own freshly opened stream.
        return SeekableStreamFactory.getInstance().getStreamFor(path);
    }

    // Sharing: open the underlying stream only once and cache it.
    if (seekableStream == null) {
        seekableStream = SeekableStreamFactory.getInstance().getStreamFor(path);
    }
    return seekableStream;
}
/**
* Are we attempting to reuse the underlying stream in query() calls?
*
* @return true if the feature path is a regular file, in which case one seekable stream is shared
* by all queries; false if each query opens its own stream
*/
private boolean reuseStreamInQuery() {
return pathIsRegularFile;
}
/**
 * Closes the shared seekable stream used by queries, if one has been opened.
 * <p/>
 * The cached reference is cleared as well, so calling close() again does nothing and a later
 * query() opens a fresh stream instead of being handed the closed one.
 *
 * @throws IOException if closing the underlying stream fails
 */
public void close() throws IOException {
    if (seekableStream != null) {
        final SeekableStream toClose = seekableStream;
        // Drop the cached reference first so it is cleared even if close() throws.
        seekableStream = null;
        toClose.close();
    }
}
/**
 * Returns the sequence (chromosome/contig) names in this file, if known.
 * <p/>
 * The names come from the index, so a reader without an index returns an empty list. The returned
 * list is always a new ArrayList.
 *
 * @return list of strings of the contig names
 */
public List<String> getSequenceNames() {
    if (this.hasIndex()) {
        return new ArrayList<String>(index.getSequenceNames());
    }
    return new ArrayList<String>();
}
/**
 * Reports whether an index is available, attempting a lazy load when no index is held yet and no
 * lookup has completed so far.
 *
 * @return true if an index is loaded
 * @throws TribbleException if the lazy index load fails with an I/O error
 */
@Override
public boolean hasIndex() {
    final boolean lookupPending = index == null && this.needCheckForIndex;
    if (lookupPending) {
        try {
            this.loadIndex();
        } catch (IOException e) {
            throw new TribbleException("Error loading index file: " + e.getMessage(), e);
        }
    }
    return index != null;
}
/**
 * Reads the header from the feature file and stores it in {@code header}.
 * <p/>
 * Paths ending in "gz" are read through a GZIPInputStream.
 *
 * @throws IOException if closing the stream fails
 * @throws TribbleException.MalformedFeatureFile if opening the file or parsing the header fails
 *                                               for any reason
 */
private void readHeader() throws IOException {
    InputStream rawStream = null;
    PositionalBufferedStream positionalStream = null;
    try {
        rawStream = ParsingUtils.openInputStream(path);
        if (path.endsWith("gz")) {
            // TODO -- the original author doubted this can work, because the buffered input
            // stream interferes with position tracking
            rawStream = new GZIPInputStream(new BufferedInputStream(rawStream));
        }
        positionalStream = new PositionalBufferedStream(rawStream);
        header = codec.readHeader(codec.makeSourceFromStream(positionalStream));
    } catch (Exception e) {
        throw new TribbleException.MalformedFeatureFile("Unable to parse header with error: " + e.getMessage(), path, e);
    } finally {
        // Close the outermost stream that was successfully created.
        if (positionalStream != null) {
            positionalStream.close();
        } else if (rawStream != null) {
            rawStream.close();
        }
    }
}
/**
 * Returns an iterator over the features overlapping the specified interval.
 * <p/>
 * Note that TribbleIndexedFeatureReader only supports issuing and manipulating a single query
 * at a time for each reader. That is, the behavior of the following code is undefined:
 * <p/>
 * reader = new TribbleIndexedFeatureReader()
 * Iterator it1 = reader.query("x", 10, 20)
 * Iterator it2 = reader.query("x", 1000, 1010)
 * <p/>
 * As a consequence, TribbleIndexedFeatureReader is also not thread-safe.
 *
 * @param chr   contig
 * @param start start position
 * @param end   end position
 * @return an iterator of records in this interval; an empty iterator if the index does not
 *         contain {@code chr}
 * @throws IOException      if an I/O error occurs
 * @throws TribbleException if no index is available
 */
public CloseableTribbleIterator<T> query(final String chr, final int start, final int end) throws IOException {
    if (!this.hasIndex()) {
        throw new TribbleException("Index not found for: " + path);
    }

    if (!index.containsChromosome(chr)) {
        return new EmptyIterator<T>();
    }

    // The index is asked for blocks beginning one position before the requested start.
    final List<Block> overlappingBlocks = index.getBlocks(chr, start - 1, end);
    return new QueryIterator(chr, start, end, overlappingBlocks);
}
/**
* Creates an iterator over the entire file. Each call constructs a new WFIterator.
*
* @return an iterator to iterate over the entire file
* @throws IOException if constructing the iterator fails with an I/O error
*/
public CloseableTribbleIterator<T> iterator() throws IOException {
return new WFIterator();
}
/**
 * Iterator over an entire file, in file order.
 * <p/>
 * One record is always read ahead: hasNext() reports whether a record is pending and next() hands
 * it out while fetching the following one.
 */
class WFIterator implements CloseableTribbleIterator<T> {
    private T currentRecord;
    private SOURCE source;

    /**
     * Opens the feature file, skips past the header and reads the first record.
     * <p/>
     * If any of these steps fails, the stream that was opened is closed before the failure is
     * propagated.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public WFIterator() throws IOException {
        InputStream stream = ParsingUtils.openInputStream(path);
        boolean initialized = false;
        try {
            final PositionalBufferedStream pbs;
            // This must be the same test readHeader() uses. The header end offset was measured on
            // the stream readHeader() saw, so a path that was decompressed there has to be
            // decompressed here too or the skip below lands in the wrong place.
            if (path.endsWith("gz")) {
                // Gzipped -- buffer underneath the GZIPInputStream, since single byte read()
                // calls are made and the underlying stream may not support them efficiently.
                stream = new GZIPInputStream(new BufferedInputStream(stream, 512000));
                pbs = new PositionalBufferedStream(stream, 1000); // Small buffer as this is buffered already.
            } else {
                pbs = new PositionalBufferedStream(stream, 512000);
            }

            // The header was already read from the original source in the reader's constructor;
            // don't read it again, since some codecs keep state about their initialization.
            // Instead, skip that part of the stream.
            pbs.skip(header.getHeaderEnd());
            source = codec.makeSourceFromStream(pbs);
            readNextRecord();
            initialized = true;
        } finally {
            if (!initialized) {
                try {
                    stream.close();
                } catch (IOException ignored) {
                    // Best effort only: the failure that got us here is the one worth reporting.
                }
            }
        }
    }

    @Override
    public boolean hasNext() {
        return currentRecord != null;
    }

    /**
     * Returns the pending record and reads the one after it.
     *
     * @return the pending record, or null if there is none
     * @throws RuntimeException if reading the following record fails with an I/O error
     */
    @Override
    public T next() {
        final T ret = currentRecord;
        try {
            readNextRecord();
        } catch (IOException e) {
            // ret may be null when next() is called past the end; don't fail while reporting.
            final String lastPosition = (ret == null)
                    ? "<none>"
                    : ret.getChr() + ":" + ret.getStart() + "-" + ret.getEnd();
            throw new RuntimeException("Unable to read the next record, the last record was at " +
                    lastPosition, e);
        }
        return ret;
    }

    /**
     * Advances to the next record in the file, skipping anything the codec decodes to null.
     * Leaves currentRecord null when the source is exhausted.
     *
     * @throws IOException if decoding fails with an I/O error
     */
    private void readNextRecord() throws IOException {
        currentRecord = null;

        while (!codec.isDone(source)) {
            final T f;
            try {
                f = codec.decode(source);

                if (f == null) {
                    continue;
                }

                currentRecord = f;
                return;

            } catch (TribbleException e) {
                // Attach the file being read so the error can be traced back to it.
                e.setSource(path);
                throw e;
            } catch (NumberFormatException e) {
                final String error = "Error parsing line at byte position: " + source;
                throw new TribbleException.MalformedFeatureFile(error, path, e);
            }
        }
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException("Remove is not supported in Iterators");
    }

    /**
     * Closes the source being iterated over via the codec.
     */
    @Override
    public void close() {
        codec.close(source);
    }

    @Override
    public WFIterator iterator() {
        return this;
    }
}
/**
* Iterator for a query interval
*/
class QueryIterator implements CloseableTribbleIterator<T> {
private String chrAlias;
int start;
int end;
private T currentRecord;
private SOURCE source;
private SeekableStream mySeekableStream;
private Iterator<Block> blockIterator;
public QueryIterator(final String chr, final int start, final int end, final List<Block> blocks) throws IOException {
this.start = start;
this.end = end;
mySeekableStream = getSeekableStream();
blockIterator = blocks.iterator();
advanceBlock();
readNextRecord();
// The feature chromosome might not be the query chromosome, due to alias definitions. We assume
// the chromosome of the first record is correct and record it here. This is not pretty.
chrAlias = (currentRecord == null ? chr : currentRecord.getChr());
}
public boolean hasNext() {
return currentRecord != null;
}
public T next() {
final T ret = currentRecord;
try {
readNextRecord();
} catch (IOException e) {
throw new RuntimeException("Unable to read the next record, the last record was at " +
ret.getChr() + ":" + ret.getStart() + "-" + ret.getEnd(), e);
}
return ret;
}
private void advanceBlock() throws IOException {
while (blockIterator != null && blockIterator.hasNext()) {
final Block block = blockIterator.next();
if (block.getSize() > 0) {
final int bufferSize = Math.min(2000000, block.getSize() > 100000000 ? 10000000 : (int) block.getSize());
source = codec.makeSourceFromStream(new PositionalBufferedStream(new BlockStreamWrapper(mySeekableStream, block), bufferSize));
// note we don't have to skip the header here as the block should never start in the header
return;
}
}
// If we get here the blocks are exhausted, set reader to null
if (source != null) {
codec.close(source);
source = null;
}
}
/**
* Advance to the next record in the query interval.
*
* @throws IOException
*/
private void readNextRecord() throws IOException {
if (source == null) {
return; // <= no more features to read
}
currentRecord = null;
while (true) { // Loop through blocks
while (!codec.isDone(source)) { // Loop through current block
final T f;
try {
f = codec.decode(source);
if (f == null) {
continue; // Skip
}
if ((chrAlias != null && !f.getChr().equals(chrAlias)) || f.getStart() > end) {
if (blockIterator.hasNext()) {
advanceBlock();
continue;
} else {
return; // Done
}
}
if (f.getEnd() < start) {
continue; // Skip
}
currentRecord = f; // Success
return;
} catch (TribbleException e) {
e.setSource(path);
throw e;
} catch (NumberFormatException e) {
final String error = "Error parsing line: " + source;
throw new TribbleException.MalformedFeatureFile(error, path, e);
}
}
if (blockIterator != null && blockIterator.hasNext()) {
advanceBlock(); // Advance to next block
} else {
return; // No blocks left, we're done.
}
}
}
public void remove() {
throw new UnsupportedOperationException("Remove is not supported.");
}
public void close() {
// Note that this depends on BlockStreamWrapper not actually closing the underlying stream
codec.close(source);
if (!reuseStreamInQuery()) {
// if we are going to reuse the underlying stream we don't close the underlying stream.
try {
mySeekableStream.close();
} catch (IOException e) {
throw new TribbleException("Couldn't close seekable stream", e);
}
}
}
public Iterator<T> iterator() {
return this;
}
}
/**
 * Wrapper around a SeekableStream that limits reading to the specified "block" of bytes. Attempts to
 * read at or beyond the end of the block return -1 (EOF).
 * <p/>
 * close() is deliberately not overridden: the inherited InputStream.close() does nothing, so closing
 * the wrapper leaves the wrapped SeekableStream open for reuse.
 */
static class BlockStreamWrapper extends InputStream {

    SeekableStream seekableStream;
    /** Position at which reading stops; the byte at this position is not part of the block. */
    long maxPosition;

    /**
     * Seeks the stream to the start of the block and records where the block ends.
     *
     * @param seekableStream stream to read from
     * @param block          block of bytes to expose
     * @throws IOException if seeking fails
     */
    BlockStreamWrapper(final SeekableStream seekableStream, final Block block) throws IOException {
        this.seekableStream = seekableStream;
        seekableStream.seek(block.getStartPosition());
        maxPosition = block.getEndPosition();
    }

    /**
     * Reads a single byte, or returns -1 once the end of the block has been reached.
     */
    @Override
    public int read() throws IOException {
        // Use the same boundary as the bulk read below: maxPosition itself is outside the block.
        return (seekableStream.position() >= maxPosition) ? -1 : seekableStream.read();
    }

    /**
     * Reads up to {@code len} bytes without crossing the end of the block, or returns -1 once the
     * end of the block has been reached.
     */
    @Override
    public int read(final byte[] bytes, final int off, final int len) throws IOException {
        // note the careful treatment here to ensure we can continue to
        // read very long > Integer sized blocks
        final long maxBytes = maxPosition - seekableStream.position();
        if (maxBytes <= 0) {
            return -1;
        }

        final int bytesToRead = (int) Math.min(len, Math.min(maxBytes, Integer.MAX_VALUE));
        return seekableStream.read(bytes, off, bytesToRead);
    }
}
}