/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools.reference;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.IOUtil;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Iterator;
/**
* A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces:
* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
*/
public class IndexedFastaSequenceFile extends AbstractFastaSequenceFile implements Closeable {
/**
* The interface facilitating direct access to the fasta.
*/
private final FileChannel channel;
/**
* A representation of the sequence index, stored alongside the fasta in a .fasta.fai file.
*/
private final FastaSequenceIndex index;
/**
* An iterator into the fasta index, for traversing iteratively across the fasta.
*/
private Iterator<FastaSequenceIndexEntry> indexIterator;
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
*/
public IndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) {
super(file);
if (index == null) throw new IllegalArgumentException("Null index for fasta " + file);
this.index = index;
IOUtil.assertFileIsReadable(file);
final FileInputStream in;
try {
in = new FileInputStream(file);
} catch (FileNotFoundException e) {
throw new SAMException("Fasta file should be readable but is not: " + file, e);
}
channel = in.getChannel();
reset();
if(getSequenceDictionary() != null)
sanityCheckDictionaryAgainstIndex(file.getAbsolutePath(),sequenceDictionary,index);
}
/**
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
* @param file The file to open.
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
*/
public IndexedFastaSequenceFile(final File file) throws FileNotFoundException {
this(file, new FastaSequenceIndex((findRequiredFastaIndexFile(file))));
}
public boolean isIndexed() {return true;}
private static File findFastaIndex(File fastaFile) {
File indexFile = getFastaIndexFileName(fastaFile);
if (!indexFile.exists()) return null;
return indexFile;
}
private static File getFastaIndexFileName(File fastaFile) {
return new File(fastaFile.getAbsolutePath() + ".fai");
}
private static File findRequiredFastaIndexFile(File fastaFile) throws FileNotFoundException {
File ret = findFastaIndex(fastaFile);
if (ret == null) throw new FileNotFoundException(getFastaIndexFileName(fastaFile) + " not found.");
return ret;
}
public static boolean canCreateIndexedFastaReader(final File fastaFile) {
return (fastaFile.exists() &&
findFastaIndex(fastaFile) != null);
}
/**
* Do some basic checking to make sure the dictionary and the index match.
* @param fastaFile Used for error reporting only.
* @param sequenceDictionary sequence dictionary to check against the index.
* @param index index file to check against the dictionary.
*/
protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
final SAMSequenceDictionary sequenceDictionary,
final FastaSequenceIndex index) {
// Make sure dictionary and index are the same size.
if( sequenceDictionary.getSequences().size() != index.size() )
throw new SAMException("Sequence dictionary and index contain different numbers of contigs");
Iterator<SAMSequenceRecord> sequenceIterator = sequenceDictionary.getSequences().iterator();
Iterator<FastaSequenceIndexEntry> indexIterator = index.iterator();
while(sequenceIterator.hasNext() && indexIterator.hasNext()) {
SAMSequenceRecord sequenceEntry = sequenceIterator.next();
FastaSequenceIndexEntry indexEntry = indexIterator.next();
if(!sequenceEntry.getSequenceName().equals(indexEntry.getContig())) {
throw new SAMException(String.format("Mismatch between sequence dictionary fasta index for %s, sequence '%s' != '%s'.",
fastaFile, sequenceEntry.getSequenceName(),indexEntry.getContig()));
}
// Make sure sequence length matches index length.
if( sequenceEntry.getSequenceLength() != indexEntry.getSize())
throw new SAMException("Index length does not match dictionary length for contig: " + sequenceEntry.getSequenceName() );
}
}
/**
* Retrieves the sequence dictionary for the fasta file.
* @return sequence dictionary of the fasta.
*/
public SAMSequenceDictionary getSequenceDictionary() {
return sequenceDictionary;
}
/**
* Retrieves the complete sequence described by this contig.
* @param contig contig whose data should be returned.
* @return The full sequence associated with this contig.
*/
public ReferenceSequence getSequence( String contig ) {
return getSubsequenceAt( contig, 1, (int)index.getIndexEntry(contig).getSize() );
}
/**
* Gets the subsequence of the contig in the range [start,stop]
* @param contig Contig whose subsequence to retrieve.
* @param start inclusive, 1-based start of region.
* @param stop inclusive, 1-based stop of region.
* @return The partial reference sequence associated with this range.
*/
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
if(start > stop + 1)
throw new SAMException(String.format("Malformed query; start point %d lies after end point %d",start,stop));
FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
if(stop > indexEntry.getSize())
throw new SAMException("Query asks for data past end of contig");
int length = (int)(stop - start + 1);
byte[] target = new byte[length];
ByteBuffer targetBuffer = ByteBuffer.wrap(target);
final int basesPerLine = indexEntry.getBasesPerLine();
final int bytesPerLine = indexEntry.getBytesPerLine();
final int terminatorLength = bytesPerLine - basesPerLine;
long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
// Allocate a 128K buffer for reading in sequence data.
ByteBuffer channelBuffer = ByteBuffer.allocate(Defaults.NON_ZERO_BUFFER_SIZE);
while(targetBuffer.position() < length) {
// If the bufferOffset is currently within the eol characters in the string, push the bufferOffset forward to the next printable character.
startOffset += Math.max((int)(startOffset%bytesPerLine - basesPerLine + 1),0);
try {
startOffset += channel.read(channelBuffer,indexEntry.getLocation()+startOffset);
}
catch(IOException ex) {
throw new SAMException("Unable to load " + contig + "(" + start + ", " + stop + ") from " + file);
}
// Reset the buffer for outbound transfers.
channelBuffer.flip();
// Calculate the size of the next run of bases based on the contents we've already retrieved.
final int positionInContig = (int)start-1+targetBuffer.position();
final int nextBaseSpan = Math.min(basesPerLine-positionInContig%basesPerLine,length-targetBuffer.position());
// Cap the bytes to transfer by limiting the nextBaseSpan to the size of the channel buffer.
int bytesToTransfer = Math.min(nextBaseSpan,channelBuffer.capacity());
channelBuffer.limit(channelBuffer.position()+bytesToTransfer);
while(channelBuffer.hasRemaining()) {
targetBuffer.put(channelBuffer);
bytesToTransfer = Math.min(basesPerLine,length-targetBuffer.position());
channelBuffer.limit(Math.min(channelBuffer.position()+bytesToTransfer+terminatorLength,channelBuffer.capacity()));
channelBuffer.position(Math.min(channelBuffer.position()+terminatorLength,channelBuffer.capacity()));
}
// Reset the buffer for inbound transfers.
channelBuffer.flip();
}
return new ReferenceSequence( contig, indexEntry.getSequenceIndex(), target );
}
/**
* Gets the next sequence if available, or null if not present.
* @return next sequence if available, or null if not present.
*/
public ReferenceSequence nextSequence() {
if( !indexIterator.hasNext() )
return null;
return getSequence( indexIterator.next().getContig() );
}
/**
* Reset the iterator over the index.
*/
public void reset() {
indexIterator = index.iterator();
}
/**
* A simple toString implementation for debugging.
* @return String representation of the file.
*/
public String toString() {
return this.file.getAbsolutePath();
}
@Override
public void close() throws IOException {
channel.close();
}
}