/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools.reference;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.FastLineReader;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.StringUtil;
import java.io.File;
/**
* Implementation of ReferenceSequenceFile for reading from FASTA files.
*
* @author Tim Fennell
*/
public class FastaSequenceFile extends AbstractFastaSequenceFile {

    /**
     * Upper bound used when growing the bases buffer. Kept slightly below
     * Integer.MAX_VALUE because array allocations at the very limit are not
     * guaranteed to succeed on every JVM.
     */
    private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;

    private final boolean truncateNamesAtWhitespace;

    /** Reader over the FASTA file; null while this object is closed. */
    private FastLineReader in;

    /** Zero-based index of the sequence most recently requested; -1 before the first read. */
    private int sequenceIndex = -1;

    /** Scratch buffer used as the initial destination when a sequence's length is not known up front. */
    private final byte[] basesBuffer = new byte[Defaults.NON_ZERO_BUFFER_SIZE];

    /**
     * Constructs a FastaSequenceFile that reads from the specified file.
     *
     * @param file the FASTA file to read
     * @param truncateNamesAtWhitespace if true, each sequence name is passed through
     *        {@link SAMSequenceRecord#truncateSequenceName(String)} before being returned
     */
    public FastaSequenceFile(final File file, final boolean truncateNamesAtWhitespace) {
        super(file);
        this.truncateNamesAtWhitespace = truncateNamesAtWhitespace;
        this.in = new FastLineReader(IOUtil.openFileForReading(file));
    }

    /**
     * Closes the underlying reader. It's good to call this to free up memory.
     * Calling it more than once is harmless; use {@link #reset()} to start reading again.
     */
    public void close() {
        closeReader();
    }

    /**
     * Reads the next sequence from the file.
     *
     * @return the next sequence, or null when there are no more sequences. The reader is closed
     *         automatically when the end of the file is reached, and every later call (until
     *         {@link #reset()}) returns null without touching the closed reader.
     * @throws SAMException if the file is not well-formed FASTA
     */
    public ReferenceSequence nextSequence() {
        // Either close() was called or a previous call already hit end of file.
        if (this.in == null) {
            return null;
        }

        this.sequenceIndex += 1;

        // Read the header line
        final String name = readSequenceName();
        if (name == null) {
            close();
            return null;
        }

        // Read the sequence
        final byte[] bases = readSequence(lengthFromDictionary());
        return new ReferenceSequence(name, this.sequenceIndex, bases);
    }

    /**
     * Rewinds to the start of the file by closing the current reader (if any) and
     * opening a fresh one, so that the next call to {@link #nextSequence()} returns
     * the first sequence again.
     */
    public void reset() {
        this.sequenceIndex = -1;
        closeReader();
        this.in = new FastLineReader(IOUtil.openFileForReading(file));
    }

    /** Closes the reader if it is open and forgets it, so that it is never used or closed twice. */
    private void closeReader() {
        final FastLineReader reader = this.in;
        this.in = null;
        if (reader != null) {
            reader.close();
        }
    }

    /**
     * Looks up the length of the current sequence in the sequence dictionary.
     *
     * @return the dictionary's length for the sequence at the current index, or -1 when there is
     *         no dictionary or the dictionary lookup yields no record for this index
     */
    private int lengthFromDictionary() {
        if (this.sequenceDictionary == null) {
            return -1;
        }
        // The length is only a sizing hint, so a dictionary that lacks this index must not
        // prevent the sequence from being read.
        final SAMSequenceRecord record = this.sequenceDictionary.getSequence(this.sequenceIndex);
        return (record == null) ? -1 : record.getSequenceLength();
    }

    /**
     * Reads the header line of the next record.
     *
     * @return the sequence name (trimmed, and optionally truncated at whitespace),
     *         or null if only newlines remain before the end of the file
     * @throws SAMException if the line does not start with '>', if the name does not fit in the
     *         4096-byte name buffer, or if nothing follows the '>'
     */
    private String readSequenceName() {
        in.skipNewlines();
        if (in.eof()) {
            return null;
        }
        final byte b = in.getByte();
        if (b != '>') {
            throw new SAMException("Format exception reading FASTA " + file + ". Expected > but saw chr(" +
                    b + ") at start of sequence with index " + this.sequenceIndex);
        }
        final byte[] nameBuffer = new byte[4096];
        int nameLength = 0;
        do {
            if (in.eof()) {
                break;
            }
            nameLength += in.readToEndOfOutputBufferOrEoln(nameBuffer, nameLength);
            // Buffer is full but the line has not ended: the name does not fit.
            if (nameLength == nameBuffer.length && !in.atEoln()) {
                throw new SAMException("Sequence name too long in FASTA " + file);
            }
        } while (!in.atEoln());
        if (nameLength == 0) {
            throw new SAMException("Missing sequence name in FASTA " + file);
        }
        String name = StringUtil.bytesToString(nameBuffer, 0, nameLength).trim();
        if (truncateNamesAtWhitespace) {
            name = SAMSequenceRecord.truncateSequenceName(name);
        }
        return name;
    }

    /**
     * Read bases from input.
     *
     * @param knownLength For performance: the length of the sequence if it is known, so the result
     *        array can be allocated once at the right size. -1 (or any other non-positive value)
     *        means the length is not known and the buffer is grown as needed.
     * @return ASCII bases for sequence, in an array of exactly the sequence's length
     * @throws SAMException if the sequence is too long to be held in a single array
     */
    private byte[] readSequence(final int knownLength) {
        // A non-positive length cannot pre-size the array: a negative size cannot be allocated,
        // and a zero-length array would stop the read before any base was consumed.
        final boolean lengthKnown = knownLength > 0;
        byte[] bases = lengthKnown ? new byte[knownLength] : basesBuffer;

        int sequenceLength = 0;
        while (!in.eof()) {
            final boolean sawEoln = in.skipNewlines();
            if (in.eof()) {
                break;
            }
            // A '>' at the start of a line begins the next record.
            if (sawEoln && in.peekByte() == '>') {
                break;
            }
            sequenceLength += in.readToEndOfOutputBufferOrEoln(bases, sequenceLength);
            // Drop whitespace at the end of what has been read so far.
            while (sequenceLength > 0 && Character.isWhitespace(StringUtil.byteToChar(bases[sequenceLength - 1]))) {
                --sequenceLength;
            }
            if (lengthKnown && sequenceLength == knownLength) {
                break;
            }
            if (sequenceLength == bases.length) {
                bases = grow(bases, sequenceLength);
            }
        }

        // And lastly resize the array down to the right size. The shared scratch buffer is
        // always copied so that it is never handed out to a caller.
        if (sequenceLength != bases.length || bases == basesBuffer) {
            final byte[] tmp = new byte[sequenceLength];
            System.arraycopy(bases, 0, tmp, 0, sequenceLength);
            bases = tmp;
        }
        return bases;
    }

    /**
     * Returns a larger copy of a full bases buffer. The capacity is doubled, but clamped to
     * {@link #MAX_ARRAY_SIZE} so that the doubling cannot overflow into a negative array size.
     *
     * @param bases the full buffer
     * @param used number of leading bytes of {@code bases} to carry over
     * @return a new, larger array whose first {@code used} bytes equal those of {@code bases}
     * @throws SAMException if the buffer is already at the maximum supported size
     */
    private byte[] grow(final byte[] bases, final int used) {
        if (bases.length >= MAX_ARRAY_SIZE) {
            throw new SAMException("Sequence too long to read into memory in FASTA " + file +
                    " at sequence with index " + this.sequenceIndex);
        }
        final int newCapacity = (bases.length > MAX_ARRAY_SIZE / 2)
                ? MAX_ARRAY_SIZE
                : Math.max(1, bases.length * 2);
        final byte[] tmp = new byte[newCapacity];
        System.arraycopy(bases, 0, tmp, 0, used);
        return tmp;
    }
}