package aliview.importer;
import java.io.BufferedReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import aliview.MemoryUtils;
import aliview.sequences.FastFastaSequence;
import aliview.sequences.Sequence;
/**
 * Reads FASTA formatted text from a {@link Reader} and turns every named record
 * into a {@link FastFastaSequence}.
 *
 * <p>Parsing rules implemented by both import methods:
 * <ul>
 *   <li>every line is trimmed and blank lines are ignored;</li>
 *   <li>the first non-blank line must start with {@code '>'}, otherwise the import fails;</li>
 *   <li>a line starting with {@code '>'} begins a new record, the rest of that line is its name;</li>
 *   <li>all other lines are appended to the current record with space characters removed;</li>
 *   <li>records whose name is empty are dropped.</li>
 * </ul>
 *
 * <p>The supplied reader is wrapped in a {@link BufferedReader} but is not closed here.
 * {@link #getLongestSequenceLength()} is only ever increased, so it accumulates over
 * all import calls made on the same instance.
 */
public class FastFastaImporter {
    private static final Logger logger = Logger.getLogger(FastFastaImporter.class);

    /** Initial capacity of the buffer that accumulates the residues of one record. */
    private static final int INITIAL_SEQUENCE_CAPACITY = 5000;

    private Reader reader;
    private int longestSequenceLength;

    /**
     * @param reader source of the FASTA text; read line by line by the import methods
     */
    public FastFastaImporter(Reader reader) {
        this.reader = reader;
    }

    /**
     * Imports all named records using a {@link StringBuilder} as accumulation buffer.
     *
     * @return the sequences in file order (empty when the input holds no named record)
     * @throws AlignmentImportException if the first non-blank line is not a header, if a
     *         single sequence grows beyond one eighth of {@code MemoryUtils.getMaxMem()},
     *         or if any other exception occurs while reading (its message is embedded)
     */
    public List<Sequence> importSequences() throws AlignmentImportException {
        long startTime = System.currentTimeMillis();
        List<Sequence> sequences = new ArrayList<Sequence>();
        double maxMem = MemoryUtils.getMaxMem();
        try {
            StringBuilder sequence = new StringBuilder(INITIAL_SEQUENCE_CAPACITY);
            BufferedReader r = new BufferedReader(this.reader);
            String line;
            String name = null;
            boolean seenContent = false;
            while ((line = r.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                boolean header = line.charAt(0) == '>';
                if (!seenContent) {
                    // Validate the first line that carries content, not just physical line 0,
                    // so leading blank lines cannot smuggle header-less residues into the
                    // first record.
                    if (!header) {
                        throw new AlignmentImportException("Fasta file should start with > character");
                    }
                    seenContent = true;
                }
                if (header) {
                    // A new record starts: emit the one collected so far (if it has a name).
                    if (hasName(name)) {
                        addSequenceFromBuilder(sequences, name, sequence);
                    }
                    // Always start from an empty buffer so that residues belonging to an
                    // unnamed (dropped) record are not merged into the next record.
                    if (sequence.length() > 0) {
                        sequence = new StringBuilder(INITIAL_SEQUENCE_CAPACITY + 10);
                    }
                    name = line.substring(1);
                } else {
                    sequence.append(line);
                }
                if (sequence.length() > maxMem / 8) {
                    throw new AlignmentImportException("Sequence to long for memory");
                }
            }
            // add last sequence
            if (hasName(name)) {
                addSequenceFromBuilder(sequences, name, sequence);
            }
        } catch (Exception e) {
            // Also catches the AlignmentImportExceptions thrown above; they are re-wrapped
            // with the generic prefix, exactly as callers have seen so far.
            logger.error("could not import as fasta file", e);
            throw new AlignmentImportException("could not import as fasta file because: " + e.getMessage());
        }
        long endTime = System.currentTimeMillis();
        System.out.println("reading sequences took " + (endTime - startTime) + " milliseconds");
        return sequences;
    }

    //
    // Byte buffer version
    //
    /**
     * Imports all named records using a {@link ByteBufferAutogrow} as accumulation buffer.
     * Spaces are stripped from every residue line before it is appended.
     *
     * @return the sequences in file order (empty when the input holds no named record)
     * @throws AlignmentImportException if the first non-blank line is not a header or if
     *         any other exception occurs while reading (its message is embedded)
     */
    public List<Sequence> importSequencesBB() throws AlignmentImportException {
        long startTime = System.currentTimeMillis();
        List<Sequence> sequences = new ArrayList<Sequence>();
        try {
            ByteBufferAutogrow seqBuff = new ByteBufferAutogrow(INITIAL_SEQUENCE_CAPACITY);
            BufferedReader r = new BufferedReader(this.reader);
            String line;
            String name = null;
            boolean seenContent = false;
            while ((line = r.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                boolean header = line.charAt(0) == '>';
                if (!seenContent) {
                    // See importSequences(): the first line with content must be a header.
                    if (!header) {
                        throw new AlignmentImportException("Fasta file should start with > character");
                    }
                    seenContent = true;
                }
                if (header) {
                    if (hasName(name)) {
                        addSequenceFromByteBuffer(sequences, name, seqBuff);
                        // Pre-size for the longest sequence seen so far before reusing the buffer.
                        seqBuff.ensureCapacity(this.longestSequenceLength);
                    }
                    // Reset on every header so residues of an unnamed (dropped) record do
                    // not leak into the next record.
                    seqBuff.clear();
                    name = line.substring(1);
                } else {
                    seqBuff.append(FileImportUtils.removeAll(line, ' '));
                }
            }
            // add last sequence
            // (no further clean-up of 'line' here: it is always null once the loop has ended)
            if (hasName(name)) {
                addSequenceFromByteBuffer(sequences, name, seqBuff);
            }
        } catch (Exception e) {
            // Also catches the AlignmentImportExceptions thrown above; they are re-wrapped
            // with the generic prefix, exactly as callers have seen so far.
            logger.error("could not import as fasta file", e);
            throw new AlignmentImportException("could not import as fasta file because: " + e.getMessage());
        }
        long endTime = System.currentTimeMillis();
        System.out.println("reading sequences took " + (endTime - startTime) + " milliseconds");
        return sequences;
    }

    /** Returns true when a header with a non-empty name has been read. */
    private static boolean hasName(String name) {
        return name != null && name.length() > 0;
    }

    /**
     * Strips spaces from the collected residues, appends the resulting sequence to
     * {@code sequences} and updates the longest-sequence statistic.
     */
    private void addSequenceFromBuilder(List<Sequence> sequences, String name, StringBuilder sequence) {
        StringBuilder cleaned = sequence;
        // if there are whitespace replace them
        if (cleaned.indexOf(" ") > -1) {
            cleaned = FileImportUtils.removeAll(cleaned, " ");
        }
        byte[] bytes = getBytesFromBuffer(cleaned);
        sequences.add(new FastFastaSequence(name, bytes));
        this.longestSequenceLength = Math.max(this.longestSequenceLength, cleaned.length());
    }

    /**
     * Appends the content of the byte buffer as a new sequence to {@code sequences}
     * and updates the longest-sequence statistic from the buffer position.
     */
    private void addSequenceFromByteBuffer(List<Sequence> sequences, String name, ByteBufferAutogrow seqBuff) {
        byte[] bytes = seqBuff.getBytes();
        sequences.add(new FastFastaSequence(name, bytes));
        this.longestSequenceLength = Math.max(this.longestSequenceLength, seqBuff.position());
    }

    /**
     * Linear scan for {@code target} in {@code sequence}.
     * Not referenced by any code visible in this class.
     */
    private boolean bufferContains(StringBuilder sequence, char target) {
        for (int n = 0; n < sequence.length(); n++) {
            if (sequence.charAt(n) == target) {
                return true;
            }
        }
        return false;
    }

    /**
     * Copies the characters of {@code sequence} into a byte array, one byte per char.
     * Each char is narrowed with a plain cast, so only its low 8 bits are kept.
     */
    private byte[] getBytesFromBuffer(StringBuilder sequence) {
        byte[] bytes = new byte[sequence.length()];
        for (int n = 0; n < bytes.length; n++) {
            bytes[n] = (byte) sequence.charAt(n);
        }
        return bytes;
    }

    /**
     * @return the length of the longest sequence added by any import call on this
     *         instance so far (0 before the first import)
     */
    public int getLongestSequenceLength() {
        return longestSequenceLength;
    }
}