package aliview.importer;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import aliview.MemoryUtils;
import aliview.sequences.ClustalSequence;
import aliview.sequences.PhylipSequence;
import aliview.sequences.Sequence;
/**
 * Imports an alignment written in CLUSTAL format from a {@link Reader}.
 *
 * <p>The first line must contain the word "CLUSTAL" (case-insensitive). After that the
 * importer reads blocks of lines that each start with a sequence name followed by
 * whitespace and a chunk of sequence characters. The first block defines the sequence
 * names and their order; every following block appends more characters to the sequences
 * in that same order.</p>
 *
 * <p>The reader passed to the constructor is not closed by this class.</p>
 */
public class ClustalImporter {
    private static final Logger logger = Logger.getLogger(ClustalImporter.class);

    /** Format type constant for an unknown layout. */
    public static final int UNKNOWN = -1;
    /** Format type constant for interleaved (or one-line-per-sequence) layout. */
    public static final int INTERLEAVED_OR_SINGLELINE_SEQUENTIAL = 0;

    /** Start capacity of each sequence buffer; the real length is not known up front in this format. */
    private static final int INITIAL_BUFFER_CAPACITY = 1000;

    private final Reader reader;
    public int formatType;
    /** Size of the input; only used to guess how large the sequence buffers need to be. */
    private final long fileSize;

    /**
     * Small manual test driver that imports a hard-coded alignment file.
     *
     * @param args ignored
     * @throws FileNotFoundException if the hard-coded file does not exist
     * @throws AlignmentImportException if the file cannot be imported
     */
    public static void main(String[] args) throws FileNotFoundException, AlignmentImportException {
        File alignmentFile = new File("/home/anders/projekt/alignments/testseq1.aln");
        FileReader fileReader = new FileReader(alignmentFile);
        try {
            // Use the three-argument constructor explicitly: with only two arguments the int
            // format constant would be taken as the (long) file size.
            ClustalImporter importer = new ClustalImporter(fileReader,
                    INTERLEAVED_OR_SINGLELINE_SEQUENTIAL, alignmentFile.length());
            importer.importSequences();
        } finally {
            try {
                fileReader.close();
            } catch (IOException e) {
                logger.warn("Could not close " + alignmentFile, e);
            }
        }
    }

    /**
     * Creates an importer.
     *
     * @param reader     source of the alignment text
     * @param formatType one of the format type constants of this class
     * @param fileSize   size of the input, used to pre-size the sequence buffers
     */
    public ClustalImporter(Reader reader, int formatType, long fileSize) {
        this.reader = reader;
        this.formatType = formatType;
        this.fileSize = fileSize;
    }

    /**
     * Creates an importer using {@link #INTERLEAVED_OR_SINGLELINE_SEQUENTIAL} as format type.
     *
     * @param reader   source of the alignment text
     * @param fileSize size of the input, used to pre-size the sequence buffers
     */
    public ClustalImporter(Reader reader, long fileSize) {
        this(reader, INTERLEAVED_OR_SINGLELINE_SEQUENTIAL, fileSize);
    }

    /**
     * Reads all sequences from the reader.
     *
     * @return the imported sequences, in the order of the first block; empty if
     *         {@link #formatType} is not {@link #INTERLEAVED_OR_SINGLELINE_SEQUENTIAL}
     * @throws AlignmentImportException if the first line is not a CLUSTAL header, if no
     *         sequence lines are found, if a sequence line cannot be split into name and
     *         data, or if reading fails with an I/O error
     */
    public List<Sequence> importSequences() throws AlignmentImportException {
        long startTime = System.currentTimeMillis();
        List<Sequence> sequences = new ArrayList<Sequence>();
        try {
            ReaderHelper helper = new ReaderHelper(new BufferedReader(this.reader));
            helper.readNextLine();
            String firstLine = helper.getNextLine();
            // NOTE(review): presumably the helper hands back null when the input is empty -
            // guard so that case is reported as a format error instead of an NPE.
            if (firstLine == null || !isStringValidFirstLine(firstLine.trim())) {
                throw new AlignmentImportException("Could not read first line as clustal format");
            }
            logger.info("inside clustal importer");
            if (formatType == INTERLEAVED_OR_SINGLELINE_SEQUENTIAL) {
                readInterleaved(helper, sequences);
            }
        } catch (IOException e) {
            // Do not hand back an empty result as if the import had succeeded.
            logger.error("I/O error while reading clustal file", e);
            throw new AlignmentImportException("Could not read clustal file: " + e.getMessage());
        }
        long endTime = System.currentTimeMillis();
        logger.info("reading sequences took " + (endTime - startTime) + " milliseconds");
        return sequences;
    }

    /** Reads the first block (names + data), then all continuation blocks, and fills {@code sequences}. */
    private void readInterleaved(ReaderHelper helper, List<Sequence> sequences)
            throws IOException, AlignmentImportException {
        List<String> seqNames = new ArrayList<String>();
        List<ByteBufferAutogrow> seqBuffers = new ArrayList<ByteBufferAutogrow>();

        // skip until start of seq
        helper.readUntilNextNonBlankLine();

        // First block: one line per sequence. The block ends at the first line that does not
        // start with a non-blank character (in clustal there can be a row without name that
        // holds the conservation marks).
        int longestName = 0;
        int seqPartLen = 0;
        while (helper.isNextLineStartingWithNonBlankChar()) {
            String line = helper.getNextLine();
            int seqStart = findSequenceStart(helper, line);
            String name = line.substring(0, seqStart).trim();
            seqNames.add(name);
            longestName = Math.max(longestName, name.length());

            String seqChars = extractSequenceChars(line, seqStart);
            seqPartLen = Math.max(seqPartLen, seqChars.length());

            ByteBufferAutogrow seqBuff = new ByteBufferAutogrow(INITIAL_BUFFER_CAPACITY);
            seqBuff.append(seqChars);
            seqBuffers.add(seqBuff);
            helper.readNextLine();
        }
        int seqCount = seqBuffers.size();
        if (seqCount == 0) {
            // Without this the capacity guess below would divide by zero, and the
            // continuation loop could never consume a line.
            throw new AlignmentImportException("No sequences found in clustal file");
        }

        // skip until start of next block
        helper.readUntilNextNonBlankLine();

        int guessedCapacity = estimateCapacity(fileSize, longestName, seqPartLen, seqCount);
        logger.info("guessedCapacity" + guessedCapacity);
        MemoryUtils.logMem();
        for (ByteBufferAutogrow seqBuff : seqBuffers) {
            seqBuff.ensureCapacity(guessedCapacity);
        }
        MemoryUtils.logMem();
        // long arithmetic: the product easily exceeds the int range for big alignments
        logger.info("seqCount * guessedCapacity=" + ((long) seqCount * guessedCapacity));

        // if sequences are interleaved then there are more data to read
        while (helper.isNextLineStartingWithNonBlankChar()) {
            // lines come in the same order as in the first block
            int lineCount = 0;
            while (lineCount < seqCount && helper.isNextLineStartingWithNonBlankChar()) {
                String line = helper.getNextLine();
                int seqStart = findSequenceStart(helper, line);
                seqBuffers.get(lineCount).append(extractSequenceChars(line, seqStart));
                lineCount++;
                helper.readNextLine();
            }
            helper.readUntilNextNonBlankLine();
        }

        logger.info("before convert");
        for (int n = 0; n < seqCount; n++) {
            sequences.add(new ClustalSequence(seqNames.get(n), seqBuffers.get(n).getBytes()));
            // drop the references as we go so the buffers can be reclaimed
            seqNames.set(n, null);
            seqBuffers.set(n, null);
        }
    }

    /** Returns the index where sequence data starts in {@code line}, or throws if the helper reports an unusable index. */
    private static int findSequenceStart(ReaderHelper helper, String line)
            throws IOException, AlignmentImportException {
        int seqStart = helper.indexOfFirstNonWhiteCharAfterWhiteChar(line);
        // NOTE(review): the helper's "not found" result is not visible from here; any index
        // that substring() could not accept is reported as a malformed line.
        if (seqStart < 0 || seqStart > line.length()) {
            throw new AlignmentImportException("Could not find sequence data in clustal line: " + line);
        }
        return seqStart;
    }

    /**
     * Returns the characters from {@code seqStart} up to (not including) the next space, or to
     * the end of the line. In clustal there can be another space followed by a number after
     * the sequence data - that part is cut off here. The result can never contain a space, so
     * no further blank removal is needed.
     */
    private static String extractSequenceChars(String line, int seqStart) {
        int endIndex = line.indexOf(' ', seqStart);
        if (endIndex == -1) {
            endIndex = line.length();
        }
        return line.substring(seqStart, endIndex);
    }

    /**
     * Guesses the final length of each sequence from the input size and the shape of the
     * first block, with 5% added on top. Returns 0 when no sensible guess is possible.
     */
    private static int estimateCapacity(long fileSize, int longestName, int seqPartLen, int seqCount) {
        long interleaveSize = (long) (longestName + seqPartLen) * seqCount;
        if (interleaveSize <= 0 || fileSize <= 0) {
            return 0;
        }
        long nInterleaves = fileSize / interleaveSize;
        long guessedLength = nInterleaves * seqPartLen;
        // 1.05 to add some blank in between; the double-to-int cast caps at Integer.MAX_VALUE
        return (int) (guessedLength * 1.05);
    }

    /**
     * Tells whether a line looks like the header line of a CLUSTAL file.
     *
     * @param firstLine the first line of the input
     * @return {@code true} if the line contains "CLUSTAL", ignoring case
     */
    public static boolean isStringValidFirstLine(String firstLine) {
        return StringUtils.containsIgnoreCase(firstLine, "CLUSTAL");
    }
}