package aliview.importer;
import it.unimi.dsi.io.ByteBufferInpStream;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import aliview.sequencelist.FileSequenceAlignmentListModel;
import aliview.sequencelist.MemoryMappedSequencesFile;
import aliview.sequences.FileSequence;
import aliview.sequences.MSFFileSequence;
import aliview.sequences.PositionToPointer;
import aliview.sequences.Sequence;
import aliview.subprocesses.SubThreadProgressWindow;
public class MSFFileIndexer {
private static final Logger logger = Logger.getLogger(MSFFileIndexer.class);
private static final String LF = System.getProperty("line.separator");
public static int INTERLEAVED_OR_SINGLELINE_SEQUENTIAL = 0;
public static void main(String[] args) throws AlignmentImportException, IOException {
//File alignmentFile = new File("/home/anders/projekt/alignments/Woodsia_chloroplast_min4_20131109_v2.excluded.aln");
// File alignmentFile = new File("/home/anders/projekt/alignments/SMALL-FLAVI-v7-dating.nuc.aed.ALL.protfnuc.mafft.glob.cod.seav.aln");
File alignmentFile = new File("/home/anders/projekt/alignments/MSF_format.example.msf");
FileSequenceAlignmentListModel model = new FileSequenceAlignmentListModel(alignmentFile, FileFormat.MSF);
}
public ArrayList<Sequence> findSequencesInFile(MemoryMappedSequencesFile sequencesFile, long filePointerStart, int seqOffset, int nSeqsToRetrieve,
SubThreadProgressWindow progressWin) throws AlignmentImportException {
long startTime = System.currentTimeMillis();
logger.info("inside MSF importer");
ByteBufferInpStream mappedBuff = sequencesFile.getMappedBuff();
ArrayList<Sequence> sequences = new ArrayList<Sequence>();
try{
long fileSize = mappedBuff.length();
int longestSequenceLength = 0;
mappedBuff.position(filePointerStart);
MappedBuffReaderHelper readerHelper = new MappedBuffReaderHelper(mappedBuff);
// Get newline char
String firstLine = readerHelper.readLine();
int newlineLen = 1;
if(firstLine.endsWith("\r\n")){
newlineLen = 2;
}else{
newlineLen = 1;
}
// if not right file then it will throw error...
boolean isRightFormat = isStringValidFirstLine(firstLine);
if(! isRightFormat){
throw new AlignmentImportException("Could not read first line as MSF format");
}
String firstNameLine = readerHelper.skipUntilLineContains("Name:");
logger.info(firstNameLine);
String firstName = StringUtils.substringBetween(firstNameLine, "Name:", "Len:");
firstName = firstName.trim();
logger.info(firstName);
int formatType = INTERLEAVED_OR_SINGLELINE_SEQUENTIAL;
if(formatType == INTERLEAVED_OR_SINGLELINE_SEQUENTIAL){
try{
List<String> seqNames = new ArrayList<String>();
List<StringBuilder> seqBuffers = new ArrayList<StringBuilder>();
// skip until start of seq
readerHelper.skipUntilLineContains("//");
firstNameLine = readerHelper.skipUntilLineContains(firstName);
long nameStartPointer = readerHelper.position();
readerHelper.setPosition(nameStartPointer);
int lineCount = 0; // only for display
int seqCount = 0;
int seqPos = 0;
// in clustal there can be a non blank row without name that contains preservation
while(true){
// position sequence start (also name endpos)
readerHelper.setPosition(nameStartPointer);
long seqStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar();
long seqEndPointer = readerHelper.posOfNextNewline() - newlineLen;
MSFFileSequence seq = new MSFFileSequence(sequencesFile, nameStartPointer);
String name = readerHelper.readString(nameStartPointer, seqStartPointer - 1);
name = name.trim();
seq.setName(name);
// logger.info("name=" + name);
int seqSeqmentLen = (int) (seqEndPointer - seqStartPointer + 1);
seq.add(new PositionToPointer(seqPos,seqPos + seqSeqmentLen -1, seqStartPointer, seqEndPointer));
sequences.add(seq);
seqCount ++;
lineCount ++;
if(lineCount % 10000 == 0){
progressWin.setMessage("Indexing interleaved MSF file" + LF + "line:" + lineCount);
if(progressWin.wasSubThreadInterruptedByUser()){
break;
}
}
// if the next name is after more than one linebreak - should be EOF or a round of interleaved sequence parts
int linebreaks = readerHelper.skipUntilNextNonWhiteCharOnNextLine();
// logger.info("linebreaks" + linebreaks);
nameStartPointer = readerHelper.position();
if(linebreaks > 1){
// skip to name line - there might be a positions line on top
readerHelper.skipUntilLineContains(firstName);
readerHelper.skipUntilNextNonWhiteChar();
nameStartPointer = readerHelper.position();
break;
}
}
// and now append the inteleaved sequences
while(true){
for(int n = 0; n < seqCount; n++){
// logger.info("n =" + n );
// position sequence start (also name endpos)
readerHelper.setPosition(nameStartPointer);
// msf has name on every interleaved line
long interleavedStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar();
//logger.info("interleavedStartPointer" + interleavedStartPointer);
// in clustal there are optional numbers after whitespace before line end
long interleavedEndPointer = readerHelper.posOfNextNewline() - newlineLen;
mappedBuff.position(interleavedEndPointer);
// logger.info("char=" + (char)mappedBuff.read());
int seqSeqmentLen = (int) (interleavedEndPointer - interleavedStartPointer +1);
//logger.info("seqSeqmentLen" + seqSeqmentLen);
MSFFileSequence appendSeq = (MSFFileSequence) sequences.get(n);
// logger.info("getSeq" + n);
int appendSeqPosition = appendSeq.getLength();
appendSeq.add(new PositionToPointer(appendSeqPosition, interleavedStartPointer, interleavedEndPointer));
// check that there is a next name on next line (without a empty line between)
// otherwise break
int linebreaks = readerHelper.skipUntilNextNonWhiteCharOnNextLine();
nameStartPointer = readerHelper.position();
lineCount ++;
if(lineCount % 10000 == 0){
progressWin.setMessage("Indexing interleaved MSF file" + LF + "line:" + lineCount);
if(progressWin.wasSubThreadInterruptedByUser()){
break;
}
}
// if the next name is after more than one linebreak - should be next round of interleaved sequence parts
if(linebreaks > 1){
// skip to name line - there might be a positions line on top
firstNameLine = readerHelper.skipUntilLineContains(firstName);
readerHelper.skipUntilNextNonWhiteChar();
nameStartPointer = readerHelper.position();
break;
}
}
}
}catch(EOFException eof){
logger.info("hit EOF hopefully file is read OK");
// only log output
// for(Sequence seq: sequences){
// logger.info(seq.getName() + " " + seq.getBasesAsString());
// }
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
long endTime = System.currentTimeMillis();
System.out.println("reading sequences took " + (endTime - startTime) + " milliseconds");
return sequences;
}
public static boolean isStringValidFirstLine(String firstLine) {
if(StringUtils.contains(firstLine, "!!") || StringUtils.containsIgnoreCase(firstLine, "PileUp")){
return true;
}else{
return false;
}
}
}