package aliview.importer; import it.unimi.dsi.io.ByteBufferInpStream; import java.io.BufferedReader; import java.io.EOFException; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.math.NumberUtils; import org.apache.log4j.Logger; import aliview.sequencelist.FileSequenceAlignmentListModel; import aliview.sequencelist.MemoryMappedSequencesFile; import aliview.sequences.ClustalFileSequence; import aliview.sequences.FileSequence; import aliview.sequences.PositionToPointer; import aliview.sequences.Sequence; import aliview.subprocesses.SubThreadProgressWindow; public class ClustalFileIndexer { private static final Logger logger = Logger.getLogger(ClustalFileIndexer.class); private static final String LF = System.getProperty("line.separator"); public static int INTERLEAVED_OR_SINGLELINE_SEQUENTIAL = 0; public static void main(String[] args) throws AlignmentImportException, IOException { //File alignmentFile = new File("/home/anders/projekt/alignments/Woodsia_chloroplast_min4_20131109_v2.excluded.aln"); // File alignmentFile = new File("/home/anders/projekt/alignments/SMALL-FLAVI-v7-dating.nuc.aed.ALL.protfnuc.mafft.glob.cod.seav.aln"); File alignmentFile = new File("/home/anders/projekt/alignments/testseq1.aln"); FileSequenceAlignmentListModel model = new FileSequenceAlignmentListModel(alignmentFile, FileFormat.CLUSTAL); } public ArrayList<Sequence> findSequencesInFile(MemoryMappedSequencesFile sequencesFile, long filePointerStart, int seqOffset, int nSeqsToRetrieve, SubThreadProgressWindow progressWin) throws AlignmentImportException { long startTime = System.currentTimeMillis(); ByteBufferInpStream mappedBuff = sequencesFile.getMappedBuff(); ArrayList<Sequence> sequences = new ArrayList<Sequence>(); try{ long fileSize = mappedBuff.length(); int longestSequenceLength = 0; mappedBuff.position(filePointerStart); MappedBuffReaderHelper readerHelper = new MappedBuffReaderHelper(mappedBuff); // Get newline char String firstLine = readerHelper.readLine(); int newlineLen = 1; if(firstLine.endsWith("\r\n")){ newlineLen = 2; }else{ newlineLen = 1; } // if not clustal file then it will throw error... boolean isRightFormat = isStringValidFirstLine(firstLine); if(! isRightFormat){ throw new AlignmentImportException("Could not read first line as clustal format"); } logger.info("inside clustal importer"); int formatType = INTERLEAVED_OR_SINGLELINE_SEQUENTIAL; if(formatType == INTERLEAVED_OR_SINGLELINE_SEQUENTIAL){ try{ List<String> seqNames = new ArrayList<String>(); List<StringBuilder> seqBuffers = new ArrayList<StringBuilder>(); // skip until start of seq readerHelper.skipUntilNextNonWhiteCharInFirstPosAfterNewLine(); long nameStartPointer = readerHelper.position(); readerHelper.setPosition(nameStartPointer); int lineCount = 0; // only for display int seqCount = 0; int seqPos = 0; // in clustal there can be a non blank row without name that contains preservation while(true){ // position sequence start (also name endpos) readerHelper.setPosition(nameStartPointer); long seqStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar(); // in clustal there are optional numbers after whitespace before line end long seqEndPointer = readerHelper.posOfNextWhitespaceOrLF() - 1; ClustalFileSequence seq = new ClustalFileSequence(sequencesFile, nameStartPointer); String name = readerHelper.readString(nameStartPointer, seqStartPointer - 1); name = name.trim(); seq.setName(name); int seqSeqmentLen = (int) (seqEndPointer - seqStartPointer + 1); seq.add(new PositionToPointer(seqPos,seqPos + seqSeqmentLen -1, seqStartPointer, seqEndPointer)); sequences.add(seq); seqCount ++; lineCount ++; if(lineCount % 100000 == 0){ progressWin.setMessage("Indexing interleaved Phylip file" + LF + "line:" + lineCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } // if the next name is after more than one linebreak - should be EOF or a round of interleaved sequence parts int linebreaks = readerHelper.skipUntilNextNonWhiteCharInFirstPosAfterNewLine(); nameStartPointer = readerHelper.position(); if(linebreaks > 1){ break; } } // and now append the inteleaved sequences while(true){ for(int n = 0; n < seqCount; n++){ // position sequence start (also name endpos) readerHelper.setPosition(nameStartPointer); // clustal has name on every interleaved line long interleavedStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar(); //logger.info("interleavedStartPointer" + interleavedStartPointer); // in clustal there are optional numbers after whitespace before line end long interleavedEndPointer = readerHelper.posOfNextWhitespaceOrLF() - 1; mappedBuff.position(interleavedEndPointer); int seqSeqmentLen = (int) (interleavedEndPointer - interleavedStartPointer +1); //logger.info("seqSeqmentLen" + seqSeqmentLen); ClustalFileSequence appendSeq = (ClustalFileSequence) sequences.get(n); int appendSeqPosition = appendSeq.getLength(); appendSeq.add(new PositionToPointer(appendSeqPosition, interleavedStartPointer, interleavedEndPointer)); // check that there is a next name on next line (without a empty line between) // otherwise break int linebreaks = readerHelper.skipUntilNextNonWhiteCharInFirstPosAfterNewLine(); nameStartPointer = readerHelper.position(); lineCount ++; if(lineCount % 100000 == 0){ progressWin.setMessage("Indexing interleaved Phylip file" + LF + "line:" + lineCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } // if the next name is after more than one linebreak - should be next round of interleaved sequence parts if(linebreaks > 1){ // logger.info("break"); break; } } } }catch(EOFException eof){ logger.info("hit EOF hopefully file is read OK"); // only log output // for(Sequence seq: sequences){ // logger.info(seq.getName() + " " + seq.getBasesAsString()); // } } } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } long endTime = System.currentTimeMillis(); System.out.println("reading sequences took " + (endTime - startTime) + " milliseconds"); return sequences; } public static boolean isStringValidFirstLine(String firstLine) { if(StringUtils.containsIgnoreCase(firstLine, "CLUSTAL")){ return true; } else{ return false; } } }