package aliview.importer; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.math.NumberUtils; import org.apache.log4j.Logger; import it.unimi.dsi.io.ByteBufferInpStream; import aliview.sequencelist.MemoryMappedSequencesFile; import aliview.sequences.FastaFileSequence; import aliview.sequences.FileSequence; import aliview.sequences.PhylipFileSequence; import aliview.sequences.PositionToPointer; import aliview.sequences.Sequence; import aliview.subprocesses.SubThreadProgressWindow; public class PhylipFileIndexer implements FileIndexer{ private static final String LF = System.getProperty("line.separator"); private static final Logger logger = Logger.getLogger(PhylipFileIndexer.class); long estimateTotalSeqInFile = 0; long fileSize = -1; private MappedBuffReaderHelper readerHelper; public FileFormat formatType; public ArrayList<Sequence> findSequencesInFile(MemoryMappedSequencesFile sequencesFile, long filePointerStart, int seqOffset, int nSeqsToRetrieve, SubThreadProgressWindow progressWin) throws AlignmentImportException { ByteBufferInpStream mappedBuff = sequencesFile.getMappedBuff(); ArrayList<Sequence> allSeqs = new ArrayList<Sequence>(); try{ this.fileSize = mappedBuff.length(); int longestSequenceLength = 0; mappedBuff.position(filePointerStart); readerHelper = new MappedBuffReaderHelper(mappedBuff); String firstLine = readerHelper.readLine(); int newlineLen = 1; if(firstLine.endsWith("\r")){ newlineLen = 2; }else{ newlineLen = 1; } firstLine = firstLine.trim(); // if not phylip file then it will throw error... int seqCount = 0; String[] lineSplitted = firstLine.split("\\s+"); // one or many whitespace logger.info("splitSize" + lineSplitted.length); if(lineSplitted != null && lineSplitted.length == 2 && NumberUtils.isNumber(lineSplitted[0]) && NumberUtils.isNumber(lineSplitted[1]) ){ seqCount = Integer.parseInt(lineSplitted[0]); longestSequenceLength = Integer.parseInt(lineSplitted[1]); } else{ throw new AlignmentImportException("Could not read first line as phylip format"); } long firstNameStartPointer = mappedBuff.position(); // determine file type FileFormat formatType = FileFormat.UNKNOWN; // Test PhylipImporter.LONG_NAME_SEQUENTIAL if(formatType == FileFormat.UNKNOWN){ try { mappedBuff.position(firstNameStartPointer); long seqStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar(); long seqEndPointerIfSequential = readerHelper.posAtNSequenceCharacters(seqStartPointer, longestSequenceLength); if(readerHelper.isNextLF()){ // probably long name sequential formatType = FileFormat.PHYLIP_RELAXED_PADDED_AKA_LONG_NAME_SEQUENTIAL; logger.info("probably long name sequential"); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } // Test PhylipImporter.SHORT_NAME_SEQUENTIAL if(formatType == FileFormat.UNKNOWN){ try { long seqEndPointerIfTenPosSequential = readerHelper.posAtNSequenceCharacters(firstNameStartPointer + 10, longestSequenceLength); if(readerHelper.isNextLF()){ // probably long name sequential formatType = FileFormat.PHYLIP_STRICT_SEQUENTIAL_AKA_SHORT_NAME_SEQUENTIAL; logger.info("probably short name sequential"); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } // if(formatType == FileFormat.UNKNOWN){ // if only one continous gap --> long interleaved try { mappedBuff.position(firstNameStartPointer); if(readerHelper.hasLineOnlyOneContinousSpace()){ formatType = FileFormat.PHYLIP_RELAXED_PADDED_INTERLEAVED_AKA_LONG_NAME_INTERLEAVED; } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } // finally set to PHYLIP_SHORT_NAME_INTERLEAVED. if(formatType == FileFormat.UNKNOWN){ formatType = FileFormat.PHYLIP_SHORT_NAME_INTERLEAVED; logger.info("probably short name interleaved"); } // load depending on file type if(formatType == FileFormat.PHYLIP_RELAXED_PADDED_INTERLEAVED_AKA_LONG_NAME_INTERLEAVED){ long nameStartPointer = firstNameStartPointer; readerHelper.setPosition(nameStartPointer); int lineCount = 0; for(int n = 0; n <seqCount; n++){ int seqPos = 0; // position sequence start (also name endpos) readerHelper.setPosition(nameStartPointer); long seqStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar(); // end of first line long firstNewlinePointer = readerHelper.posOfNextNewline(); PhylipFileSequence seq = new PhylipFileSequence(sequencesFile, nameStartPointer); String name = readerHelper.readString(nameStartPointer, seqStartPointer - 1); name = name.trim(); seq.setName(name); int seqSeqmentLen = (int) (firstNewlinePointer - newlineLen - seqStartPointer + 1); seq.add(new PositionToPointer(seqPos,seqPos + seqSeqmentLen -1, seqStartPointer, firstNewlinePointer - newlineLen)); nameStartPointer = firstNewlinePointer + newlineLen; allSeqs.add(seq); lineCount ++; if(lineCount % 1000 == 0){ progressWin.setMessage("Indexing interleaved Phylip file" + LF + "line:" + lineCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } } FileSequence lastPhylSeq = (FileSequence) allSeqs.get(allSeqs.size()-1); mappedBuff.position(lastPhylSeq.getEndPointer() + 1); // and now append the inteleaved sequences while(true){ long interleavedStartPointer = readerHelper.posOfNextNonWhiteChar(); //logger.info("interleavedStartPointer" + interleavedStartPointer); long interleavedEndLinePointer = readerHelper.posOfNextNewline() - newlineLen; //logger.info("interleavedEndLinePointer" + interleavedEndLinePointer); long interleavedNextStartPointer = readerHelper.posOfNextNonWhiteChar(); //logger.info("interleavedNextStartPointer" + interleavedNextStartPointer); long lengthBetweenTwoInterleaveStartPointer = interleavedNextStartPointer - interleavedStartPointer -1; int nextSeqPartStartPos = allSeqs.get(0).getLength(); // length is one more already than pointer //logger.info("nextSeqPartStartPos" + nextSeqPartStartPos); int seqSeqmentLen = (int) (interleavedEndLinePointer - interleavedStartPointer +1); //logger.info("seqSeqmentLen" + seqSeqmentLen); for(int n = 0; n < seqCount; n++){ PhylipFileSequence appendSeq = (PhylipFileSequence) allSeqs.get(n); appendSeq.add(new PositionToPointer(nextSeqPartStartPos,nextSeqPartStartPos + seqSeqmentLen -1, interleavedStartPointer, interleavedEndLinePointer)); interleavedStartPointer = interleavedStartPointer + lengthBetweenTwoInterleaveStartPointer +1; interleavedEndLinePointer = interleavedStartPointer + (seqSeqmentLen -1); } // break when full // logger.info("allSeqs.get(0).getLength()" + allSeqs.get(0).getLength()); // logger.info("longestSequenceLength" + longestSequenceLength); if(allSeqs.get(0).getLength() >= longestSequenceLength){ logger.info("done indexing"); break; }else{ FileSequence lastSeq = (FileSequence) allSeqs.get(allSeqs.size()-1); mappedBuff.position(lastSeq.getEndPointer() + 1); } lineCount ++; if(lineCount % 1000 == 0){ progressWin.setMessage("Indexing interleaved Phylip file" + LF + "line:" + lineCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } } } // load depending on file type if(formatType == FileFormat.PHYLIP_SHORT_NAME_INTERLEAVED){ long nameStartPointer = firstNameStartPointer; readerHelper.setPosition(nameStartPointer); int lineCount = 0; for(int n = 0; n <seqCount; n++){ int seqPos = 0; // position sequence start (also name endpos) readerHelper.setPosition(nameStartPointer); // This row is the only difference between SHORT AND LONG NAME INTERLEAVED long seqStartPointer = nameStartPointer + 10; // end of first line long firstNewlinePointer = readerHelper.posOfNextNewline(); // and number of non-char positions // spaces = readerHelper.countSpaceBetween(seqStartPos, firstNewlinePos); // if(spaces > 0){ // logger.info("spaces" + spaces); // } PhylipFileSequence seq = new PhylipFileSequence(sequencesFile, nameStartPointer); String name = readerHelper.readString(nameStartPointer, seqStartPointer - 1); name = name.trim(); seq.setName(name); int seqSeqmentLen = (int) (firstNewlinePointer - newlineLen - seqStartPointer + 1); seq.add(new PositionToPointer(seqPos,seqPos + seqSeqmentLen -1, seqStartPointer, firstNewlinePointer - newlineLen)); nameStartPointer = firstNewlinePointer + newlineLen; allSeqs.add(seq); lineCount ++; if(lineCount % 1000 == 0){ progressWin.setMessage("Indexing interleaved Phylip file" + LF + "line:" + lineCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } } FileSequence lastSeq = (FileSequence) allSeqs.get(allSeqs.size()-1); mappedBuff.position(lastSeq.getEndPointer() + 1); // and now append the inteleaved sequences while(true){ long interleavedStartPointer = readerHelper.posOfNextNonWhiteChar(); //logger.info("interleavedStartPointer" + interleavedStartPointer); long interleavedEndLinePointer = readerHelper.posOfNextNewline() - newlineLen; //logger.info("interleavedEndLinePointer" + interleavedEndLinePointer); long interleavedNextStartPointer = readerHelper.posOfNextNonWhiteChar(); //logger.info("interleavedNextStartPointer" + interleavedNextStartPointer); long lengthBetweenTwoInterleaveStartPointer = interleavedNextStartPointer - interleavedStartPointer -1; int nextSeqPartStartPos = allSeqs.get(0).getLength(); // length is one more already than pointer //logger.info("nextSeqPartStartPos" + nextSeqPartStartPos); int seqSeqmentLen = (int) (interleavedEndLinePointer - interleavedStartPointer +1); //logger.info("seqSeqmentLen" + seqSeqmentLen); for(int n = 0; n < seqCount; n++){ PhylipFileSequence appendSeq = (PhylipFileSequence) allSeqs.get(n); appendSeq.add(new PositionToPointer(nextSeqPartStartPos,nextSeqPartStartPos + seqSeqmentLen -1, interleavedStartPointer, interleavedEndLinePointer)); interleavedStartPointer = interleavedStartPointer + lengthBetweenTwoInterleaveStartPointer +1; interleavedEndLinePointer = interleavedStartPointer + (seqSeqmentLen -1); } // break when full // logger.info("allSeqs.get(0).getLength()" + allSeqs.get(0).getLength()); // logger.info("longestSequenceLength" + longestSequenceLength); if(allSeqs.get(0).getLength() >= longestSequenceLength){ logger.info("done indexing"); break; }else{ FileSequence theLastSeq = (FileSequence) allSeqs.get(allSeqs.size()-1); mappedBuff.position(theLastSeq.getEndPointer() + 1); } lineCount ++; if(lineCount % 1000 == 0){ progressWin.setMessage("Indexing interleaved Phylip file" + LF + "line:" + lineCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } } } // load depending on file type if(formatType == FileFormat.PHYLIP_RELAXED_PADDED_AKA_LONG_NAME_SEQUENTIAL){ logger.info("PhylipImporter.LONG_NAME_SEQUENTIAL"); // get all names and initial sequences long nameStartPointer = firstNameStartPointer; readerHelper.setPosition(nameStartPointer); // position sequence start (also name endpos) long seqStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar(); // endpointer long sequentialEndPointer = readerHelper.posAtNSequenceCharacters(seqStartPointer, longestSequenceLength); // sequence-length-in-pointers int seqSeqmentLen = (int) (sequentialEndPointer - seqStartPointer + 1); // length is +1 for(int n = 0; n <seqCount; n++){ int seqPos = 0; mappedBuff.position(nameStartPointer); seqStartPointer = readerHelper.posOfFirstNonWhiteCharAfterWhiteChar(); // just calculate - don't read sequentialEndPointer = seqStartPointer + seqSeqmentLen -1; PhylipFileSequence seq = new PhylipFileSequence(sequencesFile, nameStartPointer); String name = readerHelper.readString(nameStartPointer, seqStartPointer - 1); name = name.trim(); seq.setName(name); seq.add(new PositionToPointer(seqPos,seqPos + seqSeqmentLen -1, seqStartPointer, sequentialEndPointer)); allSeqs.add(seq); // move forward nameStartPointer = sequentialEndPointer + newlineLen + 1; if(n % 1000 == 0){ progressWin.setMessage("Indexing Phylip sequential file" + LF + "seq:" + n + "/" + seqCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } } } // load depending on file type if(formatType == FileFormat.PHYLIP_STRICT_SEQUENTIAL_AKA_SHORT_NAME_SEQUENTIAL){ logger.info("PhylipImporter.SHORT_NAME_SEQUENTIAL"); // get all names and initial sequences long nameStartPointer = firstNameStartPointer; readerHelper.setPosition(nameStartPointer); // position sequence start (also name endpos) long seqStartPointer = nameStartPointer + 10; // endpointer long sequentialEndPointer = readerHelper.posAtNSequenceCharacters(seqStartPointer, longestSequenceLength); // sequence-length-in-pointers int seqSeqmentLen = (int) (sequentialEndPointer - seqStartPointer + 1); // length is +1 for(int n = 0; n <seqCount; n++){ int seqPos = 0; mappedBuff.position(nameStartPointer); seqStartPointer = nameStartPointer + 10; // just calculate - don't read sequentialEndPointer = seqStartPointer + seqSeqmentLen -1; PhylipFileSequence seq = new PhylipFileSequence(sequencesFile, nameStartPointer); String name = readerHelper.readString(nameStartPointer, seqStartPointer - 1); name = name.trim(); seq.setName(name); seq.add(new PositionToPointer(seqPos,seqPos + seqSeqmentLen -1, seqStartPointer, sequentialEndPointer)); allSeqs.add(seq); // move forward nameStartPointer = sequentialEndPointer + newlineLen + 1; if(n % 1000 == 0){ progressWin.setMessage("Indexing Phylip sequential file" + LF + "seq:" + n + "/" + seqCount); if(progressWin.wasSubThreadInterruptedByUser()){ break; } } } } }catch(Exception exc){ logger.info("could not read as phylip"); exc.printStackTrace(); throw new AlignmentImportException("Could not read phylip format"); } return allSeqs; } }