package aliview.importer; import java.io.BufferedReader; import java.io.EOFException; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.math.NumberUtils; import org.apache.log4j.Logger; import aliview.sequences.PhylipSequence; import aliview.sequences.Sequence; public class PhylipImporter { private static final Logger logger = Logger.getLogger(PhylipImporter.class); private Reader reader; private int longestSequenceLength; public FileFormat formatType; public static void main(String[] args) throws FileNotFoundException, AlignmentImportException { File alignmentFile = new File("/home/anders/projekt/alignments/smalphylipSeqShortName.phy"); PhylipImporter importer = new PhylipImporter(new FileReader(alignmentFile), FileFormat.PHYLIP_SHORT_NAME_INTERLEAVED); importer.importSequences(); } public PhylipImporter(Reader reader, FileFormat formatType) { this.reader = reader; this.formatType = formatType; } public List<Sequence> importSequences() throws AlignmentImportException { long startTime = System.currentTimeMillis(); List<Sequence> sequences = new ArrayList<Sequence>(); try { String sequenceString = ""; BufferedReader r = new BufferedReader(this.reader); String firstLine = r.readLine(); firstLine = firstLine.trim(); // if not phylip file then it will throw error... int seqCount = 0; String[] lineSplitted = firstLine.split("\\s+"); // one or many whitespace logger.info("splitSize" + lineSplitted.length); if(lineSplitted != null && lineSplitted.length == 2 && NumberUtils.isNumber(lineSplitted[0]) && NumberUtils.isNumber(lineSplitted[1]) ){ seqCount = Integer.parseInt(lineSplitted[0]); longestSequenceLength = Integer.parseInt(lineSplitted[1]); } else{ throw new AlignmentImportException("Could not read first line as phylip format"); } ReaderHelper helper = new ReaderHelper(r); logger.info("inside phy importer"); try{ if(formatType == FileFormat.PHYLIP_RELAXED_PADDED_INTERLEAVED_AKA_LONG_NAME_INTERLEAVED){ List<String> seqNames = new ArrayList<String>(); // since we already know sequence size then we can use ByteBuffer List<ByteBuffer> seqBuffers = new ArrayList<ByteBuffer>(); //List<StringBuilder> seqBuffers = new ArrayList<StringBuilder>(); // try long name sequential for(int n = 0; n <seqCount; n++){ // read lines of seq data helper.readNextLine(); String line = helper.getNextLine(); int index = ReaderHelper.indexOfFirstNonWhiteCharAfterWhiteChar(line); String name = line.substring(0, index).trim(); seqNames.add(name); logger.info("name" + name); logger.info("index" + index); int capacity = longestSequenceLength; ByteBuffer seqBuff = ByteBuffer.allocate(capacity); String seqChars = line.substring(index); seqChars = ReaderHelper.removeSpaceAndTab(seqChars); logger.info("seqChars" + seqChars); seqBuff.put(seqChars.getBytes()); seqBuffers.add(seqBuff); } while(true){ // loop through all sequences in order int lineCount = 0; while(lineCount < seqCount){ // read lines of seq data helper.readNextLine(); String line = helper.getNextLine(); int index = ReaderHelper.indexOfFirstNonWhiteChar(line); // Skip empty lines if(index == -1){ logger.info("skip empty"); }else{ String moreChars = line.substring(index); moreChars = ReaderHelper.removeSpaceAndTab(moreChars); ByteBuffer seqBuff = seqBuffers.get(lineCount); seqBuff.put(moreChars.getBytes()); lineCount ++; } } ByteBuffer seqBuff = seqBuffers.get(seqBuffers.size() - 1); // check to see if last sequence is filled then break if(seqBuff.position() == longestSequenceLength){ logger.info("right length"); // create sequences for(int n = 0; n <seqCount; n++){ //sequences.add(new PhylipSequence(seqNames.get(n), "")); sequences.add(new PhylipSequence(seqNames.get(n), seqBuffers.get(n).array())); seqNames.set(n,null); seqBuffers.set(n,null); } break; }else{ logger.info("seqBuff.position()" + seqBuff.position()); } if(seqBuff.position() > longestSequenceLength){ logger.info("wrong length"); throw new AlignmentImportException("Did not match Phylip.LONG_NAME_INTERLEAVED"); } } } if(formatType == FileFormat.PHYLIP_RELAXED_PADDED_AKA_LONG_NAME_SEQUENTIAL){ // try long name sequential for(int n = 0; n <seqCount; n++){ String name = helper.getStringUntilNextSpaceOrTab(); // read lines of seq data StringBuilder seqBuffer = new StringBuilder(longestSequenceLength); while(seqBuffer.length() < longestSequenceLength){ // read lines of seq data helper.readNextLine(); String line = helper.getNextLine(); line = ReaderHelper.removeSpaceAndTab(line); seqBuffer.append(line); } if(seqBuffer.length() != longestSequenceLength){ throw new AlignmentImportException("Did not match FileFormat.PHYLIP_RELAXED_PADDED_AKA_LONG_NAME_SEQUENTIAL"); } sequences.add(new PhylipSequence(name, seqBuffer.toString())); } } if(formatType == FileFormat.PHYLIP_STRICT_SEQUENTIAL_AKA_SHORT_NAME_SEQUENTIAL){ // try long name sequential for(int n = 0; n <seqCount; n++){ String name = helper.getStringFromNextPositions(10); byte[] sequence = helper.getNonWhiteBytes(longestSequenceLength); sequences.add(new PhylipSequence(name, sequence)); // if not last seq go on to nest name if(n != seqCount -1){ helper.skipPastNextline(); } } } if(formatType == FileFormat.PHYLIP_SHORT_NAME_INTERLEAVED){ // try long name sequential // first read names lines for(int n = 0; n <seqCount; n++){ // short name sequential String name = helper.getStringFromNextPositions(10); // read rest of line as seq data helper.readNextLine(); String line = helper.getNextLine(); line = ReaderHelper.removeSpaceAndTab(line); sequences.add(new PhylipSequence(name, line)); } // now read rest of sequences while(true){ // loop through all sequences in order for(int n = 0; n <seqCount; n++){ // read lines of seq data helper.readNextLine(); String line = helper.getNextLine(); line = ReaderHelper.removeSpaceAndTab(line); PhylipSequence seq = (PhylipSequence) sequences.get(n); seq.append(line); } PhylipSequence seq = (PhylipSequence) sequences.get(sequences.size() - 1); // check to see if last sequence is filled then break if(seq.getLength() == longestSequenceLength){ break; } } } }catch(EOFException eofExc){ // if import is ok there should not have been an EOF throw new AlignmentImportException("Premature End of file when importing"); } /* if(importerType == SHORT_NAME_INTERLEAVED){ // try short name sequential for(int n = 0; n <seqCount; n++){ String name = helper.getStringFromNextPositions(10); byte[] sequence = helper.getNonWhiteBytesUntilNewLine(longestSequenceLength); sequences.add(new PhylipSequence(name, sequence)); // if not last seq go on to nest name if(n != seqCount -1){ helper.skipPastNextline(); } } } */ // try long name interleaved // try short name sequential } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } long endTime = System.currentTimeMillis(); System.out.println("reading sequences took " + (endTime - startTime) + " milliseconds"); return sequences; } public int getLongestSequenceLength() { return longestSequenceLength; } public static boolean isStringValidFirstLine(String firstLine) { boolean isValid = false; if(firstLine.contains(" ")){ String[] lineSplitted = firstLine.split("\\s+"); // one or many whitespace if( NumberUtils.isNumber(lineSplitted[0]) && NumberUtils.isNumber(lineSplitted[1]) ){ isValid = true; } } return isValid; } public FileFormat getFileFormat() { // TODO Auto-generated method stub return null; } }