package aliview.importer;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import aliview.MemoryUtils;
import aliview.sequences.MSFSequence;
import aliview.sequences.PhylipSequence;
import aliview.sequences.Sequence;
/**
 * Reads a multiple sequence alignment written in MSF format (as produced by
 * GCG/PileUp style tools) and turns it into a list of {@link Sequence} objects.
 *
 * <p>The importer expects, in order: a first line recognised by
 * {@link #isStringValidFirstLine(String)}, a header line containing {@code MSF:},
 * one or more {@code Name:} lines, a {@code //} separator and then one or more
 * blocks of sequence lines of the form {@code <name> <residues>}.
 *
 * <p>The supplied {@link Reader} is wrapped in a {@link BufferedReader} but is not
 * closed by this class; the caller remains responsible for closing it.
 */
public class MSFImporter {

    private static final Logger logger = Logger.getLogger(MSFImporter.class);

    /** Format type value for a layout that has not been determined. */
    public static final int UNKNOWN = -1;

    /**
     * Format type value for the only layout this importer reads.
     * NOTE(review): used as a constant; left non-final so the public field keeps
     * its original declaration.
     */
    public static int INTERLEAVED_OR_SINGLELINE_SEQUENTIAL = 0;

    /** Message prefix shared by every import failure raised from this class. */
    private static final String NOT_MSF_MESSAGE = "Could not read file as MSF format";

    /** File used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_EXAMPLE_FILE = "/home/anders/projekt/alignments/MSF_format.example.msf";

    /** Modulus applied to the GCG checksum. */
    private static final int CHECKSUM_MODULUS = 10000;

    /** Number of positions after which the GCG checksum position weight wraps around. */
    private static final int CHECKSUM_WEIGHT_CYCLE = 57;

    private Reader reader;

    /** Layout the importer should assume; see the format type constants of this class. */
    public int formatType;

    /**
     * Small manual test driver: imports one MSF file.
     *
     * @param args optional; {@code args[0]} is the path of the file to import. When
     *             absent the historical example path is used.
     * @throws FileNotFoundException    if the file cannot be opened
     * @throws AlignmentImportException if the file cannot be read as MSF
     */
    public static void main(String[] args) throws FileNotFoundException, AlignmentImportException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_EXAMPLE_FILE;
        File alignmentFile = new File(path);
        FileReader fileReader = new FileReader(alignmentFile);
        try {
            MSFImporter importer = new MSFImporter(fileReader, INTERLEAVED_OR_SINGLELINE_SEQUENTIAL);
            importer.importSequences();
        } finally {
            // main created the reader, so main closes it.
            try {
                fileReader.close();
            } catch (IOException e) {
                logger.warn("Could not close " + path, e);
            }
        }
    }

    /**
     * Creates an importer reading from {@code reader}.
     *
     * @param reader     source of the MSF text; not closed by this class
     * @param formatType one of the format type constants of this class
     */
    public MSFImporter(Reader reader, int formatType) {
        this.reader = reader;
        this.formatType = formatType;
    }

    /**
     * Creates an importer for the {@link #INTERLEAVED_OR_SINGLELINE_SEQUENTIAL} layout.
     *
     * @param fileReader source of the MSF text; not closed by this class
     */
    public MSFImporter(FileReader fileReader) {
        this(fileReader, INTERLEAVED_OR_SINGLELINE_SEQUENTIAL);
    }

    /**
     * Parses the reader given at construction time.
     *
     * @return the sequences in file order; empty when {@link #formatType} is not
     *         {@link #INTERLEAVED_OR_SINGLELINE_SEQUENTIAL} or no sequence lines were found
     * @throws AlignmentImportException if the first line is not recognised, a required
     *                                  header line is missing or malformed, the data ends
     *                                  in the middle of a block, or reading fails
     */
    public List<Sequence> importSequences() throws AlignmentImportException {
        long startTime = System.currentTimeMillis();
        List<Sequence> sequences = new ArrayList<Sequence>();
        try {
            ReaderHelper helper = new ReaderHelper(new BufferedReader(this.reader));
            helper.readNextLine();
            if (!isStringValidFirstLine(helper.getNextLine())) {
                throw new AlignmentImportException(NOT_MSF_MESSAGE);
            }

            // The "MSF:" header line carries the alignment length, used only as a
            // capacity hint for the per-sequence buffers.
            if (!helper.readUntilNextLineContains("MSF:")) {
                throw new AlignmentImportException(NOT_MSF_MESSAGE + ", no \"MSF:\" header line found");
            }
            int guessedLength = parseAlignmentLength(helper.getNextLine());
            logger.info("guessedLength" + guessedLength);

            // The first "Name:" line tells us which name starts every sequence block.
            if (!helper.readUntilNextLineContains("Name:")) {
                throw new AlignmentImportException(NOT_MSF_MESSAGE + ", no \"Name:\" line found");
            }
            String firstName = parseSequenceName(helper.getNextLine());
            if (firstName.length() == 0) {
                throw new AlignmentImportException(NOT_MSF_MESSAGE + ", first \"Name:\" line has no name");
            }
            logger.info("inside msf importer, firstName = " + firstName);

            if (formatType == INTERLEAVED_OR_SINGLELINE_SEQUENTIAL) {
                readSequenceBlocks(helper, firstName, guessedLength, sequences);
            }
        } catch (IOException e) {
            // Report the failure instead of returning an empty list that looks like success.
            logger.error("I/O error while reading MSF alignment", e);
            throw new AlignmentImportException(NOT_MSF_MESSAGE + ": " + e.getMessage());
        }
        long endTime = System.currentTimeMillis();
        logger.info("reading sequences took " + (endTime - startTime) + " milliseconds");
        return sequences;
    }

    /**
     * Reads every sequence block following the header and appends the resulting
     * sequences to {@code sequences}.
     *
     * <p>The first block defines the sequence names and their order; each later block
     * (interleaved files) is assumed to list the same sequences in the same order and
     * is appended line by line.
     */
    private void readSequenceBlocks(ReaderHelper helper, String firstName, int guessedLength,
            List<Sequence> sequences) throws IOException, AlignmentImportException {
        List<String> seqNames = new ArrayList<String>();
        List<ByteBufferAutogrow> seqBuffers = new ArrayList<ByteBufferAutogrow>();

        // Skip the remaining header and position on the first sequence line.
        helper.readUntilNextLineContains("//");
        helper.readUntilNextLineContains(firstName);

        // First block: runs until the first line without any non-whitespace characters.
        while (helper.isNextLineContainingNonWhitespaceChars()) {
            String line = helper.getNextLine().trim();
            int index = indexOfSequenceStart(line);
            seqNames.add(line.substring(0, index).trim());
            ByteBufferAutogrow seqBuff = new ByteBufferAutogrow(guessedLength);
            seqBuff.append(cleanSequenceChars(line.substring(index)));
            seqBuffers.add(seqBuff);
            helper.readNextLine();
        }
        int seqCount = seqNames.size();

        // Interleaved files repeat the block; every block starts with the first name.
        while (helper.readUntilNextLineContains(firstName)) {
            for (int lineCount = 0; lineCount < seqCount; lineCount++) {
                String line = helper.getNextLine();
                if (line == null) {
                    // Defensive: the data ended before the block listed every sequence.
                    throw new AlignmentImportException(NOT_MSF_MESSAGE
                            + ", unexpected end of data inside a sequence block");
                }
                line = line.trim();
                int index = indexOfSequenceStart(line);
                seqBuffers.get(lineCount).append(cleanSequenceChars(line.substring(index)));
                helper.readNextLine();
            }
        }

        for (int n = 0; n < seqCount; n++) {
            sequences.add(new MSFSequence(seqNames.get(n), seqBuffers.get(n).getBytes()));
            // Drop the references right away so the buffers can be reclaimed.
            seqNames.set(n, null);
            seqBuffers.set(n, null);
        }
    }

    /**
     * Extracts the alignment length that follows {@code MSF:} on the header line.
     *
     * @return the parsed length, or 0 when it is missing, negative or not a number
     *         (the value is only a buffer capacity hint, so this is not fatal)
     */
    private static int parseAlignmentLength(String metaLine) {
        String[] tokens = StringUtils.split(StringUtils.substringAfter(metaLine, "MSF:"));
        if (tokens == null || tokens.length == 0) {
            logger.warn("No alignment length found in MSF header line: " + metaLine);
            return 0;
        }
        try {
            return Math.max(0, Integer.parseInt(tokens[0]));
        } catch (NumberFormatException e) {
            logger.warn("Could not parse alignment length in MSF header line: " + metaLine, e);
            return 0;
        }
    }

    /**
     * Extracts the sequence name, i.e. the first whitespace-delimited token after
     * {@code Name:}.
     *
     * @return the name, or an empty string when the line carries none
     */
    private static String parseSequenceName(String nameLine) {
        String[] tokens = StringUtils.split(StringUtils.substringAfter(nameLine, "Name:"));
        if (tokens == null || tokens.length == 0) {
            return "";
        }
        return tokens[0];
    }

    /**
     * Finds where the residues start on a trimmed sequence line and rejects an index
     * that cannot be used with {@link String#substring(int)}.
     */
    private static int indexOfSequenceStart(String line) throws AlignmentImportException {
        int index = ReaderHelper.indexOfFirstNonWhiteCharAfterWhiteChar(line);
        if (index < 0 || index > line.length()) {
            throw new AlignmentImportException(NOT_MSF_MESSAGE + ", unexpected sequence line: " + line);
        }
        return index;
    }

    /** Strips spaces and tabs from a residue chunk and normalises the MSF gap characters. */
    private String cleanSequenceChars(String seqChars) {
        return replaceMSFGapCharacters(ReaderHelper.removeSpaceAndTab(seqChars));
    }

    /**
     * Replaces the MSF gap characters {@code '.'} and {@code '~'} with {@code '-'}.
     * {@link String#replace(char, char)} returns the same string when nothing matches,
     * so no prior search is needed.
     */
    private String replaceMSFGapCharacters(String seqChars) {
        return seqChars.replace('.', '-').replace('~', '-');
    }

    /**
     * Tells whether {@code firstLine} looks like the first line of an MSF file.
     *
     * @param firstLine first line of the file; may be {@code null}
     * @return {@code true} if the line contains {@code "!!"} or, ignoring case,
     *         {@code "PileUp"}
     */
    public static boolean isStringValidFirstLine(String firstLine) {
        return StringUtils.contains(firstLine, "!!") || StringUtils.containsIgnoreCase(firstLine, "PileUp");
    }

    /**
     * Computes the GCG checksum of a sequence: the sum over all positions of the
     * ASCII upper-cased residue multiplied by a weight cycling through 1..57,
     * reduced modulo 10000.
     *
     * <p>This method is copied and modified from iubio.readseq.
     *
     * @param seq sequence to checksum
     * @return the checksum
     */
    public static int GCGchecksum(Sequence seq) {
        // Accumulate in a long: each position adds up to 127 * 57, which overflows
        // an int on sequences of a few hundred thousand residues.
        long check = 0;
        for (int n = 0; n < seq.getLength(); n++) {
            byte byteVal = seq.getBaseAtPos(n);
            int val = Character.toLowerCase(byteVal);
            if (val >= 'a' && val <= 'z') {
                val -= 32; // 'a'..'z' -> 'A'..'Z'
            }
            int positionMultiplier = n % CHECKSUM_WEIGHT_CYCLE + 1;
            check += (long) val * positionMultiplier;
        }
        return (int) (check % CHECKSUM_MODULUS);
    }
}