/*
* Genoogle: Similar DNA Sequences Searching Engine and Tools. (http://genoogle.pih.bio.br)
* Copyright (C) 2008,2009,2010,2011,2012 Felipe Fernandes Albrecht (felipe.albrecht@gmail.com)
*
* For further information check the LICENSE file.
*/
package bio.pih.genoogle.io.reader;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import bio.pih.genoogle.seq.Alphabet;
/**
* An extended FastaFormat for LightweightSymbolList. Strongly based on
* http://code.open-bio.org/svnweb
* /index.cgi/biojava/view/biojava-live/trunk/src/org/biojavax/bio/seq/io/FastaFormat.java?rev=4800
*
* @author albrecht
*
*/
public class FastaFormat implements RichSequenceFormat {

    // TODO: Put this value as parameter.
    /** Read-ahead limit (in characters) handed to {@link BufferedReader#mark(int)} before every body line: 1M. */
    private static final int READ_AHEAD_LIMIT = 1 << 20;

    static Logger logger = Logger.getLogger(FastaFormat.class.getName());

    protected static final Pattern hp = Pattern.compile(">(\\S+)(\\s+(.*))?");

    protected static final Pattern dp = Pattern.compile("^(gi\\|(\\d+)\\|)*(\\S+)\\|(\\S+?)(\\.(\\d+))*\\|(\\S+)");

    /** Matches any run of whitespace left inside the collected sequence text. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Matches the characters that are rewritten to '-' in the sequence text. Inside the character class
     * the '|' is a literal, so '.', '|' and '~' are all replaced.
     */
    private static final Pattern GAP_SYMBOLS = Pattern.compile("[\\.|~]");

    private final Alphabet alphabet;

    /**
     * @param alphabet
     *            the alphabet set on every sequence handed to the builder.
     */
    public FastaFormat(Alphabet alphabet) {
        this.alphabet = alphabet;
    }

    /**
     * Reads one FASTA record (a header line followed by its sequence lines) from the reader and feeds it
     * to the builder.
     *
     * Blank lines before the header are skipped. Sequence lines are collected until the next line whose
     * first non-whitespace character is '>' or until the end of the stream. When a following header is
     * found, the reader is reset to the mark taken before that line, so the next call starts on it.
     *
     * @param reader
     *            source of the FASTA text; it must support mark/reset.
     * @param builder
     *            receives the header fields, the sequence text and the alphabet.
     * @return <code>true</code> if another header line follows this record, <code>false</code> if the
     *         end of the stream was reached.
     * @throws IOException
     *             if the stream ends before a header is found, if the first non-blank line does not
     *             start with '>', or if the following header line is longer than the read-ahead limit.
     * @throws ParseException
     *             declared for header processing; see {@link #processHeader(String, RichSequenceBuilder)}.
     */
    public boolean readRichSequence(BufferedReader reader, RichSequenceBuilder builder) throws IOException,
            ParseException {
        String line = readHeaderLine(reader);
        if (!line.startsWith(">")) {
            throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
        }

        builder.startSequence();
        processHeader(line, builder);
        builder.setHeader(line.substring(1));

        // Only used by this call, so the unsynchronized builder is sufficient.
        StringBuilder seq = new StringBuilder();
        boolean hasMoreSeq = true;
        while (hasMoreSeq) {
            // Remember the position before the line, so a header belonging to the next record can be "unread".
            reader.mark(READ_AHEAD_LIMIT);
            // TODO: Not read with readline, but read and put into a buffer.
            line = reader.readLine();
            if (line == null) {
                hasMoreSeq = false;
            } else {
                line = line.trim();
                if (line.length() > 0 && line.charAt(0) == '>') {
                    if (logger.isDebugEnabled()) {
                        logger.debug("New header: '" + line + "'.");
                    }
                    if (line.length() > READ_AHEAD_LIMIT) {
                        throw new IOException("Sequence header length ("+line.length()+") too long. The limit is " + READ_AHEAD_LIMIT+ ".");
                    }
                    reader.reset();
                    hasMoreSeq = false;
                } else {
                    seq.append(line);
                }
            }
        }

        String residues = WHITESPACE.matcher(seq).replaceAll("");
        residues = GAP_SYMBOLS.matcher(residues).replaceAll("-");
        builder.setSequence(residues);
        builder.setAlphabet(alphabet);
        builder.endSequence();

        // line is null only when the loop stopped at the end of the stream.
        return line != null;
    }

    /**
     * Reads lines until one contains a non-whitespace character and returns it without its leading
     * whitespace. The body loop of {@link #readRichSequence(BufferedReader, RichSequenceBuilder)}
     * recognises the next header after trimming, so the header read must tolerate the same leading
     * whitespace, otherwise an indented header would end one record and then be rejected.
     * Trailing whitespace is kept as is.
     *
     * @throws IOException
     *             if the stream ends before such a line is found.
     */
    private static String readHeaderLine(BufferedReader reader) throws IOException {
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                throw new IOException("Premature stream end");
            }
            int start = 0;
            // Same notion of whitespace as String.trim(): every char up to and including ' '.
            while (start < line.length() && line.charAt(start) <= ' ') {
                start++;
            }
            if (start < line.length()) {
                return line.substring(start);
            }
        }
    }

    /**
     * GenBank gi|gi-number|gb|accession|locus
     * EMBL Data Library gi|gi-number|emb|accession|locus
     * DDBJ, DNA Database of Japan gi|gi-number|dbj|accession|locus
     */
    protected static final Pattern giHeader = Pattern.compile(">gi\\|(\\d+)\\|(\\S+)\\|(\\S+)\\|(\\s+(.*))?");

    /**
     * Local Sequence identifier lcl|identifier
     */
    protected static final Pattern lclHeader = Pattern.compile(">lcl\\|(\\S+)(\\|(\\s*(.*))?)?");

    /**
     * >EMBLCDS:BAJ49870 BAJ49870.1 Candidatus Caldiarchaeum subterraneum archaeal cell division control protein 6
     */
    protected static final Pattern emblHeader = Pattern.compile(">(\\S+):(\\S+)(\\s+)(\\S+\\.\\d)(\\s+)(\\S+(.*))");

    /**
     * >contig00001_1 length=19730
     */
    protected static final Pattern ecoli = Pattern.compile(">contig(\\S+)(\\s+)(\\S+(.*))");

    /**
     * Extracts type, name, gi, accession and description from a header line and sets them on the builder.
     *
     * The line is matched, in this order, against {@link #giHeader}, {@link #lclHeader},
     * {@link #emblHeader} and {@link #ecoli}; the first pattern that matches the whole line decides which
     * fields are set. If none matches, the text after the leading '>' is split on '|' and the fields are
     * assigned by position, the last piece being the description.
     *
     * @param line
     *            the header line, including its leading '>'.
     * @param sequenceBuilder
     *            receives the extracted fields.
     * @throws IOException
     *             declared by the signature; not thrown by the code in this method itself.
     * @throws ParseException
     *             declared by the signature; not thrown by the code in this method itself.
     */
    public void processHeader(String line, RichSequenceBuilder sequenceBuilder) throws IOException, ParseException {
        Matcher matcher = giHeader.matcher(line);
        if (matcher.matches()) {
            sequenceBuilder.setType("gi");
            sequenceBuilder.setGi(matcher.group(1));
            sequenceBuilder.setName(matcher.group(2));
            sequenceBuilder.setAccession(matcher.group(3));
            // NOTE(review): group 4 still carries the whitespace that follows the last '|' (group 5 is the
            // text without it); kept as is because stored descriptions may depend on it - confirm.
            sequenceBuilder.setDescription(matcher.group(4));
            return;
        }

        matcher = lclHeader.matcher(line);
        if (matcher.matches()) {
            sequenceBuilder.setType("lcl");
            sequenceBuilder.setName(matcher.group(1));
            // May be null when the header has no '|description' part.
            sequenceBuilder.setDescription(matcher.group(3));
            return;
        }

        matcher = emblHeader.matcher(line);
        if (matcher.matches()) {
            sequenceBuilder.setType(matcher.group(1));
            sequenceBuilder.setGi(matcher.group(2));
            sequenceBuilder.setName(matcher.group(4));
            sequenceBuilder.setDescription(matcher.group(6));
            return;
        }

        matcher = ecoli.matcher(line);
        if (matcher.matches()) {
            sequenceBuilder.setType("contig");
            sequenceBuilder.setName(matcher.group(1));
            sequenceBuilder.setDescription(matcher.group(3));
            return;
        }

        // Generic fallback: drop the '>' and assign the '|' separated pieces by position.
        line = line.substring(1);
        String[] strings = line.split("\\|");

        // split() drops trailing empty strings, so a header made only of '|' characters yields an empty
        // array; handle it like a header without any separator instead of indexing into it.
        if (strings.length <= 1) {
            sequenceBuilder.setDescription(line);
            // The spelling of this type value is kept exactly as it always was.
            sequenceBuilder.setType("custon");
        } else {
            sequenceBuilder.setType(strings[0]);
        }

        if (strings.length > 2) {
            sequenceBuilder.setName(strings[1]);
        }

        if (strings.length > 3) {
            sequenceBuilder.setGi(strings[2]);
        }

        if (strings.length > 4) {
            sequenceBuilder.setAccession(strings[3]);
        }

        if (strings.length > 1) {
            sequenceBuilder.setDescription(strings[strings.length - 1]);
        }
    }

    // TODO:
    // NBRF PIR pir||entry
    // Patents pat|country|number
    // GenInfo Backbone Id bbs|number
    // General database identifier gnl|database|identifier
    // NCBI Reference Sequence ref|accession|locus
}