package org.seqcode.tools.sequence;
import java.io.*;
import org.seqcode.data.io.parsing.FASTAStream;
import org.seqcode.gseutils.Args;
/**
* Produces random read sequences (fixed length) in FASTA format
*
*
*/
public class RandomReadGenerator {
public static final char[] letters = {'A','C','T','G'};
public static void main(String args[]) throws IOException {
String fastafile = Args.parseString(args,"fasta",null);
/* uniform probability of error at each base */
double errorP = Args.parseDouble(args,"uniformerrorp",0);
/* in solexa model, error probability is
base + multiplier * ((exp) ^ length)
Based on the data in Dohm et al (2008),
base = .005, multiplier = .0001, and exp = 1.2
their data only went to 32bp, so this may not be accurate for long reads
*/
double solexaErrorBaseP = Args.parseDouble(args,"solexabase",0);
double solexaErrorMultiplier = Args.parseDouble(args,"solexamultiplier",0);
double solexaErrorExp = Args.parseDouble(args,"solexaexp",0);
// read length
int n = Args.parseInteger(args, "n", 25);
/* can specify either absolute number of reads per sequence in the FASTA file
* or an amount of coverage
*/
int nreads = Args.parseInteger(args,"nreads",100);
double coverage = Args.parseDouble(args,"coverage",Double.NaN);
BufferedReader reader;
if (fastafile == null || fastafile.equals("-")) {
reader = new BufferedReader(new InputStreamReader(System.in));
} else {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(fastafile)));
}
cern.jet.random.engine.RandomEngine engine = new cern.jet.random.engine.DRand();
cern.jet.random.Normal norm = new cern.jet.random.Normal(0,coverage,engine);
cern.jet.random.Uniform uniform = new cern.jet.random.Uniform(0.0,1.0,engine);
cern.jet.random.Uniform uniform4 = new cern.jet.random.Uniform(0.0,3,engine);
FASTAStream stream = new FASTAStream(reader);
int readcount = 0;
while (stream.hasNext()) {
String seq = stream.next().cdr();
int reads;
if (Double.isNaN(coverage)) {
reads = nreads;
} else {
reads = (int) (seq.length() * norm.nextDouble());
}
for (int i = 0; i < reads; i++) {
int start = (int)((seq.length() - (n+1)) * Math.random());
String name = String.format("read_%d",readcount++);
char[] read = seq.substring(start,start+n).toCharArray();
if (errorP > 0) {
for (int j = 0; j < read.length; j++) {
if (uniform.nextDouble() < errorP) {
char old = read[j];
while (old == (read[j] = letters[uniform.nextIntFromTo(0,3)])) { }
}
}
} else if (solexaErrorBaseP > 0) {
for (int j = 0; j < read.length; j++) {
double p = solexaErrorBaseP + solexaErrorMultiplier * Math.pow(solexaErrorExp, j);
// System.err.println(String.format("%d -> %f",j,p));
if (j > 0 && read[j-1] == 'G') {
p *= 2;
}
if (uniform.nextDouble() < p) {
// System.err.println("Mutating base " + j + " of " + i + " from " + read[j]);
char old = read[j];
while (old == (read[j] = letters[uniform.nextIntFromTo(0,3)])) { }
}
}
}
System.out.println(">" + name + "\n" + String.valueOf(read));
}
}
}
}