package org.seqcode.deepseq.utils.simulation;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import org.seqcode.gseutils.ArgParser;
import org.seqcode.math.stats.NegativeBinomialDistrib;
/**
* CountDataSimulator: generate a two-condition CountsDataset file given an empirically observed dataset and some parameters
* @author Shaun Mahony
* @version %I%, %G%
*/
public class CountDataSimulator {
//Parameters
private int numReplicates=1;
private int numConditions=2;
private int numDataPoints = 10000;
private Double alpha = 0.15; //Parameter for biological variance. Poisson distribution (technical variance only) is alpha=0
private Double upRegFrac=0.1; //Fraction of genes that are up-regulated B vs A
private Double downRegFrac=0.1; //Fraction of genes that are down-regulated B vs A
private Double diffExpLevel =2.0; //Base fold-change of differentially regulated genes
private Double condATotalReads=10000000.0;
private Double condBTotalReads=20000000.0;
//Variables
private ArrayList<Double> empirical = new ArrayList<Double>();
//Output
double[][] counts;
double[][] extraCounts;
List<SimCounts> simResults = new ArrayList<SimCounts>();
//Constructor
public CountDataSimulator(){}
//Modifiers
public void setDataPoints(int nd){numDataPoints=nd;}
public void setConditions(int c){numConditions=c;}
public void setReplicates(int r){numReplicates=r;}
//public void setConditions(int c){numConditions=c;}
public void setAlpha(Double a){alpha=a;}
public void setReads(Double r){condATotalReads = r; condBTotalReads = r;}
public void setReadsA(Double r){condATotalReads = r;}
public void setReadsB(Double r){condBTotalReads = r;}
public void setUpRegFrac(Double f){upRegFrac=f;}
public void setDownRegFrac(Double f){downRegFrac=f;}
public void setDiffExpLevel(Double d){diffExpLevel = d;}
/**
* Load an empirical dataset from a file
* @param filename
*/
public void loadEmpiricalFromFile(String filename){
try{
File pFile = new File(filename);
if(!pFile.isFile()){System.err.println("Invalid file name");System.exit(1);}
BufferedReader reader = new BufferedReader(new FileReader(pFile));
String line= reader.readLine(); //Skip first line
while ((line = reader.readLine()) != null) {
line = line.trim();
if(!line.startsWith("#")){
String[] words = line.split("\\s+");
double e = new Double(words[1]);
empirical.add(e);
}
}reader.close();
Collections.sort(empirical);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Simulate counts using a negative binomial model
* @param outFilename
*/
public List<SimCounts> simulate(){
counts = new double[numDataPoints][numConditions*numReplicates];
extraCounts = new double[numDataPoints][numConditions*numReplicates];
double[] absolutes = new double[numDataPoints];
simResults = new ArrayList<SimCounts>();
boolean [] diffs = new boolean[numDataPoints];
Random rand = new Random();
NegativeBinomialDistrib nb = new NegativeBinomialDistrib(1,0.5);
double [] condAMols = new double[numDataPoints];
double [] condBMols = new double[numDataPoints];
double [] condMolTotals = new double[numConditions];
double [] sampleReadTotals = new double[numConditions*numReplicates];
double [] sampleReadsPerMols = new double[numConditions*numReplicates];
double absTotal = 0;
int x=0;
for(int c=0; c<numConditions; c++){
condMolTotals[c]=0;
for(int r=0; r<numReplicates; r++){
sampleReadTotals[x]=0;
sampleReadsPerMols[x]=0;
x++;
}
}
//Assume that empirical holds the absolute molecule counts
//Generate new molecule counts for two conditions
for(int d=0; d<numDataPoints; d++){
//Choose a binding event strength
double dice = rand.nextDouble();
int strengthIndex = (int)(dice*(double)empirical.size());
double empMol = empirical.get(strengthIndex);
absolutes[d] = empMol;
double currAMol = empMol;
double currBMol = empMol;
dice = rand.nextDouble();
//Is this gene differentially expressed in this condition? (w.r.t condition A)
//Implemented a little strangely to avoid skew
diffs[d] = true;
if(dice<(downRegFrac/2))
currAMol=currAMol*diffExpLevel;
else if(dice<downRegFrac)
currBMol=currBMol/diffExpLevel;
else if(dice<(downRegFrac+upRegFrac/2))
currAMol=currAMol/diffExpLevel;
else if(dice<(downRegFrac+upRegFrac))
currBMol=currBMol*diffExpLevel;
else
diffs[d] = false;
condAMols[d]=currAMol;
condBMols[d]=currBMol;
condMolTotals[0]+=currAMol;
if(numConditions==2)
condMolTotals[1]+=currBMol;
absTotal+=empMol;
}
//For each condition
for(int c=0; c<numConditions; c++){
//Sample counts at the appropriate concentrations
for(int d=0; d<numDataPoints; d++){
double conc = c==0 ? condAMols[d]/condMolTotals[0] : condBMols[d]/condMolTotals[1];
double mean = c==0 ? (conc*condATotalReads) : (conc*condBTotalReads);
double var = mean+(mean*mean*alpha);
nb.setMeanVar(mean, var);
int sample = c*numReplicates;
for(int r=0; r<numReplicates; r++){
counts[d][sample]=(double)nb.nextInt();
sampleReadTotals[sample]+=counts[d][sample];
sample++;
}
}
}
//Extra counts around the absolute conc
for(int d=0; d<numDataPoints; d++){
double conc = absolutes[d]/absTotal;
for(int c=0; c<numConditions; c++){
double mean = c==0 ? (conc*condATotalReads) : (conc*condBTotalReads);
double var = mean+(mean*mean*alpha);
nb.setMeanVar(mean, var);
int sample= c*numReplicates;
for(int r=0; r<numReplicates; r++){
extraCounts[d][sample]=(double)nb.nextInt();
sample++;
}
}
}
//Make simulated results data structure
for(int d=0; d<numDataPoints; d++){
SimCounts sim = new SimCounts();
sim.absolute = absolutes[d];
sim.counts=counts[d];
sim.backup = extraCounts[d];
sim.isDiff=diffs[d];
simResults.add(sim);
}
//Reads per Mol ratios
System.out.println("CountDataSimulator: Reads per Mol ratios\n");
x=0;
for(int c=0; c<numConditions; c++){
for(int r=0; r<numReplicates; r++){
System.out.print("\t"+c+":"+r);
sampleReadsPerMols[x]=sampleReadTotals[x]/condMolTotals[c];
x++;
}
}System.out.println("");
x=0;
for(int c1=0; c1<numConditions; c1++){
for(int r1=0; r1<numReplicates; r1++){
int y=0;
System.out.print(c1+":"+r1);
for(int c2=0; c2<numConditions; c2++){
for(int r2=0; r2<numReplicates; r2++){
System.out.print(String.format("\t%.2f",(sampleReadsPerMols[x]/sampleReadsPerMols[y])));
y++;
}
}System.out.println("");
x++;
}
}System.out.println("");
return(simResults);
}
/**
* Write the counts table to a file
* @param outFilename
*/
public void printOutput(String outBase, boolean printLabels){
if(counts!=null){
try {
String outFilename = outBase+".counts";
FileWriter fw = new FileWriter(outFilename);
fw.write("Point");
if(printLabels)
fw.write("\tLabel");
for(int c=0; c<numConditions; c++){
for(int r=0; r<numReplicates; r++)
fw.write("\t"+c+":"+r);
}fw.write("\n");
for(int d=0; d<numDataPoints; d++){
fw.write(d+"");
if(printLabels){
int label = simResults.get(d).isDiff? 1:0;
fw.write("\t"+label);
}
for(int s=0; s<(numConditions*numReplicates); s++)
fw.write("\t"+counts[d][s]);
fw.write("\n");
}
fw.close();
} catch (IOException e) {
e.printStackTrace();
}
}else{
System.err.println("Counts matrix not defined.");
}
}
/**
* Main file for simulating counts data from the command-line
* @param args
*/
public static void main(String[] args) {
String empFile, outFile="out";
int r, numdata;
double rA, rB, a, up, down, diff;
ArgParser ap = new ArgParser(args);
if(args.length==0 || ap.hasKey("h") || !ap.hasKey("emp")){
System.err.println("CountDataSimulator:\n" +
"\t--emp <empirical data file>\n" +
"\t--numdata <number of data points>\n" +
"\t--r <num replicates per condition>\n" +
"\t--a <over-dispersion param>\n" +
"\t--readsA <avg reads in cond A>\n" +
"\t--readsB <avg reads in cond B>\n" +
"\t--up <up-regulated fraction>\n" +
"\t--down <down-regulated fraction>\n" +
"\t--diff <basis of differential expression>\n" +
"\t--out <output file>\n" +
"");
}else{
CountDataSimulator sim = new CountDataSimulator();
if(ap.hasKey("emp")){
empFile = ap.getKeyValue("emp");
sim.loadEmpiricalFromFile(empFile);
}if(ap.hasKey("numdata")){
numdata = new Integer(ap.getKeyValue("numdata"));
sim.setDataPoints(numdata);
}if(ap.hasKey("r")){
r = new Integer(ap.getKeyValue("r"));
sim.setReplicates(r);
}if(ap.hasKey("a")){
a = new Double(ap.getKeyValue("a"));
sim.setAlpha(a);
}if(ap.hasKey("readsA")){
rA = new Double(ap.getKeyValue("readsA"));
sim.setReadsA(rA);
}if(ap.hasKey("readsB")){
rB = new Double(ap.getKeyValue("readsB"));
sim.setReadsB(rB);
}if(ap.hasKey("up")){
up = new Double(ap.getKeyValue("up"));
sim.setUpRegFrac(up);
}if(ap.hasKey("down")){
down = new Double(ap.getKeyValue("down"));
sim.setDownRegFrac(down);
}if(ap.hasKey("diff")){
diff = new Double(ap.getKeyValue("diff"));
sim.setDiffExpLevel(diff);
}if(ap.hasKey("out")){
outFile = ap.getKeyValue("out");
}
sim.simulate();
sim.printOutput(outFile, false);
}
}
public class SimCounts{
public double absolute; //The absolute value of binding enrichment (mean before differential)
public double [] counts; //The simulated counts
public double [] backup; //Extra counts used by the multi-condition read simulator
public boolean isDiff=false;
public String toString(){
String str = new String("Absolute: "+absolute+"\n");
str = str+"Counts:";
for(int i=0; i<counts.length; i++)
str = str+"\t"+counts[i];
str = str+"\nBackup:";
for(int i=0; i<backup.length; i++)
str = str+"\t"+backup[i];
if(isDiff)
str = str+"\nisDiff:true";
else
str = str+"\nisDiff:false";
return str;
}
}
}