package org.seqcode.deepseq.utils.simulation; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Random; import org.apache.commons.math3.distribution.NormalDistribution; import org.seqcode.deepseq.ReadHit; import org.seqcode.deepseq.events.BindingModel; import org.seqcode.deepseq.experiments.ExperimentManager; import org.seqcode.deepseq.experiments.ExptConfig; import org.seqcode.deepseq.experiments.Sample; import org.seqcode.deepseq.utils.simulation.CountDataSimulator.SimCounts; import org.seqcode.genome.Genome; import org.seqcode.genome.GenomeConfig; import org.seqcode.genome.location.Point; import org.seqcode.gseutils.ArgParser; import org.seqcode.gseutils.Args; import org.seqcode.gseutils.Pair; /** * Simulates single or double condition fragments and reads using BindingModels. <br> * * In any given simulated replicate experiment, reads are simulated according to a BindingModel. * Read counts for each binding event in each condition are determined from a CountDataSimulator, * which allows for differential binding between two conditions, and negative-binomial sampled * read counts in each replicate. * Noise reads are distributed uniformly across the genome or sampled from an input tag file. * * * @author Shaun Mahony * * Extended from org.seqcode.projects.multigps.utilities.MultiConditionReadSimulator */ public class ChIPReadSimulator { private BindingModel model; private int numConditions = 2; private int numReplicates = 1; private String outPath; private FileWriter[][] writers; private Genome fakeGen; private long[] chromLens; private HashMap<String, Long> chromOffsets = new HashMap<String, Long>(); private int rLen=32; private long genomeLength=-1; private double noiseProbabilities[][]; private int numSigFrags[][], numTotalFrags[][]; private int numReads =1000000; private List<SimCounts> simCounts; private int numEvents=0; private int eventSpacing = 2000; private double jointEventRate = 0.0; private int jointEventSpacing = 200; private int fragLenMean = 140; private int fragLenStdDev = 40; private List<Pair<Point, SimCounts>> events = new ArrayList<Pair<Point, SimCounts>>(); private HashMap<Point, Boolean> eventIsJoint = new HashMap<Point, Boolean>(); private List<ReadHit> noiseSource=null; private boolean subsampleControl=false; private boolean paired=false; public ChIPReadSimulator(BindingModel m, Genome g, List<SimCounts> counts, int numCond, int numRep, double noiseProb, double jointRate, int jointSpacing, String outPath){ model=m; numConditions = numCond; numReplicates = numRep; jointEventRate = jointRate; jointEventSpacing = jointSpacing; this.outPath = outPath; fakeGen = g; genomeLength = (long)fakeGen.getGenomeLength(); chromLens = new long[fakeGen.getChromList().size()]; int c=0; long offset=0; for(String chr : fakeGen.getChromList()){ chromLens[c]=fakeGen.getChromLength(chr); chromOffsets.put(chr, offset); offset+=chromLens[c]; c++; } noiseProbabilities = new double[numConditions][numReplicates]; numSigFrags = new int[numConditions][numReplicates]; numTotalFrags = new int[numConditions][numReplicates]; for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ noiseProbabilities[co][r]=noiseProb; numSigFrags[co][r]=0; numTotalFrags[co][r]=0; } try { writers = new FileWriter[numConditions][numReplicates]; for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ FileWriter fout = new FileWriter(outPath+"_reads_C"+co+"_R"+r+".bed"); writers[co][r]=fout; } } catch (IOException e) { e.printStackTrace(); } if(noiseProb<1.0){ simCounts = counts; setBindingPositions(); numEvents = events.size(); //Count signal reads and infer total reads for(Pair<Point,SimCounts> event : events){ int index=0; for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ if(!eventIsJoint.get(event.car())){ numSigFrags[co][r]+=event.cdr().counts[index]; }else{ numSigFrags[co][r]+=event.cdr().backup[index]; } numTotalFrags[co][r] = (int)((double)numSigFrags[co][r]/(1-noiseProbabilities[co][r])); index++; } } } } /** * Make binding positions corresponding to all simulated counts. * By default, shared events are in chr1, and diff events are in chrX * If there is no chrX (e.g. yeast), shared and diff events are interspersed along all chroms */ private void setBindingPositions(){ Random jointDice = new Random(); long sharedOffset=eventSpacing, diffOffset=eventSpacing/2; if(chromOffsets.containsKey("X")){ sharedOffset=chromOffsets.get("1"); diffOffset=chromOffsets.get("X"); } for(SimCounts s : simCounts){ long curroff =0; if(s.isDiff){ curroff =diffOffset; diffOffset+=eventSpacing; }else{ curroff =sharedOffset; sharedOffset+=eventSpacing; } //Translate from offsets to chromosome name and start int c=0; long offset=0; String chr = fakeGen.getChromList().get(0); while(curroff>(offset+chromLens[c]) && c<fakeGen.getChromList().size()-1){ c++; chr = fakeGen.getChromList().get(c); offset = chromOffsets.get(chr); } long start = curroff-offset; Point p = new Point(fakeGen, chr, (int)start); events.add(new Pair<Point, SimCounts>(p,s)); eventIsJoint.put(p, false); //Simulate a joint event? double jointrand = jointDice.nextDouble(); if(jointrand<jointEventRate){ Point jp = new Point(fakeGen, chr, (int)start+jointEventSpacing); events.add(new Pair<Point, SimCounts>(jp,s)); eventIsJoint.put(jp, true); } } } /** * Simulate a set of binding event reads for each replicate in each condition. * Number of reads, number of events, and strengths of events are all pre-determined. */ private void simulateReads(){ //Initialize the probability landscape int eventWidth=1000; int evoff = eventWidth/2; double[] forProbLand=new double[eventWidth]; double[] revProbLand=new double[eventWidth]; double[] forProbCumul=new double[eventWidth]; double[] revProbCumul=new double[eventWidth]; for(int i=0; i<eventWidth; i++){ forProbLand[i]=0; revProbLand[i]=0; forProbCumul[i]=0; revProbCumul[i]=0; } int modelRange = Math.max(Math.abs(model.getMin()), Math.abs(model.getMax())); int winStart = Math.max(0, evoff-modelRange); int winStop = Math.min(evoff+modelRange, eventWidth-1); for(int i=winStart; i<winStop; i++){ int forDist = i-evoff; int revDist = evoff-i; forProbLand[i]+=model.probability(forDist); revProbLand[i]+=model.probability(revDist); } //Set the cumulative scores double fTotal=0, rTotal=0; for(int i=0; i<eventWidth; i++){ fTotal+=forProbLand[i]; rTotal+=revProbLand[i]; forProbCumul[i]=fTotal; revProbCumul[i]=rTotal; } //Normalize for(int i=0; i<eventWidth; i++){ forProbLand[i]=forProbLand[i]/fTotal; revProbLand[i]=revProbLand[i]/rTotal; forProbCumul[i]=forProbCumul[i]/fTotal; revProbCumul[i]=revProbCumul[i]/rTotal; } //Generate the reads Random sigGenerator = new Random(); Random noiseGenerator = new Random(); Random strandGenerator = new Random(); NormalDistribution fragLengthDistrib = new NormalDistribution(fragLenMean, fragLenStdDev); for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ List<ReadHit> frags = new ArrayList<ReadHit>(); List<ReadHit> fragPairs = new ArrayList<ReadHit>(); // Generate event reads if(noiseProbabilities[co][r]<1){ for(Pair<Point, SimCounts> ps : events){ Point coord = ps.car(); String chr = coord.getChrom(); int chrLen = fakeGen.getChromLength(chr); SimCounts sc = ps.cdr(); int sample = co*numReplicates+r; double readCount = eventIsJoint.get(coord) ? sc.backup[sample] : sc.counts[sample]; ReadHit rh = null; ReadHit rhp=null; for(int x=0; x<readCount; x++){ boolean forwardStrand = strandGenerator.nextDouble()<0.5 ? true:false; double rand = sigGenerator.nextDouble(); int fivePrimeEnd=coord.getLocation()-evoff; //Find the probability interval if(forwardStrand){ for(int j=0; j<eventWidth; j++){ if(forProbCumul[j] > rand){ fivePrimeEnd+=j; break; } } //Make the ReadHit if((fivePrimeEnd+rLen-1)<chrLen){ rh = new ReadHit(chr, fivePrimeEnd, fivePrimeEnd+rLen-1, '+'); if(paired){ int len =Math.max(rLen, (int)fragLengthDistrib.sample()); rhp = new ReadHit(chr, fivePrimeEnd+len-rLen+1, fivePrimeEnd+len, '-'); } } }else{ for(int j=eventWidth-1; j>=0; j--){ if(revProbCumul[j] < rand){ fivePrimeEnd+=j+1; break; } } //Make the ReadHit if(fivePrimeEnd<chrLen){ rh = new ReadHit(chr, Math.max(1, fivePrimeEnd-rLen+1), fivePrimeEnd, '-'); if(paired){ int len =Math.max(rLen, (int)fragLengthDistrib.sample()); rhp = new ReadHit(chr, Math.max(1, fivePrimeEnd-len), Math.max(1, fivePrimeEnd-len+rLen-1), '+'); } } } if(rh!=null && (!paired || rhp!=null)){ frags.add(rh); fragPairs.add(rhp); } } } } //Generate noise reads Random readSampler = new Random(); int noiseFrags = (int)((double)numTotalFrags[co][r]*noiseProbabilities[co][r]); if(noiseSource==null) //Poisson for(int i=0; i<noiseFrags; i++){ ReadHit rh=null, rhp=null; double noiserand = noiseGenerator.nextDouble(); double strandrand = strandGenerator.nextDouble(); long pos = (long)(noiserand*(genomeLength)); //Translate from pos to chromosome name and start int c=0; long offset=0; String chr = fakeGen.getChromList().get(0); while(pos>(offset+chromLens[c]) && c<fakeGen.getChromList().size()-1){ c++; chr = fakeGen.getChromList().get(c); offset = chromOffsets.get(chr); } long start = pos-offset; //Add the ReadHit if (strandrand<0.5){ rh = new ReadHit(chr, (int)start, (int)start+rLen-1, '+'); if(paired){ int len =Math.max(rLen, (int)fragLengthDistrib.sample()); rhp = new ReadHit(chr, (int)start+len-rLen+1, (int)start+len, '-'); } }else{ rh = new ReadHit(chr, Math.max(1, (int)start-rLen+1), (int)start, '-'); if(paired){ int len =Math.max(rLen, (int)fragLengthDistrib.sample()); rhp = new ReadHit(chr, Math.max(1, (int)start-len), Math.max(1, (int)start-len+rLen-1), '+'); } } frags.add(rh); if(paired) fragPairs.add(rhp); } else{ //If the noise source is a control experiment, we shouldn't sample fragments with replacement. //Rather, we treat each control experiment read as a distinct fragment in the initial library int noiseSourceSize = noiseSource.size(); if(noiseSourceSize<noiseFrags){ System.err.println("Provided control has fewer reads than the requested noise fragments"); System.exit(1); } //fragindex is a list of non-repeating random numbers Integer[] fragindex = new Integer[noiseSourceSize]; for (int i = 0; i < noiseSourceSize; i++) fragindex[i] = i; Collections.shuffle(Arrays.asList(fragindex)); //Extract the fragments for(int i=0; i<noiseFrags; i++){ ReadHit rh = noiseSource.get(fragindex[i]); frags.add(rh); } } //Count unique fragments HashMap<String, Integer> uniqueFragLabels = new HashMap<String, Integer>(); for(ReadHit hit : frags){ String label = hit.toString(); int count =0; if(uniqueFragLabels.containsKey(label)) count = uniqueFragLabels.get(label); uniqueFragLabels.put(label, count+1); } int uniqueFrags = uniqueFragLabels.keySet().size(); System.out.println(uniqueFrags+" unique positions generated."); //Sample reads from the fragments & print try { int numFrags = frags.size(); for(int x=0; x<numReads; x++){ ReadHit rh=null, rhp=null; if(noiseSource==null || !subsampleControl){ double rand = readSampler.nextDouble(); int index = (int)((double)numFrags*rand); rh = frags.get(index); if(paired) rhp=fragPairs.get(index); }else{ //Here, the options tell us to just *subsample* the control reads directly without replacement. //This is easy here, as frags should contain a randomly ordered subset of control reads already rh = frags.get(x); } writers[co][r].write(rh.getChrom()+"\t"+rh.getStart()+"\t"+rh.getEnd()+"\tU\t0\t"+rh.getStrand()+"\n"); if(paired) writers[co][r].write(rhp.getChrom()+"\t"+rhp.getStart()+"\t"+rhp.getEnd()+"\tU\t0\t"+rhp.getStrand()+"\n"); } } catch (IOException e) { e.printStackTrace(); } } } //Accessors public void setTotalFrags(int totFrags){ for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ numTotalFrags[co][r] = totFrags; } } public void setTotalReads(int totReads){ for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ numReads = totReads; } } public void setJointEventRate(double jointRate){ this.jointEventRate = jointRate; } public void setReadLength(int r){ this.rLen=r; } public void setPaired(boolean p){ this.paired=p;} public void setNoiseSource(List<Sample> controls, boolean subsample){ subsampleControl = subsample; noiseSource = new ArrayList<ReadHit>(); for(Sample s : controls){ //Add each read hit as its own fragment for(ReadHit rh : s.exportReadHits(rLen)){ for(float x=0; x<rh.getWeight(); x+=1.0){ ReadHit newRH = new ReadHit(rh.getChrom(), rh.getStart(), rh.getEnd(), rh.getStrand()); noiseSource.add(newRH); } } } System.err.println(noiseSource.size()+" control reads sourced as distinct fragments"); } // clean up public void close(){ try { for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++){ writers[co][r].close(); } } catch (IOException e) { e.printStackTrace(); } } /** * Write an output file of event positions & expected read counts per replicate */ public void printEvents(){ try { String filename = outPath+".events"; FileWriter fout = new FileWriter(filename); fout.write("Coord\tDiff"); for(int co=0; co<numConditions; co++) for(int r=0; r<numReplicates; r++) fout.write("\tC"+co+"_R"+r); fout.write("\n"); for(Pair<Point, SimCounts> ps : events){ Point coord = ps.car(); SimCounts sc = ps.cdr(); int diff = sc.isDiff && !eventIsJoint.get(coord) ? 1 : 0; fout.write(coord.getLocationString()+"\t"+diff); for(int x=0; x<sc.counts.length; x++){ if(!eventIsJoint.get(coord)) fout.write("\t"+sc.counts[x]); else fout.write("\t"+sc.backup[x]); }fout.write("\n"); } fout.close(); } catch (IOException e) { e.printStackTrace(); } } /** */ public static void main(String[] args) { String empFile, outFile="out"; int c=2, r=2, numdata, jointSpacing=200, rlen=32; double frags=1000000, reads=1000000, a, up, down, diff, jointRate=0.0; String bmfile; boolean printEvents=true, subsampleControl=false, isPaired=false; ArgParser ap = new ArgParser(args); if(args.length==0 || ap.hasKey("h") || !ap.hasKey("emp")){ System.err.println("ChIPReadSimulator:\n" + "\t--geninfo <genome info file>\n" + "\t--emp <empirical data file>\n" + "\t--numdata <number of events to simulate>\n" + "\t--c <num conditions>\n" + "\t--r <num replicates per condition>\n" + "\t--a <over-dispersion param>\n" + "\t--frags <avg total fragments>\n" + "\t--reads <avg total reads>\n" + "\t--rlen <length of each read>\n" + "\t--up <up-regulated fraction>\n" + "\t--down <down-regulated fraction>\n" + "\t--diff <basis of differential expression>\n" + "\t--model <binding model file>\n" + "\t--noise <noise probability per replicate>\n" + "\t--ctrl <control experiment to replace Poisson for noise data>\n" + "\t--subsample <subsample the control experiment non-redundantly>\n" + "\t--format <SAM/IDX>\n" + "\t--jointrate <proportion of peaks that are joint binding events>\n" + "\t--jointspacing <spacing between joint events>\n" + "\t--noevents [flag to turn off making some files]\n" + "\t--paired [flag to generate paired reads (not working for control-sampling yet)]\n" + "\t--out <output file>\n" + ""); }else{ GenomeConfig gcon = new GenomeConfig(args); ExptConfig econ = new ExptConfig(gcon.getGenome(), args); CountDataSimulator cdsim = new CountDataSimulator(); ////////////////////////////////////////////////// // Read in parameters if(ap.hasKey("emp")){ empFile = ap.getKeyValue("emp"); cdsim.loadEmpiricalFromFile(empFile); }if(ap.hasKey("numdata")){ numdata = new Integer(ap.getKeyValue("numdata")); cdsim.setDataPoints(numdata); }if(ap.hasKey("c")){ c = new Integer(ap.getKeyValue("c")); if(c!=1 && c!=2){ System.err.println("Number of conditions must be either 1 or 2"); System.exit(1); } cdsim.setConditions(c); }if(ap.hasKey("r")){ r = new Integer(ap.getKeyValue("r")); cdsim.setReplicates(r); }if(ap.hasKey("a")){ a = new Double(ap.getKeyValue("a")); cdsim.setAlpha(a); }if(ap.hasKey("frags")){ frags = new Double(ap.getKeyValue("frags")); cdsim.setReadsA(frags); cdsim.setReadsB(frags); }if(ap.hasKey("reads")){ reads = new Double(ap.getKeyValue("reads")); }if(ap.hasKey("rlen")){ rlen = new Integer(ap.getKeyValue("rlen")); }if(ap.hasKey("up")){ up = new Double(ap.getKeyValue("up")); cdsim.setUpRegFrac(up); }if(ap.hasKey("down")){ down = new Double(ap.getKeyValue("down")); cdsim.setDownRegFrac(down); }if(ap.hasKey("diff")){ diff = new Double(ap.getKeyValue("diff")); cdsim.setDiffExpLevel(diff); }if(ap.hasKey("jointrate")){ jointRate = new Double(ap.getKeyValue("jointrate")); }if(ap.hasKey("jointspacing")){ jointSpacing = new Integer(ap.getKeyValue("jointspacing")); }if(ap.hasKey("out")){ outFile = ap.getKeyValue("out"); }if(ap.hasKey("noevents")){ printEvents=false; }if(ap.hasKey("subsample")){ subsampleControl=true; if(reads>frags){ System.err.println("Can't subsample in cases where frags is less than reads!"); System.exit(1); } }if(ap.hasKey("paired")){ isPaired=true; } double noiseProb = Args.parseDouble(args, "noise", 0.9); double [][] noiseProbs = new double[c][r]; for(int x=0; x<c; x++) for(int y=0; y<noiseProbs[x].length; y++) noiseProbs[x][y]=noiseProb; cdsim.setReads(frags*(1-noiseProb)); bmfile = Args.parseString(args, "model", null); File mFile = new File(bmfile); if(!mFile.isFile()){System.err.println("Invalid file name");System.exit(1);} ////////////////////////////////////////////////// // Simulate counts List<SimCounts> counts = null; if(noiseProb<1.0){ counts = cdsim.simulate(); if(printEvents) cdsim.printOutput(outFile, true); } ////////////////////////////////////////////////// // Simulate reads according to counts and binding model BindingModel bm = new BindingModel(mFile); //Initialize the MultiConditionReadSimulator ChIPReadSimulator sim = new ChIPReadSimulator(bm, gcon.getGenome(), counts, c, r, noiseProb, jointRate, jointSpacing, outFile); if(noiseProb==1.0) sim.setTotalFrags((int) frags); ExperimentManager manager=null; if(ap.hasKey("ctrl")){ manager = new ExperimentManager(econ); sim.setNoiseSource(manager.getSamples(), subsampleControl); } sim.setTotalReads((int) reads); sim.setReadLength(rlen); sim.setPaired(isPaired); if(noiseProb<1 && printEvents) sim.printEvents(); sim.simulateReads(); sim.close(); if(manager!=null) manager.close(); } } }