package org.seqcode.deepseq.utils; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.seqcode.data.io.RegionFileUtilities; import org.seqcode.deepseq.StrandedBaseCount; import org.seqcode.deepseq.experiments.ControlledExperiment; import org.seqcode.deepseq.experiments.ExperimentCondition; import org.seqcode.deepseq.experiments.ExperimentManager; import org.seqcode.deepseq.experiments.ExptConfig; import org.seqcode.genome.GenomeConfig; import org.seqcode.genome.location.NamedRegion; import org.seqcode.genome.location.Region; import org.seqcode.gsebricks.verbs.location.ChromRegionIterator; import org.seqcode.gseutils.ArgParser; import org.seqcode.gseutils.Args; /** * This class aims to count tags overlapping a set of regions. * * @author mahony * */ public class RegionTagCounts { private GenomeConfig gConfig; private ExptConfig eConfig; private List<Region> testRegs; private ExperimentManager manager=null; private String outName = "out"; private boolean totalTagNorm=false; //normalize to total tags private boolean sigPropNorm=false; //normalize to signal proportion private boolean scaleToPercentile=false; //scale to Xth percentile of observed counts private double scalePercentile=0.99; //Percentile to scale to public RegionTagCounts(GenomeConfig gcon, ExptConfig econ, List<Region> regs, String out){ gConfig = gcon; eConfig = econ; testRegs = regs; manager = new ExperimentManager(eConfig); outName = out; } public void setTotalTagNorm(boolean n){totalTagNorm=n;} public void setSigPropNorm(boolean n){sigPropNorm=n;} public void setPercentileScale(double perc){ if(perc>0 && perc<=1){ scalePercentile=perc; scaleToPercentile=true; } } public void execute(){ Map<Region, Double[]> regCounts = new HashMap<Region, Double[]>(); for(Region r : testRegs){ regCounts.put(r, new Double[manager.getReplicates().size()]); for(int i=0; i<manager.getReplicates().size(); i++) regCounts.get(r)[i] = 0.0; } for(ExperimentCondition c : manager.getConditions()){ for(ControlledExperiment rep : c.getReplicates()){ System.err.println("Condition "+c.getName()+":\tRep "+rep.getName()); double scaling = rep.getControlScaling(); double sigStrength = 1-(scaling/(rep.getSignal().getHitCount()/rep.getControl().getHitCount())); double sigCount = sigStrength * rep.getSignal().getHitCount(); ArrayList<Double> allRepCounts= new ArrayList<Double>(); ChromRegionIterator chroms = new ChromRegionIterator(gConfig.getGenome()); while(chroms.hasNext()){ NamedRegion currentRegion = chroms.next(); //Split the job up into chunks of 100Mbp for(int x=currentRegion.getStart(); x<=currentRegion.getEnd(); x+=100000000){ int y = x+100000000; if(y>currentRegion.getEnd()){y=currentRegion.getEnd();} Region currSubRegion = new Region(gConfig.getGenome(), currentRegion.getChrom(), x, y); List<StrandedBaseCount> hits = rep.getSignal().getBases(currSubRegion); double stackedTagStarts[] = makeTagStartLandscape(hits, currSubRegion); //Get coverage of points that lie within the current region for(Region r : testRegs){ if(currSubRegion.contains(r)){ int offsetStart = inBounds(r.getStart()-currSubRegion.getStart(), 0, currSubRegion.getWidth()-1); int offsetEnd =inBounds(r.getEnd()-currSubRegion.getStart(), 0, currSubRegion.getWidth()-1); double sum=0; for(int o=offsetStart; o<=offsetEnd; o++){ sum+=stackedTagStarts[o]; } double count = 0; if(totalTagNorm) count=sum/rep.getSignal().getHitCount(); else if(sigPropNorm) count=sum/sigCount; else count=sum; regCounts.get(r)[rep.getIndex()]=count; allRepCounts.add(count); } } } } if(scaleToPercentile){ Collections.sort(allRepCounts); double ceiling = allRepCounts.get((int)(allRepCounts.size()*scalePercentile)); for(Region r : testRegs){ if(regCounts.containsKey(r)){ if(regCounts.get(r)[rep.getIndex()]>ceiling) regCounts.get(r)[rep.getIndex()] = 1.0; else regCounts.get(r)[rep.getIndex()] /=ceiling; } } } } } //Write to file try { FileWriter fw = new FileWriter(outName+".region-counts.txt"); fw.write("Coord"); for(ExperimentCondition c : manager.getConditions()) for(ControlledExperiment rep : c.getReplicates()) fw.write("\t"+rep.getName()); fw.write("\n"); for(Region r : testRegs){ fw.write(r.getLocationString()); for(ExperimentCondition c : manager.getConditions()) for(ControlledExperiment rep : c.getReplicates()) fw.write("\t"+String.format("%f", regCounts.get(r)[rep.getIndex()])); fw.write("\n"); } fw.close(); } catch (IOException e) { e.printStackTrace(); } } protected double[] makeTagStartLandscape(List<StrandedBaseCount> hits, Region currReg){ double[] counts = new double[(int)currReg.getWidth()+1]; for(int i=0; i<=currReg.getWidth(); i++){counts[i]=0;} for(StrandedBaseCount r : hits){ if(r.getCoordinate()>=currReg.getStart() && r.getCoordinate()<=currReg.getEnd()){ int offset=inBounds(r.getCoordinate()-currReg.getStart(),0,currReg.getWidth()); counts[offset]+=r.getCount(); } } return(counts); } //keep the number in bounds protected final double inBounds(double x, double min, double max){ if(x<min){return min;} if(x>max){return max;} return x; } protected final int inBounds(int x, int min, int max){ if(x<min){return min;} if(x>max){return max;} return x; } public void close(){ if(manager !=null) manager.close(); } /** * @param args */ public static void main(String[] args) { ArgParser ap = new ArgParser(args); if(args.length==0 || ap.hasKey("h")){ System.err.println("RegionTagCounts:"); System.err.println("Genome:" + "\t--species <Species;Genome>\n" + "\tOR\n" + "\t--geninfo <genome info file> AND --seq <fasta seq directory>\n" + "Coverage Testing:\n" + "\t--reg <region coords>\n" + "\t--win <win size bp>\n" + "\t--out <output file root>\n" + "\t--signormcounts [flag to normalize counts by inferred signal proportion]\n" + "\t--totalnormcounts [flag to normalize counts by total tags]\n" + "\t--percscale <fraction> : scale counts to this percentile of observed counts\n" ); }else{ GenomeConfig gcon = new GenomeConfig(args); ExptConfig econ = new ExptConfig(gcon.getGenome(), args); Integer win = Args.parseInteger(args, "win", -1); List<Region> testSites = new ArrayList<Region>(); Collection<String> regFiles = Args.parseStrings(args, "reg"); for(String rf : regFiles) testSites.addAll(RegionFileUtilities.loadRegionsFromFile(rf, gcon.getGenome(), win)); String outName = Args.parseString(args, "out", "out"); boolean signorm = Args.parseFlags(args).contains("signormcounts"); boolean totnorm = Args.parseFlags(args).contains("totalnormcounts"); double scalePercentile = Args.parseDouble(args, "percscale", -1); RegionTagCounts rct = new RegionTagCounts(gcon, econ, testSites, outName); rct.setSigPropNorm(signorm); rct.setTotalTagNorm(totnorm); rct.setPercentileScale(scalePercentile); rct.execute(); rct.close(); } } }