package org.seqcode.projects.galaxyexo; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import org.seqcode.data.io.parsing.GFFEntry; import org.seqcode.data.io.parsing.ParseGFF; import org.seqcode.deepseq.events.BindingEvent; import org.seqcode.deepseq.events.BindingManager; import org.seqcode.deepseq.events.EnrichmentSignificance; import org.seqcode.deepseq.events.EventsConfig; import org.seqcode.deepseq.events.PointsToEvents; import org.seqcode.deepseq.experiments.ControlledExperiment; import org.seqcode.deepseq.experiments.ExperimentCondition; import org.seqcode.deepseq.experiments.ExperimentManager; import org.seqcode.deepseq.experiments.ExptConfig; import org.seqcode.genome.GenomeConfig; import org.seqcode.genome.location.Point; import org.seqcode.gseutils.Args; import org.seqcode.math.diff.CountsDataset; import org.seqcode.math.diff.DifferentialEnrichment; import org.seqcode.math.diff.EdgeRDifferentialEnrichment; import org.seqcode.math.diff.Normalization; /** * Utility to quantify enrichment and assess significance of a set of peaks (from GFF file). * If multiple conditions are specified, differential binding analysis is performed via edgeR. * Meant to be used in the galaxy-exo pipeline. * * Input: * - Genome * - GFF file of potential peak locations * - Window around peaks in which to count reads * Output: * - Annotated GFF file (all peaks) * - Annotated GFF file (peaks passing threshold) * - Replicate count file * * SignificanceTester performs no cleaning, merging, or shifting of input locations. * * @author mahony * */ public class SignificanceTester { protected ExptConfig econfig; protected EventsConfig config; protected ExperimentManager manager=null; protected BindingManager bindingManager; protected Normalization normalizer; protected HashMap<Point,GFFEntry> siteToGFFEntry; protected List<Point> potentialSites; protected int searchRegionWin = 50; protected double qThres = 0.01; protected double minFold = 2; protected boolean simpleReadAssignment=true; protected String outDirName, outFileBase; protected File outDir; protected boolean rankByQ; //Constructor public SignificanceTester(ExptConfig econ, EventsConfig config, ExperimentManager manager, BindingManager bman, String gffFileName, int win, double q, double minfold, boolean rankByQ, String outDirName) { this.econfig = econ; this.config = config; this.manager = manager; this.bindingManager = bman; potentialSites = new ArrayList<Point>(); siteToGFFEntry = new HashMap<Point, GFFEntry>(); searchRegionWin = win; qThres = q; this.minFold = minfold; this.rankByQ = rankByQ; //Load GFF file String gffDir=""; try { File gffFile = new File(gffFileName); ParseGFF parser = new ParseGFF(gffFile); while(parser.hasNext()){ GFFEntry site = parser.next(); Point currPt = new Point(config.getGenome(), site.getChr(), site.getMidPoint()); potentialSites.add(currPt); siteToGFFEntry.put(currPt, site); } if(gffFile.getParent() != null) gffDir = gffFile.getParent()+File.separator; outFileBase = gffFile.getName().replaceAll(".gff", ""); } catch (IOException e) { //Silent exceptions } this.outDirName = outDirName==null? gffDir+"signif_w"+searchRegionWin+"_q"+String.format("%.2e", qThres)+"_minfold"+String.format("%.1f", minFold): outDirName+File.separator+"signif_w"+searchRegionWin+"_q"+String.format("%.2e", qThres)+"_minfold"+String.format("%.1f", minFold); outDir = new File(this.outDirName); outDir.mkdirs(); } /** * Execute the enrichment tester */ public void execute(){ //Convert our points to events PointsToEvents p2e = new PointsToEvents(config, manager, bindingManager, potentialSites, searchRegionWin,!simpleReadAssignment); List<BindingEvent> events = p2e.execute(); bindingManager.setBindingEvents(events); //Estimate signal fraction (necessary for calculating statistics) bindingManager.estimateSignalVsNoiseFractions(events); EnrichmentSignificance tester = new EnrichmentSignificance(config, manager, bindingManager, config.getMinEventFoldChange(), econfig.getMappableGenomeLength()); tester.execute(searchRegionWin); bindingManager.writeReplicateCounts(outDirName+File.separator+outFileBase+"_replicatecounts.txt"); writeBindingEventGFFFiles(outDirName+File.separator+outFileBase, events); //Statistical analysis: inter-condition differences if(manager.getNumConditions()>1 && config.getRunDiffTests()){ DifferentialEnrichment edgeR = new EdgeRDifferentialEnrichment(config, outDir, outFileBase); CountsDataset data; File outImagesDir = new File(outDirName+File.separator+"images"); outImagesDir.mkdir(); for(int ref=0; ref<manager.getNumConditions(); ref++){ data = new CountsDataset(manager, bindingManager.getBindingEvents(), ref); data = edgeR.execute(data); data.updateEvents(bindingManager.getBindingEvents(), manager); //Print MA scatters (inter-sample & inter-condition) data.savePairwiseConditionMAPlots(config.getDiffPMinThres(), outImagesDir.getAbsolutePath()+File.separator, true); //Print XY scatters (inter-sample & inter-condition) data.savePairwiseFocalSampleXYPlots(outImagesDir.getAbsolutePath()+File.separator, true); data.savePairwiseConditionXYPlots(manager, bindingManager, config.getDiffPMinThres(), outImagesDir.getAbsolutePath()+File.separator, true); } } System.out.println("Output files written to: "+outDirName); } /** * Print all binding events to files */ public void writeBindingEventGFFFiles(String filePrefix, List<BindingEvent> events){ if(events.size()>0){ try { //Per-condition event files for(ExperimentCondition cond : manager.getConditions()){ //Sort on the current condition BindingEvent.setSortingCond(cond); Collections.sort(events, new Comparator<BindingEvent>(){ public int compare(BindingEvent o1, BindingEvent o2) {return o1.compareBySigCtrlPvalue(o2);} }); //Print String condName = cond.getName(); condName = condName.replaceAll("/", "-"); String allFilename = filePrefix+"_all_"+condName+".gff"; FileWriter allFout = new FileWriter(allFilename); String signifFilename = filePrefix+"_signif_"+condName+".gff"; FileWriter signifFout = new FileWriter(signifFilename); for(BindingEvent e : events){ Point currPt = e.getPoint(); GFFEntry origGFF = siteToGFFEntry.get(currPt); double P = e.getCondSigVCtrlP(cond); double logP = Math.log(e.getCondSigVCtrlP(cond))/config.LOG2; double logF = Math.log(e.getCondSigVCtrlFold(cond))/config.LOG2; double sigHits = e.getCondSigHits(cond); double ctrlHits = e.getCondCtrlHits(cond); String attrib = origGFF.getAttribString()+String.format(";sig_tags=%.1f;ctrl_tags=%.1f;log2_fold_sigctrl=%.3f;log2_qval_sigctrl=%.3f", sigHits, ctrlHits, logF, logP); double score = sigHits; if(rankByQ) score = -logP; if(e.isFoundInCondition(cond) && P <=config.getQMinThres()){ signifFout.write("chr"+origGFF.getChr()+"\t"+origGFF.getSource()+"\t"+origGFF.getFeature()+"\t"+origGFF.getStart()+"\t" +origGFF.getEnd()+"\t"+score+"\t"+origGFF.getStrand()+"\t"+origGFF.getFrame() +"\t"+attrib+"\n"); } allFout.write("chr"+origGFF.getChr()+"\t"+origGFF.getSource()+"\t"+origGFF.getFeature()+"\t"+origGFF.getStart()+"\t" +origGFF.getEnd()+"\t"+score+"\t"+origGFF.getStrand()+"\t"+origGFF.getFrame() +"\t"+attrib+"\n"); } signifFout.close(); allFout.close(); } } catch (IOException e) { e.printStackTrace(); } } } //Main public static void main(String[] args){ //Hack to set a default fixedpb limit. Need a better way to set per-application defaults String [] newargs = new String[args.length+2]; for(int a=0; a<args.length; a++) newargs[a] = args[a]; newargs[newargs.length-2] = "--fixedpb"; newargs[newargs.length-1] = "1000"; //Initialize MultiGPSConfig GenomeConfig gcon = new GenomeConfig(args); ExptConfig econ = new ExptConfig(gcon.getGenome(), args); EventsConfig config = new EventsConfig(gcon, args); econ.setScalingSlidingWindow(50000); if(config.helpWanted()){ printHelp(); }else{ ExperimentManager manager = new ExperimentManager(econ); BindingManager bman = new BindingManager(config, manager); //Just a test to see if we've loaded all conditions System.err.println("Conditions:\t"+manager.getConditions().size()); for(ExperimentCondition c : manager.getConditions()){ System.err.println("Condition "+c.getName()+":\t#Replicates:\t"+c.getReplicates().size()); } for(ExperimentCondition c : manager.getConditions()){ for(ControlledExperiment r : c.getReplicates()){ System.err.println("Condition "+c.getName()+":\tRep "+r.getName()); if(r.getControl()==null) System.err.println("\tSignal:\t"+r.getSignal().getHitCount()); else System.err.println("\tSignal:\t"+r.getSignal().getHitCount()+"\tControl:\t"+r.getControl().getHitCount()); } } //Arguments String siteFile = Args.parseString(args, "gff", null); if(siteFile==null){ printHelp(); System.exit(1); } String outDirName = Args.parseString(args, "outdir", null); int win = Args.parseInteger(args, "win", 50); double qThres = Args.parseDouble(args, "q", 0.01); double minFold = Args.parseDouble(args, "minfold", 2); boolean rankbyq = Args.parseFlags(args).contains("rankbyq"); SignificanceTester tester = new SignificanceTester(econ, config, manager, bman, siteFile, win, qThres, minFold, rankbyq, outDirName); tester.execute(); manager.close(); } } public static void printHelp(){ System.err.println("SignificanceTester:\n" + "\tCounts signal and control tags in windows around specified binding events, \n" + "\tnormalizes signal vs control using median scaling, and assesses the significance\n" + "\tof signal vs control enrichment using a Binomial test. \n" + "\tNote that median scaling is probably not appropriate in yeast ChIP-exo datasets.\n" + "\tAlternatives will be added soon. \n"); System.err.println("\t--gen <genome version>\n" + "\t--format <format of seq data files (SAM/IDX)> default=SAM\n" + "\t--exptNAME-REP <file name of signal experiment, where NAME and REP are experiment name and replicate number>\n" + "\t--ctrlNAME-REP <file name of control experiment, where NAME and REP are experiment name and replicate number>\n" + "\t--gff <potential site coords>\n" + "\t--outdir <output directory, default = based on input gff file directory name>\n" + "\t--win <window around events> default=50bp\n" + "\t--q <Q-value minimum (corrected p-value)> default=0.01\n" + "\t--minfold <min event fold-change> default=2\n" + "\t--rankbyq [flag to rank by Q-value] default=rank by signal tags\n" + "\n" + "\t--design <experiment design file> optional: can use design file instead of --expt, --ctrl and --format\n"); System.err.println("\tOutput:\n" + "\t- GFF file containing all significant peak-pairs (i.e. passing Q-value and fold thresholds) with annotation.\n" + "\t- GFF file containing all input peak-pairs with annotation.\n" + "\t- Text file containing per-replicate signal tag counts for all peak-pairs.\n"); System.err.println("\tExample Usage:\n" + "\tjava -Xmx2G org.seqcode.projects.galaxyexo.SignificanceTester --gen mm10 --format IDX --exptFoxa2-r1 FoxA2_07-633_liver_-_-_-_XO111_kaz1-S001_Pugh40203mm10.tab --exptFoxa2-r2 FoxA2_07-633_liver_-_-_-_XO211_kaz1-S001_Pugh40205mm10.tab --ctrlFoxa2 IgG_12-370_liver_-_-_-_XO_kaz1-S001_Pugh4020mm10.idx --gff genetrack_s5e10F1/cwpair_output_mode_f0u5d25b1/S_FoxA2_07-633_liver_-_-_-_XO_kaz1-S001_Pugh4020mm10_s5e10F1.gff --win 50 --q 0.01 --minfold 2\n" + ""); } }