package org.seqcode.projects.seed; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.seqcode.deepseq.StrandedBaseCount; import org.seqcode.deepseq.experiments.ExperimentCondition; import org.seqcode.deepseq.experiments.ExperimentManager; import org.seqcode.deepseq.experiments.ExptConfig; import org.seqcode.deepseq.experiments.Sample; import org.seqcode.genome.GenomeConfig; import org.seqcode.genome.location.Region; import org.seqcode.genome.location.StrandedRegion; import org.seqcode.projects.seed.features.EnrichedFeature; import org.seqcode.projects.seed.features.Feature; import org.seqcode.projects.seed.stats.FeatureStatistics; /** * DomainFinder: A FeatureDetection implementation that scans the genome for statistically enriched blocks (domains) * * @author mahony * */ public class DomainFinder extends FeatureDetection { public static String version = "0.1"; protected FeatureStatistics stats; public DomainFinder(GenomeConfig gcon, ExptConfig econ, SEEDConfig scon, ExperimentManager man) { super(gcon, econ, scon, man); stats = new FeatureStatistics(); } /** * Return the class name */ public String getProgramName(){ return "org.seqcode.projects.seed.DomainFinder"; } /** * Return a thread for this implementation */ public FeatureDetectionThread getMyThread(List<Region> regs){ return new DomainFinderThread(regs); } /** * Main command-line executable method * @param args */ public static void main(String[] args) { System.setProperty("java.awt.headless", "true"); System.err.println("DomainFinder version "+DomainFinder.version+"\n\n"); GenomeConfig gcon = new GenomeConfig(args); ExptConfig econ = new ExptConfig(gcon.getGenome(), args); SEEDConfig scon = new SEEDConfig(gcon, args); if(scon.helpWanted()){ //System.out.println(DomainFinder.getDomainFinderArgs()); System.err.println(gcon.getArgsList()+ econ.getArgsList()+ scon.getArgsList()); }else{ ExperimentManager man = new ExperimentManager(econ); DomainFinder finder = new DomainFinder(gcon, econ, scon, man); System.err.println("\nBeginning domain finding..."); finder.execute(); man.close(); } } /** * Return a help string * @return */ public static String getDomainFinderArgs() { //TODO return("TODO"); } /** * Multiple-hypothesis correction. Print the output files. * Assumes that all domains have been found. * @return : final features */ public Map<ExperimentCondition, List<Feature>> postProcess() { System.err.println("\nDomain finding complete."); for(ExperimentCondition cond : manager.getConditions()){ stats.benjaminiHochbergCorrection(features.get(cond)); } Map<ExperimentCondition, List<Feature>> signifFeatures = this.filter(features, sconfig.perBinBinomialPThres, true); //All domains this.printEventsFile(features, ".all.domains"); //Filtered by q-value this.printEventsFile(signifFeatures, ".p"+sconfig.perBinBinomialPThres+".domains"); //Summarize for(ExperimentCondition cond : manager.getConditions()) System.err.println(cond.getName()+"\t"+features.get(cond).size()+" domains\t"+signifFeatures.get(cond).size()+" domains below threshold."); return features; } /** * DomainFinderThread: thread that searches for domains * @author mahony * */ public class DomainFinderThread extends FeatureDetectionThread { public DomainFinderThread(List<Region> regs) { super(regs); } /** * findFeatures: find potentially enriched domains on the genome by comparing tag counts to a background model * * Specifically, the procedure requires that the signal count in a bin passes all PoissonBackground thresholds, * and the signal count vs control count (if exists) Binomial test passes an uncorrected p-value threshold. * Neighboring bins that pass the thresholds are merged. * * This method can assume that the hits and landscape data structures have been initialized for a given region * - for each Sample these data structures contain, respectively: the StrandedBaseCounts hits (sorted), and the * binned tag density (after any shifting and extending). * * @param subRegion : region to run analysis on * @return Map of Lists of Features in the subRegion, Indexed by ExperimentCondition */ public Map<ExperimentCondition, List<Feature>> findFeatures(Region subRegion) { Map<ExperimentCondition, List<Feature>> results = new HashMap<ExperimentCondition, List<Feature>>(); for(ExperimentCondition cond : manager.getConditions()) results.put(cond, new ArrayList<Feature>()); Map<ExperimentCondition, List<EnrichedFeature>> currFeatures = new HashMap<ExperimentCondition, List<EnrichedFeature>>(); for(ExperimentCondition cond : manager.getConditions()) currFeatures.put(cond, new ArrayList<EnrichedFeature>()); for(ExperimentCondition cond : manager.getConditions()){ int numStrandIter = strandedEventDetection ? 2 : 1; for(int stranditer=1; stranditer<=numStrandIter; stranditer++){ EnrichedFeature lastFeature=null; //If stranded peak-finding, run over both strands separately char str = !strandedEventDetection ? '.' : (stranditer==1 ? '+' : '-'); float [] condSigCounts = getConditionCounts(cond, landscape, str, true); float [] condCtrlCounts = cond.getControlSamples().size()>0 ? null : getConditionCounts(cond, landscape, str, false); //Scan regions int currBin=0; for(int i=subRegion.getStart(); i<subRegion.getEnd()-sconfig.getBinWidth(); i+=sconfig.getBinStep()){ float sigCounts=condSigCounts[currBin]; //If there's no control, use the expected bin count given random distribution float ctrlCounts= condCtrlCounts==null ? conditionBackgrounds.get(cond).getGenomicExpectedCount() : condCtrlCounts[currBin]*(float)(cond.getPooledSampleControlScaling()); //First Test: is the read count above the genome-wide thresholds? if(conditionBackgrounds.get(cond).passesGenomicThreshold((int)sigCounts, str)){ //Second Test: refresh all thresholds & test again conditionBackgrounds.get(cond).updateModels(subRegion, i-subRegion.getStart(), condSigCounts, condCtrlCounts, sconfig.getBinStep()); if(conditionBackgrounds.get(cond).passesAllThresholds((int)sigCounts, str)){ //Third Test: Binomial test between signal & (scaled) control double pval = stats.binomialPValue(ctrlCounts, (sigCounts+ctrlCounts), sconfig.getMinSigCtrlFoldDifference()); if(pval < sconfig.getPerBinBinomialPThres()){ //Add new event or append to the last one. Region currWin = strandedEventDetection ? new StrandedRegion(gen, subRegion.getChrom(), i, i+sconfig.getBinWidth()-1, str) : new Region(gen, subRegion.getChrom(), i, i+sconfig.getBinWidth()-1); lastFeature=addEnrichedDomain(currFeatures.get(cond), lastFeature, currWin, sigCounts, condCtrlCounts==null ? 0 : ctrlCounts, pval); } } } currBin++; } } } //Trim, quantify, & properly score currFeatures before adding them to the results currFeatures = processDomains(currFeatures, subRegion); for(ExperimentCondition cond : manager.getConditions()) results.get(cond).addAll(currFeatures.get(cond)); return results; } /** * Add or append a window to the list of discovered EnrichedFeatures * * @param currResults * @param lastFeat * @param currWin * @param sigWinHits * @param ctrlWinHits * @param score * @return */ protected EnrichedFeature addEnrichedDomain(List<EnrichedFeature> currResults, EnrichedFeature lastFeat, Region currWin, float sigWinHits, float ctrlWinHits, double score){ EnrichedFeature resFeat=null; //Is this hit close to the previously added one? If so, merge if(lastFeat!=null && currWin.getStrand()==lastFeat.getCoords().getStrand() && currWin.distance(lastFeat.getCoords())<=sconfig.getFeatureMergeWindow()){ lastFeat.setCoords(strandedEventDetection ? new StrandedRegion(gen, lastFeat.getCoords().getChrom(), lastFeat.getCoords().getStart(), currWin.getEnd(), lastFeat.getCoords().getStrand()) : new Region(gen, lastFeat.getCoords().getChrom(), lastFeat.getCoords().getStart(), currWin.getEnd())); if(sigWinHits>lastFeat.getSignalCount()) lastFeat.setSignalCount(sigWinHits); if(ctrlWinHits>lastFeat.getControlCount()) lastFeat.setControlCount(ctrlWinHits); resFeat=lastFeat; }else{ EnrichedFeature feat=null; try { feat = new EnrichedFeature(currWin, null, null, sigWinHits, ctrlWinHits, score); currResults.add(feat); } catch (Exception e) { e.printStackTrace(); } resFeat=feat; }return(resFeat); } /** * ProcessDomains: * - Trims feature coordinates back to agree with overlapping hits. * - Counts hits in each feature, per sample * * @param currFeatures * @param current region * @return : Lists of EnrichedFeatures, indexed by condition */ protected Map<ExperimentCondition, List<EnrichedFeature>> processDomains(Map<ExperimentCondition,List<EnrichedFeature>> currFeatures, Region currSubRegion){ for(ExperimentCondition currCondition : manager.getConditions()){ for(EnrichedFeature f : currFeatures.get(currCondition)){ Map<Sample, List<StrandedBaseCount>> fHitsPos = overlappingHits(hitsPos, f); Map<Sample, List<StrandedBaseCount>> fHitsNeg = overlappingHits(hitsNeg, f); //Trim the coordinates trimFeature(f, fHitsPos, fHitsNeg, currCondition); //Quantify the feature in each Sample and in the condition in which it was found quantifyFeature(f, fHitsPos, fHitsNeg, currCondition); } } return(currFeatures); } /** * Trim a feature back to the coordinates of the first and last overlapping hit, if necessary. * Only trims - does not allow extension past the original bounds. * @param f * @param featureHits * @param featureCond */ protected void trimFeature(Feature f, Map<Sample, List<StrandedBaseCount>> featureHitsPos, Map<Sample, List<StrandedBaseCount>> featureHitsNeg, ExperimentCondition featureCond){ int minCoord=f.getCoords().getEnd(); int maxCoord=f.getCoords().getStart(); for(Sample s : featureCond.getSignalSamples()){ for(int strand=0; strand<=1; strand++){ List<StrandedBaseCount> featureHitsCurr = strand==0 ? featureHitsPos.get(s) : featureHitsNeg.get(s); for(StrandedBaseCount sbc : featureHitsCurr){ int hitL = getLeft(sbc); int hitR = getRight(sbc); if(hitL<minCoord) minCoord = hitL; if(hitR>maxCoord) maxCoord = hitR; } } } minCoord = Math.max(minCoord, f.getCoords().getStart()); maxCoord = Math.min(maxCoord+1, f.getCoords().getEnd()); //Plus one to avoid minCoord & maxCoord on the same base if there are no extensions f.setCoords(f.getCoords().expand(f.getCoords().getStart()-minCoord, maxCoord-f.getCoords().getEnd())); } /** * Count the number of tags that overlap this enriched feature in every sample and in the condition in which it was discovered. * In place update of counts. * @param f * @param featureHitsPos * @param featureHitsNeg * @param featureCond */ protected void quantifyFeature(EnrichedFeature f, Map<Sample, List<StrandedBaseCount>> featureHitsPos, Map<Sample, List<StrandedBaseCount>> featureHitsNeg, ExperimentCondition featureCond){ //Count the hits per sample float[] sampCountsPos = new float[manager.getSamples().size()]; float[] sampCountsNeg = new float[manager.getSamples().size()]; for(Sample s : manager.getSamples()){ float currSampPos=0, currSampNeg=0; for(StrandedBaseCount sbc : featureHitsPos.get(s)) currSampPos+=sbc.getCount(); for(StrandedBaseCount sbc : featureHitsNeg.get(s)) currSampNeg+=sbc.getCount(); sampCountsPos[s.getIndex()]=currSampPos; sampCountsNeg[s.getIndex()]=currSampNeg; } //Sample-pooled count for the condition that this feature was discovered in float pooledSigCount=0, pooledCtrlCount=0; for(Sample s : featureCond.getSignalSamples()) pooledSigCount+=sampCountsPos[s.getIndex()]+sampCountsNeg[s.getIndex()]; for(Sample s : featureCond.getControlSamples()) pooledCtrlCount+=sampCountsPos[s.getIndex()]+sampCountsNeg[s.getIndex()]; //If there is no control, use the expected number of tags in this region as the control count for the purposes of significance testing float binomialTestCtrlCount = (float) (featureCond.getControlSamples().size()==0 ? (featureCond.getTotalSignalCount()*f.getCoords().getWidth())/(econfig.getMappableGenomeLength()) :(pooledCtrlCount*(float)featureCond.getPooledSampleControlScaling())); //Update the feature f.setSampleCountsPos(sampCountsPos); f.setSampleCountsNeg(sampCountsNeg); f.setSignalCount(pooledSigCount); f.setControlCount(pooledCtrlCount); f.setScore(stats.binomialPValue(binomialTestCtrlCount, (pooledSigCount+binomialTestCtrlCount), sconfig.getMinSigCtrlFoldDifference())); } }//End of DomainFinderThread }//End of DomainFinder