package org.seqcode.projects.seed; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.TimeZone; import org.seqcode.data.io.RegionFileUtilities; import org.seqcode.genome.GenomeConfig; import org.seqcode.genome.location.Region; import org.seqcode.genome.location.StrandedPoint; import org.seqcode.gseutils.ArgParser; import org.seqcode.gseutils.Args; public class SEEDConfig { protected GenomeConfig gconfig; //Settable variables protected String outName="seed", outBase="seed"; //output names protected File outDir=null; protected int tagShift=0; //Shift tags this length in the 3' direction protected int tag3PrimeExtension=0; //Extend tags this number of bp in the 3' direction (starting from 5' position) protected int tag5PrimeExtension=0; //Extend tags this number of bp in the 5' direction (starting from 5' position) protected float tagGaussSigma=0; //Sigma for Gaussian tag smoothing (std dev) protected float tagGaussWidthFactor=5; //If smoothing with a Gaussian, each tag's prob density 'extends' over this factor times sigma protected int binWidth=50; //Bin size for feature scanning protected int binStep=25; //Bin step for feature scanning protected int featureMergeWindow=100; //Merge neighboring domains that are this close to one another (in bp) protected int minFeatureSize=0; //Minimum size for a feature (in bp) protected List<Integer> localBackWins=new ArrayList<Integer>(); //MACS-style local background windows protected double perBinPoissonLogPThres=-7; //Log base 10 confidence threshold for use with Poisson background models when scoring bins protected double perBinBinomialPThres = 0.01; //threshold to use with per-bin Binomial test (we can afford to be lax here, as the q-value threshold filters for final results) protected double perFeatureBinomialQThres = 0.01; //threshold to use with per-event Binomial tests protected double minSigCtrlFoldDifference = 1; //minimum signal/control difference to use in Binomial testing (per bin & per event) protected PeakFindingMethod peakFinding = PeakFindingMethod.MAXDENSITY; //Approach for finding peak point(s) in enriched domains protected int maxThreads=1; //Number of threads to use. Default is 1 for single processor machines. protected List<Region> regionsToIgnore = new ArrayList<Region>(); //List of regions that will be ignored during EM training (i.e. known towers, etc) protected boolean printHelp=false; protected boolean outputGFF=false; //Print the output files in GFF format protected int superStitchWindow = 12500; // Window to stitch typical enhancers to super enhancers protected int minDistalDistance = 2000; // Minmum distance from TSSs to call an element Distal protected List<StrandedPoint> refTSSs = new ArrayList<StrandedPoint>(); // refTSSs; currently loading from a stranded peak file //Constants public final int MAXSECTION = 50000000; //Size of genomic sections that each thread analyzes protected String[] args; public String getArgs(){ String a=""; for(int i=0; i<args.length; i++) a = a+" "+args[i]; return a; } public SEEDConfig(GenomeConfig gcon, String[] args){ this.args = args; gconfig = gcon; ArgParser ap = new ArgParser(args); if(args.length==0 || ap.hasKey("h")){ printHelp=true; }else{ try{ //Test for a config file... if there is concatenate the contents into the args if(ap.hasKey("config")){ ArrayList<String> confArgs = new ArrayList<String>(); String confName = ap.getKeyValue("config"); File confFile = new File(confName); if(!confFile.isFile()) System.err.println("\nCannot find configuration file: "+confName); BufferedReader reader = new BufferedReader(new FileReader(confFile)); String line; while ((line = reader.readLine()) != null) { line = line.trim(); String[] words = line.split("\\s+"); if(!words[0].startsWith("--")) words[0] = new String("--"+words[0]); confArgs.add(words[0]); if(words.length>1){ String rest=words[1]; for(int w=2; w<words.length; w++) rest = rest+" "+words[w]; confArgs.add(rest); } } String [] confArgsArr = confArgs.toArray(new String[confArgs.size()]); String [] newargs =new String[args.length + confArgsArr.length]; System.arraycopy(args, 0, newargs, 0, args.length); System.arraycopy(confArgsArr, 0, newargs, args.length, confArgsArr.length); args = newargs; ap = new ArgParser(args); } /******* General options *******/ //Output path DateFormat df = new SimpleDateFormat("yyyy-MM-dd-hh-mm-ss"); df.setTimeZone(TimeZone.getTimeZone("EST")); outName = Args.parseString(args, "out", outName+"_"+df.format(new Date())); outDir = new File(outName); //Output directory outBase = outDir.getName(); //Last part of name outputGFF = ap.hasKey("gffout"); //Threads maxThreads = Args.parseInteger(args,"threads",maxThreads); maxThreads = Math.min(maxThreads, java.lang.Runtime.getRuntime().availableProcessors()); //Feature detection binWidth = Args.parseInteger(args,"binwidth",binWidth); binStep = Args.parseInteger(args,"binstep",binStep); featureMergeWindow = Args.parseInteger(args,"mergewin",featureMergeWindow); minFeatureSize = Args.parseInteger(args,"minsize",minFeatureSize); //Regions to ignore during analysis if(ap.hasKey("exclude")) regionsToIgnore = RegionFileUtilities.loadRegionsFromFile(Args.parseString(args, "exclude", null), gconfig.getGenome(), -1); //Tag manipulation tagShift = Args.parseInteger(args,"tagshift",tagShift); tag3PrimeExtension = Args.parseInteger(args,"tag3ext",tag3PrimeExtension); tag5PrimeExtension = Args.parseInteger(args,"tag5ext",tag5PrimeExtension); tagGaussSigma = Args.parseFloat(args,"tagsigma",tag5PrimeExtension); //Statistical thresholds perBinPoissonLogPThres = Args.parseDouble(args,"poisslogpthres",perBinPoissonLogPThres); perBinBinomialPThres = Args.parseDouble(args,"binpthres",perBinBinomialPThres); perFeatureBinomialQThres = Args.parseDouble(args,"featureqthres",perFeatureBinomialQThres); minSigCtrlFoldDifference = Args.parseDouble(args,"minfolddiff",minSigCtrlFoldDifference); //Local background models localBackWins= (List<Integer>) Args.parseIntegers(args, "localbackwin"); //Peak-finding method String pfmethod = Args.parseString(args,"peakfinding","max").toLowerCase(); if(pfmethod.equals("max")){peakFinding = PeakFindingMethod.MAXDENSITY;} else if(pfmethod.equals("lrbal")){peakFinding = PeakFindingMethod.LRBALANCE;} else if(pfmethod.equals("distrib")){peakFinding = PeakFindingMethod.TAGDISTRIBUTION;} else{peakFinding = PeakFindingMethod.MAXDENSITY;} // Super Feature detection method optioins superStitchWindow = Args.parseInteger(args, "supStitchWin", 12500); minDistalDistance = Args.parseInteger(args, "distalDistance", 2000); if(ap.hasKey("refTSSs")){ refTSSs = RegionFileUtilities.loadStrandedPointsFromFile(gconfig.getGenome(), Args.parseString(args, "refTSSs","")); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } //Accessors public boolean helpWanted(){return printHelp;} public String getOutName(){return outName;} public String getOutBase(){return outBase;} public File getOutputParentDir(){return outDir;} public int getMaxThreads(){return maxThreads;} public List<Region> getRegionsToIgnore(){return regionsToIgnore;} public int getTagShift(){return tagShift;} public int getTag3PrimeExtension(){return tag3PrimeExtension;} public int getTag5PrimeExtension(){return tag5PrimeExtension;} public float getTagGaussSigma(){return tagGaussSigma;} public int getTagGaussWidth(){return (int)(tagGaussSigma * tagGaussWidthFactor);} public int getBinWidth(){return binWidth;} public int getBinStep(){return binStep;} public int getFeatureMergeWindow(){return featureMergeWindow;} public int getMinFeatureSize(){return minFeatureSize;} public List<Integer> getLocalBackWins(){return localBackWins;} public double getPerBinPoissonLogPThres(){return perBinPoissonLogPThres;} public double getPerBinBinomialPThres(){return perBinBinomialPThres;} public double getPerFeatureBinomialQThres(){return perFeatureBinomialQThres;} public double getMinSigCtrlFoldDifference(){return minSigCtrlFoldDifference;} public PeakFindingMethod getPeakFindingApproach(){return peakFinding;} public boolean outputGFF(){return outputGFF;} public int getSuperStitchWin(){return superStitchWindow;} public int getMinDistalDistance(){return minDistalDistance;} public List<StrandedPoint> getRefTSSs(){return refTSSs;} //Settors public void setTagShift(int s){tagShift=s;} public void setTag3PrimeExtension(int e){tag3PrimeExtension = e;} public void setTag5PrimeExtension(int e){tag5PrimeExtension = e;} public void setTagGaussSigma(float s){tagGaussSigma = s;} public void setPeakFindingApproach(PeakFindingMethod p){peakFinding=p;} /** * Make some output directories used by SEED implementations */ public void makeSEEDOutputDirs(){ //Test if output directory already exists. If it does, recursively delete contents outDir = new File(outName); if(outDir.exists()) deleteDirectory(outDir); outBase = outDir.getName(); //(re)make the output directory outDir.mkdirs(); } /** * Delete a direcctory */ public boolean deleteDirectory(File path) { if( path.exists() ) { File[] files = path.listFiles(); for(int i=0; i<files.length; i++) { if(files[i].isDirectory()) { deleteDirectory(files[i]); } else { files[i].delete(); } } } return( path.delete() ); } /** * returns a string describing the arguments handled by this parser. * @return String */ public String getArgsList(){ return(new String("" + "SEED arguments:\n"+ "\t--out <output file prefix>\n" + "\t--tagshift <shift tags this number of bases in 3' direction (default: shift="+tagShift+")>\n" + "\t--tag3ext <extend tags this number of bp from (shifted) 5' position in the 3' direction (default="+tag3PrimeExtension+")>\n" + "\t--tag5ext <extend tags this number of bp from (shifted) 5' position in the 5' direction (default="+tag5PrimeExtension+")>\n" + "\t--tagsigma <smooth tags using Gaussian with this sigma (default="+tagGaussSigma+")>\n" + "\t--binwidth <width of bin in bp for feature scanning (default="+binWidth+")>\n" + "\t--binstep <bin step size in bp for feature scanning (default="+binStep+")>\n" + "\t--mergewin <size of window in which to merge features/domains (default="+featureMergeWindow+")>\n" + "\t--minsize <min size of feature/domain (default="+minFeatureSize+")>\n"+ "\t--localbackwin <size of local window(s) for MACS-style calculation of expected tag count (default=none)>\n" + "\t--poisslogpthres <log base 10 Poisson confidence threshold for use in finding enriched bins (default="+perBinPoissonLogPThres+")>\n" + "\t--binpthres <uncorrected p-value threshold for use in Binomial test for enriched bins (default="+perBinBinomialPThres+")>\n" + "\t--featureqthres <q-value threshold for use in Binomial test for enriched events (default="+perFeatureBinomialQThres+")>\n" + "\t--minfolddiff <minimum signal/control fold difference for use in Binomial test for enriched domains (default="+minSigCtrlFoldDifference+")>\n" + "\t--peakfinding <max/lrbal/distrib: approach for finding the peak in a domain; max. density, left-right balance point, or tag density scan, resp. (default=max)>\n" + "\t--threads <number of threads to use>\n" + "\t--config <config file: all options can be specified in a name<space>value text file, over-ridden by command-line args>\n" + "\t--exclude <file of regions to ignore>\n" + "\t--gffout <output GFF format files>\n" + "")); } /** * Enumerated type for peak-finding approach * @author mahony */ public enum PeakFindingMethod{ MAXDENSITY, LRBALANCE, TAGDISTRIBUTION } }