package org.seqcode.deepseq.experiments; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import org.seqcode.genome.Genome; import org.seqcode.genome.location.Region; import org.seqcode.gseutils.ArgParser; import org.seqcode.gseutils.Args; import org.seqcode.gseutils.Pair; /** * ExptConfig: * A config parser that: * Handles specification of experiments and read loaders from command-line, config files and design files. * Maintains constants needed by experiment loading. * * @author Shaun Mahony * @version %I%, %G% */ public class ExptConfig { protected Genome gen=null; protected List<ExptDescriptor> expts = new ArrayList<ExptDescriptor>(); protected boolean nonUnique=false; protected boolean printHelp=false; protected boolean printLoadingProgress=true; protected List<Integer> localBackgroundWindows=new ArrayList<Integer>(); protected double perBaseLogConf=-7; protected boolean poissonGaussWinPerBaseFilter = false; //Filter using a poisson Gaussian window protected float perBaseReadLimit = -1; protected boolean perBaseReadFiltering = true; protected double mappableGenome = 0.8; protected boolean estimateScaling=true; protected boolean scalingByRegression=false; //Default is to scale by NCIS protected boolean scalingBySES = false; //Default is to estimate scaling by NCIS protected boolean scalingByMedian = false; //Default is to estimate scaling by NCIS protected float fixedScalingFactor = 1; //Default is to estimate scaling by NCIS protected int scalingSlidingWindow = 10000; protected boolean plotScaling = false; //Make a scaling method plot protected boolean cacheAllHits=true; //Cache all hits protected String fileCacheDir = "hitcache"; protected List<Region> initialCachedRegions=null; //Different loaders will have different behaviors in the following //For example, some file formats cannot store pairs. ReadDB ignores the difference between R1 & R2 in single-end, etc. protected boolean loadType1Reads = true; //Load Type1 reads protected boolean loadType2Reads = false; //Load Type2 reads (if exists and distinguishable) protected boolean loadRead2=true; //Load second in pair reads (only used by BAM loader for now) protected boolean loadPairs = false; //Load pair information (if exists) protected String[] args; public String getArgs(){ String a=""; for(int i=0; i<args.length; i++) a = a+" "+args[i]; return a; } public ExptConfig(Genome g, String[] arguments){ this.gen = g; this.args=arguments; ArgParser ap = new ArgParser(args); if(args.length==0 || ap.hasKey("h")){ printHelp=true; }else{ try{ //Test for a config file... if there is concatenate the contents into the args if(ap.hasKey("config")){ ArrayList<String> confArgs = new ArrayList<String>(); String confName = ap.getKeyValue("config"); File confFile = new File(confName); if(!confFile.isFile()) System.err.println("\nCannot find configuration file: "+confName); BufferedReader reader = new BufferedReader(new FileReader(confFile)); String line; while ((line = reader.readLine()) != null) { line = line.trim(); String[] words = line.split("\\s+"); if(!words[0].startsWith("--")) words[0] = new String("--"+words[0]); confArgs.add(words[0]); if(words.length>1){ String rest=words[1]; for(int w=2; w<words.length; w++) rest = rest+" "+words[w]; confArgs.add(rest); } } String [] confArgsArr = confArgs.toArray(new String[confArgs.size()]); String [] newargs =new String[args.length + confArgsArr.length]; System.arraycopy(args, 0, newargs, 0, args.length); System.arraycopy(confArgsArr, 0, newargs, args.length, confArgsArr.length); args = newargs; ap = new ArgParser(args); reader.close(); } ////////////////////// //Which reads to load? //////////////////////// loadPairs = Args.parseFlags(args).contains("loadpairs"); loadType1Reads = !Args.parseFlags(args).contains("not1reads"); loadType2Reads = Args.parseFlags(args).contains("loadt2reads"); loadRead2 = !Args.parseFlags(args).contains("noread2"); //////////////////////// //Read limit parameters //////////////////////// //Fixed per base read limits perBaseReadLimit = Args.parseInteger(args,"fixedpb",-1); //Do per base filtering with a Poisson Gaussian window poissonGaussWinPerBaseFilter = Args.parseFlags(args).contains("poissongausspb"); if(poissonGaussWinPerBaseFilter) perBaseReadLimit=0; //Switch off per-base read filtering? perBaseReadFiltering = !Args.parseFlags(args).contains("nopbfilter"); ////////////////////// //Scaling parameters ////////////////////// //Turn off scaling estimation estimateScaling = Args.parseFlags(args).contains("noscaling") ? false : true; //Fixed scaling factor if not estimating scaling if(ap.hasKey("fixedscaling")){ estimateScaling=false; fixedScalingFactor = Args.parseFloat(args,"fixedscaling",1); } //Scale by NCIS is default //Scale by median scalingByMedian = Args.parseFlags(args).contains("medianscale") ? true : false; //Scale by SES scalingBySES = Args.parseFlags(args).contains("sesscale") ? true : false; //Scale by regression scalingByRegression = Args.parseFlags(args).contains("regressionscale") ? true : false; //Scaling window scalingSlidingWindow = Args.parseInteger(args,"scalewin",scalingSlidingWindow); //Make a scaling method plot plotScaling = Args.parseFlags(args).contains("plotscaling") ? true : false; //////////////////////// //Misc parameters //////////////////////// //Mappability mappableGenome = Args.parseDouble(args,"mappability",0.8); //Use non-unique reads? nonUnique = Args.parseFlags(args).contains("nonunique") ? true : false; //Local dynamic backgrounds localBackgroundWindows= (List<Integer>) Args.parseIntegers(args, "dynback"); if(localBackgroundWindows.size()==0){localBackgroundWindows.add(10000);} //Caching cacheAllHits = Args.parseFlags(args).contains("nocache") ? false : true; //Parse command-line experiments (optional experiment and replicate names can be specified within the argument name - e.g. --exptName-Rep ) String fileFormat = Args.parseString(args, "format", "SAM").toUpperCase(); ArrayList<String> exptTags = new ArrayList<String>(); for(String s : ap.getKeys()){ if(!exptTags.contains(s)){ if(s.contains("expt") || s.contains("ctrl")){ String datatype = ""; String condrep = ""; boolean signal = true; if(s.contains("ctrl")) signal = false; if(s.startsWith("rdb")){ datatype = "READDB"; condrep = s.replaceFirst("rdbexpt", ""); condrep = condrep.replaceFirst("rdbctrl", ""); }else{ datatype = fileFormat; condrep = s.replaceFirst("expt", ""); condrep = condrep.replaceFirst("ctrl", ""); } //Parse the condition & replicate names String cond = condrep.split("-")[0]; if(cond.length()==0 || cond.equals("")) cond = signal ? "experiment" : "DEFAULT"; String rep = condrep.split("-").length>1 ? condrep.split("-")[1] : ""; if(rep.length()==0 || rep.equals("")){ rep = signal ? "rep1" : "DEFAULT"; } //Parse the file/rdb names Collection<String> sourceNames = Args.parseStrings(args, s); List<Pair<String,String>> sources = new ArrayList<Pair<String,String>>(); for(String n : sourceNames) sources.add(new Pair<String,String>(n,datatype)); expts.add(new ExptDescriptor("", "", cond, rep, signal, sources, perBaseReadLimit)); exptTags.add(s); } } } //Parse experiment design file // Format: (tab separated) // SrcName Signal/Control DataType Condition Replicate [ExptType] [per-base max] [Target] if(ap.hasKey("design")){ String dfile = ap.getKeyValue("design"); File df = new File(dfile); BufferedReader reader = new BufferedReader(new FileReader(df)); String line; while ((line = reader.readLine()) != null) { if(!line.startsWith("#")){ line = line.trim(); String[] words = line.split("\\t"); if(words.length >=3){ String cond="", rep="", expttype="", expttarget=""; boolean validLine = true; Pair<String, String> src=null; boolean signal= true; if(words[0].toUpperCase().equals("SIGNAL") || words[0].toUpperCase().equals("CONTROL")){ signal = words[0].toUpperCase().equals("SIGNAL") ? true : false; src = new Pair<String, String>(words[1], words[2]); }else if(words[1].toUpperCase().equals("SIGNAL") || words[1].toUpperCase().equals("CONTROL")){ signal = words[1].toUpperCase().equals("SIGNAL") ? true : false; src = new Pair<String, String>(words[0], words[2]); }else{ System.err.println("Incorrectly formatted line in design file:\n\t"+line+"\n"); validLine=false; } if(validLine){ //Condition in field 3, replicate in field 4 if(words.length>=4 && !words[3].equals("")){ cond = words[3]; }else cond = signal ? "experiment" :"DEFAULT"; if(words.length>=5 && !words[4].equals("")){ rep = words[4]; }else{ rep = signal ? "rep1" : "DEFAULT"; } //Expt type in field 5 (optional) if(words.length>=6 && !words[5].equals("") && !words[5].equals("NULL")) expttype = words[5]; //Per-base read limit in field 6 float currCondPerBaseReads = perBaseReadLimit; if(words.length>=7 && !words[6].equals("") && !words[6].equals("NULL")){ if(words[6].equals("P")) currCondPerBaseReads=0; else currCondPerBaseReads = new Float(words[6]); } //Expt target in field 7 if(words.length>=8 && !words[7].equals("") && !words[7].equals("NULL")) expttarget = words[7]; //Check if we have other entries for this experiment boolean found=false; for(ExptDescriptor e : expts){ if(e.signal==signal && e.expttype.equals(expttype) && e.target.equals(expttarget) && e.condition.equals(cond) && e.replicate.equals(rep)){ found = true; e.sources.add(src); } } if(!found){ expts.add(new ExptDescriptor(expttype, expttarget, cond, rep, signal, src, currCondPerBaseReads)); } } }else{ System.err.println("Incorrectly formatted line in design file:\n\t"+line+"\n"); } } } reader.close(); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } //Accessors public Genome getGenome(){return gen;} public boolean getNonUnique(){return nonUnique;} public List<ExptDescriptor> getExperimentDescriptors(){return expts;} public boolean helpWanted(){return printHelp;} public boolean getPrintLoadingProgress(){return printLoadingProgress;} public boolean doPoissonGaussWinPerBaseFiltering(){return poissonGaussWinPerBaseFilter;} public boolean doPerBaseFiltering(){return perBaseReadFiltering;} public double getPerBaseLogConf(){return perBaseLogConf;} public float getPerBaseMax(){return perBaseReadLimit;} public double getMappableGenomeProp(){return mappableGenome;} public double getMappableGenomeLength(){return mappableGenome*gen.getGenomeLength();} public List<Integer> getLocalBackgroundWindows(){return localBackgroundWindows;} public boolean getEstimateScaling(){return estimateScaling;} public float getFixedScalingFactor(){return fixedScalingFactor;} public boolean getScalingByMedian(){return scalingByMedian;} public boolean getScalingByRegression(){return scalingByRegression;} public boolean getScalingBySES(){return scalingBySES;} public int getScalingSlidingWindow(){return scalingSlidingWindow;} public boolean getPlotScaling(){return plotScaling;} public boolean getCacheAllData(){return cacheAllHits;} public String getFileCacheDirName(){return fileCacheDir;} public List<Region> getInitialCachedRegions(){return initialCachedRegions;} public boolean getLoadType1Reads(){return loadType1Reads;} public boolean getLoadType2Reads(){return loadType2Reads;} public boolean getLoadRead2(){return loadRead2;} public boolean getLoadPairs(){return loadPairs;} //Some accessors to allow modification of options after config . public void setPrintLoadingProgress(boolean plp){printLoadingProgress = plp;} public void setPerBaseReadFiltering(boolean pbrf){perBaseReadFiltering = pbrf;} public void setPerBaseReadLimit(float pbrl){perBaseReadLimit = pbrl; perBaseReadFiltering=true;} public void setMedianScaling(boolean ms){scalingByMedian = ms;} public void setRegressionScaling(boolean rs){scalingByRegression = rs;} public void setSESScaling(boolean ses){scalingBySES = ses;} public void setScalingSlidingWindow(int ssw){scalingSlidingWindow = ssw;} public void setFileCacheDirName(String d){fileCacheDir = d;} public void setLoadType1Reads(boolean l){loadType1Reads = l;} public void setLoadType2Reads(boolean l){loadType2Reads = l;} public void setLoadRead2(boolean l){loadRead2 = l;} public void setLoadPairs(boolean l){loadPairs = l;} /** * Merge a set of genomes that have been estimated from individual Samples * @param estGenomes * @return */ public Genome mergeEstGenomes(List<Genome> estGenomes){ //Combine the chromosome information HashMap<String, Integer> chrLenMap = new HashMap<String, Integer>(); for(Genome e : estGenomes){ Map<String, Integer> currMap = e.getChromLengthMap(); for(String s: currMap.keySet()){ if(!chrLenMap.containsKey(s) || chrLenMap.get(s)<currMap.get(s)) chrLenMap.put(s, currMap.get(s)); } } gen =new Genome("Genome", chrLenMap); return gen; } /** * returns a string describing the arguments handled by this config parser. * @return String */ public static String getArgsList(){ return(new String("" + "Experiments:\n" + "\t--design <design file name>\n" + "\tOR\n" + "\t--expt/--ctrl <signal/control experiment file name> AND --format <SAM/BED/SCIDX/BOWTIE/NOVO>\n" + "\tAND/OR" + "\t--rdbexpt/--rdbctrl <signal/control ReadDB experiment identifier>\n" + "\t\tNote that if you use --expt/--ctrl or --rdbexpt/--rdbctrl, you can specify the names of the experiment & replicate\n" + "\t\tdirectly in the argument. Here's an example: --exptConditionA-Rep1 somefile.bam\n" + "Scaling control vs signal counts:\n" + "\t--noscaling [flag to turn off auto estimation of signal vs control scaling factor]\n" + "\t--medianscale [flag to use scaling by median ratio (default = scaling by NCIS)]\n" + "\t--regressionscale [flag to use scaling by regression (default = scaling by NCIS)]\n" + "\t--sesscale [flag to use scaling by SES (default = scaling by NCIS)]\n" + "\t--fixedscaling <multiply control counts by total tag count ratio and then by this factor if not estimating scaling>\n" + "\t--scalewin <window size for scaling procedure (default=10000)>\n" + "\t--plotscaling [flag to plot diagnostic information for the chosen scaling method]\n" + "Miscellaneous Experiment Loading Args:\n" + "\t--fixedpb <fixed per base limit>\n" + "\t--poissongausspb <filter per base using a Poisson threshold parameterized by a local Gaussian sliding window>\n" + "\t--mappability <fraction of the genome that is mappable for these experiments>\n" + "\t--nocache [flag to turn off caching of the entire set of experiments (i.e. run slower with less memory)]\n" + "\t--not1reads / --loadt2reads [flags to use Type1 or Type2 reads] (Type1 loaded by default)\n" + "\t--noread2 [flag to ignore second reads in paired-end]\n" + "")); } }