package org.seqcode.gseutils; import java.util.*; import java.util.regex.*; import java.io.*; import java.sql.SQLException; import org.seqcode.data.connections.DatabaseException; import org.seqcode.data.motifdb.*; import org.seqcode.data.seqdata.*; import org.seqcode.genome.Genome; import org.seqcode.genome.Species; import org.seqcode.genome.location.Region; import org.seqcode.gsebricks.verbs.location.ChromRegionIterator; import org.seqcode.gsebricks.verbs.location.RefGeneGenerator; /** * <code>Args</code> is a utility class for parsing command line arguments. It can parse * different types of values (eg, strings, integers, Genomes, WeightMatrixScans). * * <code>Args</code> provides some internal caching of previously * parsed objects based on the String[] object such that it returns the same Genome object * no matter how many times parseGenome is called. * * @author <a href="mailto:arolfe@mit.edu">Alex Rolfe</a> * @version 1.0 */ public class Args { private static Map<String[], Species> orgs = new HashMap<String[], Species>(); private static Map<String[], Genome> genomes = new HashMap<String[], Genome>(); private static Map<String[], Set<String>> flags = new HashMap<String[],Set<String>>(); private static Map<String[], Set<String>> arguments = new HashMap<String[],Set<String>>(); /** * Parses all arguments. Similar to parseFlags, but with no restrictions on * the argument not taking a value <br> * Returns all strings preceded by "<tt>--</tt>" * @param args The command line options of the form <tt>--foo</tt> * @return */ public static Set<String> parseArgs(String args[]) { if (arguments.containsKey(args)) { return arguments.get(args); } HashSet<String> output = new HashSet<String>(); for (int i = 0; i < args.length; i++) { if (args[i].matches("^--.*")){ output.add(args[i].substring(2)); } } arguments.put(args,output); return output; } /** parses flags. These are command line options of the form * --foo * followed by another option (eg, --foo --bar quux) or * the end of the command line. They take no value after the name of the argument * * @returns the Set of flags present in args[]. The Strings returned do not include the leading -- */ public static Set<String> parseFlags(String args[]) { if (flags.containsKey(args)) { return flags.get(args); } HashSet<String> output = new HashSet<String>(); for (int i = 0; i < args.length; i++) { if (args[i].matches("^--.*") && ((i == args.length - 1) || args[i+1].matches("^--.*"))) { output.add(args[i].substring(2)); } } flags.put(args,output); return output; } /** Parses the integer value of the argument named by <code>key</code> from the specified command line. * If no value is present, returns <code>defaultValue</code>. If the key is present multiple times * on the command line, the first instance is returned. * Example: * parseInteger(args,"foo",10); where args={"--minimum","1.3", "--foo","50"} returns 50. */ public static int parseInteger(String args[], String key, int defaultValue) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { return Integer.parseInt(args[++i]); } } return defaultValue; } /** * Parses all the integers of the arguments that are named by <tt>key</tt> * and returns them as a <tt>Collection</tt> of <tt>Integers</tt> <br> * Example: parseIntegers(args, "foo"); * where args ={"--foo", "3", "--min", "2.5", "--foo", "4"} returns [3, 4]. * @param args arguments of the command line * @param key the argument named by <tt>key</tt> * @return */ public static Collection<Integer> parseIntegers(String args[], String key) { ArrayList<Integer> output = new ArrayList<Integer>(); if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { output.add(new Integer(args[++i])); } } return output; } /** * Parses all the doubles of the arguments that are named by <tt>key</tt> * and returns them as a <tt>Collection</tt> of <tt>Doubles</tt> <br> * Example: parseDoubles(args, "foo"); * where args ={"--foo", "3.2", "--min", "2.5", "--foo", "4.3"} returns [3.2, 4.3]. * @param args arguments of the command line * @param key the argument named by <tt>key</tt> * @return */ public static Collection<Double> parseDoubles(String args[], String key) { ArrayList<Double> output = new ArrayList<Double>(); if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { output.add(new Double(args[++i])); } } return output; } /** Parses a long from the specified command line. * @see org.seqcode.gse.tools.utils.Args.parseInteger */ public static long parseLong(String args[], String key, long defaultValue) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { return Long.parseLong(args[++i]); } } return defaultValue; } /** Parses a double from the specified command line. * @see org.seqcode.gse.tools.utils.Args.parseInteger */ public static double parseDouble(String args[], String key, double defaultValue) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { return Double.parseDouble(args[++i]); } } return defaultValue; } /** Parses a float from the specified command line. * @see org.seqcode.gse.tools.utils.Args.parseInteger */ public static float parseFloat(String args[], String key, float defaultValue) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { return Float.parseFloat(args[++i]); } } return defaultValue; } /** Parses a string from the specified command line. * @see org.seqcode.gse.tools.utils.Args.parseInteger */ public static String parseString(String args[], String key, String defaultValue) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { return args[++i]; } } return defaultValue; } /** Parses all strings of the argument by the name <tt>key<tt>. * @see org.seqcode.gse.tools.utils.Args.parseIntegers */ public static Collection<String> parseStrings(String args[], String key) { ArrayList<String> output = new ArrayList<String>(); if (!key.matches("^\\-\\-.*")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { output.add(args[++i]); } } return output; } /** Parses a filename from the command line. The file name is either * immediately preceded by <tt>--file</tt> or is any argument(s) that come(s) * after <tt>--</tt> and then follows the end of the command line. <br> * For example: * <tt>--max 10 --min 3 -- foo.txt bar.txt baz.txt</tt> */ public static List<String> parseFile(String args[]) { ArrayList<String> output = new ArrayList<String>(); for (int i = 0; i < args.length; i++) { if (args[i].equals("--file")) { output.add(args[++i]); } if (args[i].equals("--")) { for (int j = i + 1; j < args.length; j++) { output.add(args[j]); } break; } } return output; } /** Parses a list of files of the argument by the name <tt>key</tt> * from the command line and returns file handles * */ public static List<File> parseFileHandles(String args[], String key) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } ArrayList<File> output = new ArrayList<File>(); for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { output.add(new File(args[++i])); } } return(output); } /** This method returns a list containing all values preceded by the specified key. * for example: * <tt>--input foo.txt --input bar.txt --input baz.txt</tt> * would return the list "foo.txt","bar.txt","baz.txt" if key="input" */ public static List<String> parseList(String args[], String key) { if (!key.matches("^\\-\\-.*")) { key = "--" + key; } ArrayList<String> output = new ArrayList<String>(); for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { output.add(args[++i]); } } return output; } /** Parses <tt>--species "Mus musculus;mm8"</tt> into a Species and Genome * Also parses <tt>--genome mm8</tt> or <tt>--gen mm8</tt> into a Genome and inferred Species * @see org.seqcode.genome.Species * @see org.seqcode.genome.Genome */ public static Pair<Species,Genome> parseGenome(String args[]) throws NotFoundException { if (orgs.containsKey(args) && genomes.containsKey(args)) { return new Pair<Species,Genome>(orgs.get(args), genomes.get(args)); } String speciesname = null, genomename = null; for (int i = 0; i < args.length; i++) { if (args[i].equals("--species")) { String[] pieces = args[++i].split(";"); speciesname = pieces[0]; genomename = pieces[1]; } } Species org=null; Genome genome=null; if(speciesname==null && genomename==null){ for (int i = 0; i < args.length; i++) { if (args[i].equals("--gen") || args[i].equals("--genome")) genomename = args[++i]; } if(genomename==null) return null; else{ genome =Genome.findGenome(genomename); org = new Species(genome.getSpeciesName()); } }else{ org = new Species(speciesname); genome =Genome.findGenome(genomename); } orgs.put(args,org); genomes.put(args,genome); return new Pair<Species,Genome>(org,genome); } /** * parses SeqLocators from the <tt>--seqexpt</tt> parameters. Takes * either "name;alignment" or "name;replicate;alignment" */ public static List<SeqLocator> parseSeqExpt(String args[]) { return parseSeqExpt(args,"seqexpt"); } /** * parses SeqLocators from the <tt>argname</tt> parameters. Takes * either "name;alignment" or "name;replicate;alignment" * @see org.seqcode.data.seqdata.SeqLocator */ public static List<SeqLocator> parseSeqExpt(String args[], String argname) { argname = "--" + argname; ArrayList<SeqLocator> output = new ArrayList<SeqLocator>(); for (int i = 0; i < args.length; i++) { if (args[i].equals(argname)) { String[] pieces = args[++i].trim().split(";"); Set<String> reps = new TreeSet<String>(); if (pieces.length == 2) { output.add(new SeqLocator(pieces[0], reps, pieces[1])); } else if (pieces.length == 3) { reps.add(pieces[1]); output.add(new SeqLocator(pieces[0], reps, pieces[2])); } else { throw new RuntimeException("Couldn't parse a SeqAlignmentsLocator from " + args[++i]); } } } return output; } public static Collection<SeqAnalysis> parseSeqAnalyses(String args[], String argname) throws NotFoundException { Collection<String> bases = parseStrings(args,argname); Collection<SeqAnalysis> out = new ArrayList<SeqAnalysis>(); SeqDataLoader loader = null; try { for (String base : bases) { String pieces[] = base == null ? null : base.split(";"); if (pieces != null && pieces.length != 2 ) { throw new RuntimeException("Invalid string for SeqAnalysis " + base); } SeqAnalysis a = null; loader = new SeqDataLoader(false, true); if (pieces != null) { a = SeqAnalysis.get(loader,pieces[0],pieces[1]); } out.add(a); } } catch (SQLException e) { throw new DatabaseException(e.toString(),e); } catch (Exception e) { throw new RuntimeException(e.toString(),e); } finally { if (loader != null) { loader.close(); } loader = null; } return out; } public static SeqAnalysis parseSeqAnalysis(String args[], String argname) { String base = parseString(args,argname,null); String aname = parseString(args,"analysisname",null); String aversion = parseString(args,"analysisversion",null); String pieces[] = base == null ? null : base.split(";"); if (pieces != null && pieces.length != 2 ) { throw new RuntimeException("Invalid string for ChipSeqAnalysis " + base); } SeqAnalysis a = null; SeqDataLoader loader = null; try { loader = new SeqDataLoader(false, true); if (pieces != null) { try { a = SeqAnalysis.get(loader,pieces[0],pieces[1]); } catch (NotFoundException e) { System.err.println("Couldn't find analysis from " + pieces[0] + " and " + pieces[1]); } } if (a == null && aname != null && aversion != null) { try { a = SeqAnalysis.get(loader,aname,aversion); } catch (NotFoundException e) {} } if (a == null) { throw new RuntimeException("Couldn't parse or find a ChipSeqAnalysis from " + base + " or " + aname +","+aversion); } } catch (SQLException e) { throw new DatabaseException(e.toString(),e); } catch (Exception e) { throw new RuntimeException(e.toString(),e); } finally { if (loader != null) { loader.close(); } loader = null; } return a; } /** parses <tt>--scan</tt> or <tt>wmscan</tt> options into a list of WeightMatrixScans. --scan takes 3 or 4 semicolon separated values: * - matrix name * - matrix version * - scan name * - organism for the matrix if not the same as what was specified in --species * @see org.seqcode.data.motifdb.WeightMatrix * @see org.seqcode.data.motifdb.WeightMatrixLoader * @see org.seqcode.data.motifdb.WeightMatrixScan */ public static List<WeightMatrixScan> parseWMScans(String args[]) throws NotFoundException { Species org = parseGenome(args).getFirst(); List<WeightMatrixScan> output = new ArrayList<WeightMatrixScan>(); WeightMatrixLoader wmloader = new WeightMatrixLoader(); for (int i = 0; i < args.length; i++) { if (args[i].equals("--scan") || args[i].equals("--wmscan")) { String[] pieces = args[++i].split(";"); Species thisorg = org; if (pieces.length == 4) { thisorg = new Species(pieces[3]); } WeightMatrix matrix = wmloader.query(thisorg.getDBID(),pieces[0],pieces[1]); WeightMatrixScan scan = WeightMatrixScan.getScanForMatrix(matrix.dbid, pieces[2]); output.add(scan); } } return output; } private static List<Pattern> makePatterns(Collection<String> strings) { List<Pattern> out = new ArrayList<Pattern>(); for (String s : strings) { out.add(Pattern.compile(s)); } return out; } private static boolean matchesAny(String s, List<Pattern> patterns) { for (Pattern p : patterns) { Matcher m = p.matcher(s); if (m.find()) { return true; } } return false; } public static Collection<WeightMatrix> filterMatrices(Collection<String> accepts, Collection<String> rejects, Collection<String> acceptvers, Collection<String> rejectvers, Collection<String> accepttypes, Collection<String> rejecttypes, Collection<WeightMatrix> matrices) { ArrayList<WeightMatrix> out = new ArrayList<WeightMatrix>(); List<Pattern> acceptp = makePatterns(accepts); List<Pattern> rejectp = makePatterns(rejects); List<Pattern> acceptversp = makePatterns(acceptvers); List<Pattern> rejectversp = makePatterns(rejectvers); List<Pattern> accepttypep = makePatterns(accepttypes); List<Pattern> rejecttypep = makePatterns(rejecttypes); for (WeightMatrix wm : matrices) { if (matchesAny(wm.name, rejectp) || matchesAny(wm.version, rejectversp) || matchesAny(wm.type, rejecttypep)) { continue; } if ((matchesAny(wm.name, acceptp) || matchesAny(wm.version, acceptversp) || matchesAny(wm.type, accepttypep)) || (accepts.size() == 0 && acceptvers.size() == 0 && accepttypes.size() == 0)) { out.add(wm); } } return out; } public static Collection<WeightMatrix> parseWeightMatrices(String args[]) throws NotFoundException { Collection<String> awm = parseStrings(args,"acceptwm"); Collection<String> rwm = parseStrings(args,"rejectwm"); Collection<String> awmv = parseStrings(args,"acceptwmver"); Collection<String> rwmv = parseStrings(args,"rejectwmver"); Collection<String> awmt = parseStrings(args,"acceptwmtype"); Collection<String> rwmt = parseStrings(args,"rejectwmtype"); if (awm.size() > 0 ) { System.err.println("Acceping wmnames " + awm);} if (awmv.size() > 0 ) { System.err.println("Acceping wmvers " + awmv);} if (awmt.size() > 0 ) { System.err.println("Acceping wmtypes " + awmt);} if (rwm.size() > 0 ) { System.err.println("Rejecting wmnames " + rwm);} if (rwmv.size() > 0 ) { System.err.println("Rejecting wmvers " + rwmv);} if (rwmt.size() > 0 ) { System.err.println("Rejecting wmtypes " + rwmt);} Collection<WeightMatrix> out = new ArrayList<WeightMatrix>(); if (awm.size() > 0 || rwm.size() > 0 || awmv.size() > 0 || rwmv.size() > 0 || awmt.size() > 0 || rwmt.size() > 0) { ArrayList<WeightMatrix> matrices = new ArrayList<WeightMatrix>(); matrices.addAll(WeightMatrix.getAllWeightMatrices()); out.addAll(filterMatrices(awm, rwm, awmv, rwmv, awmt, rwmt, matrices)); } Collection<String> namevers = parseStrings(args,"wm"); if (namevers.size() > 0) { WeightMatrixLoader loader = new WeightMatrixLoader(); for (String nv : namevers) { String pieces[] = nv.split(";"); for (WeightMatrix m : loader.query(pieces[0], pieces[1], null)) { if (m.getName().equals(pieces[0]) && m.getVersion().equals(pieces[1])) { out.add(m); } } } loader.close(); } else if (awm.size() == 0 && rwm.size() == 0 && awmv.size() == 0 && rwmv.size() == 0 && awmt.size() == 0 && rwmt.size() == 0) { return WeightMatrix.getAllWeightMatrices(); } System.err.println("parseWeightMatrices returning " + out.size()); return out; } /** regularization computes ratios as ratio = (ip + alpha)/(wce + alpha) <br> * Argument should appear as: <tt>--regularize alpha</tt> */ public static int parseRegularize(String args[]) { int alpha = 0; for (int i = 0; i < args.length; i++) { if (args[i].equals("--regularize")) { alpha = Integer.parseInt(args[++i]); } } return alpha; } /** returns a list of RefGeneGenerator. If --genes is specified, uses the value as the type rather than * the default type of refGene. The type is the name of the table that the RefGeneGenerator will query. */ public static List<RefGeneGenerator> parseGenes (String args[]) throws NotFoundException { ArrayList<RefGeneGenerator> output = new ArrayList<RefGeneGenerator>(); boolean flipstrands = parseFlags(args).contains("flipgenestrands") || parseFlags(args).contains("flipgenestrand"); for (int i = 0; i < args.length; i++) { if (args[i].equals("--genes")) { output.add(new RefGeneGenerator(parseGenome(args).getLast(),args[++i])); } } if (flipstrands) { for (RefGeneGenerator rg : output) { rg.setFlipStrand(true); } } return output; } /** Takes a <tt>key</tt> that specifies the name of the command line option. * For example, if key is <tt>quux</tt>, then looks for <tt>--quux</tt>. * The value after each <tt>--quux</tt> is parsed as specifying a filename * that should be opened and read. * Each line is parsed as a region and those regions are returned, sorted. * @see org.seqcode.genome.Genome * @see org.seqcode.genome.location.Region */ public static List<Region> readLocations(String args[], String key) throws IOException, NotFoundException { Genome genome = parseGenome(args).getLast(); ArrayList<Region> output = null; if (!key.matches("^\\-\\-")) { key = "--" + key; } for (int i = 0; i < args.length; i++) { if (args[i].equals(key)) { String fname = args[++i]; output = new ArrayList<Region>(); BufferedReader reader; if (fname.equals("-")) { reader = new BufferedReader(new InputStreamReader(System.in)); } else { reader = new BufferedReader(new FileReader(fname)); } String line; while ((line = reader.readLine()) != null) { if (line.matches("^#.*")) { continue; } Region r = Region.fromString(genome, line); if (r != null) { output.add(r); } } Collections.sort(output); break; } } return output; } /** Parses the regions from the command line as specified by <tt>--region</tt>. * The command line must also contain a <tt>--species</tt> option somewhere */ public static List<Region> parseRegions(String args[]) throws NotFoundException { Genome genome = parseGenome(args).getLast(); ArrayList<Region> regions = new ArrayList<Region>(); for (int i = 0; i < args.length; i++) { if (args[i].equals("--region")) { regions.add(Region.fromString(genome, args[++i])); } } return regions; } /** Parses the <tt>--region</tt> options from the command line. If none are specified, * returns regions corresponding to the chromosomes in the genome specified * on the command line. */ public static List<Region> parseRegionsOrDefault(String args[]) throws NotFoundException { Genome genome = parseGenome(args).getLast(); ArrayList<Region> regions = new ArrayList<Region>(); for (int i = 0; i < args.length; i++) { if (args[i].equals("--region")) { regions.add(Region.fromString(genome, args[++i])); } } if (regions.size() == 0) { ChromRegionIterator chroms = new ChromRegionIterator(genome); while (chroms.hasNext()) { regions.add(chroms.next()); } } return regions; } }// end of Args class