package org.seqcode.motifs; import java.io.*; import java.util.*; import java.sql.*; import java.text.ParseException; import java.util.regex.Pattern; import java.util.regex.Matcher; import org.seqcode.data.connections.DatabaseException; import org.seqcode.data.connections.DatabaseFactory; import org.seqcode.data.connections.UnknownRoleException; import org.seqcode.data.motifdb.*; import org.seqcode.genome.Genome; import org.seqcode.genome.Species; import org.seqcode.genome.location.Region; import org.seqcode.genome.sequence.SequenceGenerator; import org.seqcode.genome.sequence.SequenceUtils; import org.seqcode.gsebricks.verbs.*; import org.seqcode.gseutils.*; /** * Scans sequence to determine the overal density of motifs (hits / window) in windows of * some size. * * --cutoff .7 minimum percent (specify between 0 and 1) match to maximum motif score that counts as a match. * --species "$SC;SGDv2" * --window 100 specifies the window size to scan. The windows are non-overlapping * * can also specify --accept to give a regex which the motif name must match * of --reject to specify a regex that the motif name must not match. Remember the regex must match * the *entire* name, so use something like Hnf.* * * Output format is * chrom\tstart\tstop\tcount * */ public class MotifDensity { public static void main(String args[]) throws Exception { Collection<String> accept, reject; Collection<WeightMatrix> matrices = new ArrayList<WeightMatrix>(); matrices.addAll(WeightMatrix.getAllWeightMatrices()); double cutoffpercent = Args.parseDouble(args,"cutoff",.7); accept = Args.parseStrings(args,"accept"); reject = Args.parseStrings(args,"reject"); int windowsize = Args.parseInteger(args,"window",100); Genome genome = Args.parseGenome(args).cdr(); SequenceGenerator seqgen = new SequenceGenerator(genome); MarkovBackgroundModel bgModel = null; String bgmodelname = Args.parseString(args,"bgmodel","whole genome zero order"); BackgroundModelMetadata md = BackgroundModelLoader.getBackgroundModel(bgmodelname, 1, "MARKOV", genome.getDBID()); boolean multicount = Args.parseFlags(args).contains("multicount"); if (md != null) { bgModel = BackgroundModelLoader.getMarkovModel(md); } else { System.err.println("Couldn't get metadata for " + bgmodelname); } matrices = Args.parseWeightMatrices(args); System.err.println("Scanning for " + matrices.size() + " matrices"); if (bgModel == null) { for (WeightMatrix m : matrices) { m.toLogOdds(); } } else { for (WeightMatrix m : matrices) { m.toLogOdds(bgModel); } } List<Region> regions = Args.parseRegionsOrDefault(args); for (Region region : regions) { char[] aschars = seqgen.execute(region).toCharArray(); int counts[] = new int[region.getWidth() / windowsize]; int toadd[] = multicount ? null : new int[region.getWidth() / windowsize]; for (int i = 0; i < counts.length; i++) { counts[i] = 0; } for (WeightMatrix m : matrices) { if (toadd != null) { for (int i = 0; i < counts.length; i++) { toadd[i] = 0; } } List<WMHit> hits = WeightMatrixScanner.scanSequence(m, (float)(m.getMaxScore() * cutoffpercent), aschars); if (multicount) { for (WMHit h : hits) { counts[h.getStart() / windowsize]++; } } else { for (WMHit h : hits) { toadd[h.getStart() / windowsize]++; } } if (!multicount) { for (int i = 0; i < counts.length; i++) { if (toadd[i] > 0) { counts[i]++; } } } } for (int i = 0; i < counts.length; i++) { if (counts[i] > 0) { System.out.println(String.format("%s\t%d\t%d\t%d", region.getChrom(), region.getStart() + i * windowsize, region.getStart() + (i+1) * windowsize, counts[i])); } } } } }