package org.seqcode.motifs;
import java.io.*;
import java.util.*;
import java.sql.*;
import java.text.ParseException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.seqcode.data.connections.DatabaseException;
import org.seqcode.data.connections.DatabaseFactory;
import org.seqcode.data.connections.UnknownRoleException;
import org.seqcode.data.motifdb.*;
import org.seqcode.genome.Genome;
import org.seqcode.genome.Species;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.genome.sequence.SequenceUtils;
import org.seqcode.gsebricks.verbs.*;
import org.seqcode.gseutils.*;
/**
* Reads a list of Points or Regions on stdin. These should be ordered and will
* be presented along the x-axis of the plot. The command line arguments specify a motif
* and a distance from the point (or center of region) within which an occurence of the
* motif must fall. The y-value of the plot is the fraction of regions near a motif; the fraction is computed
* over a sliding window of n regions/points. The x-y values for the plot are produced on
* stdout.
*
* java MotifOverlapPlot --species "$MM;mm9" --wm "Cdx2;Jaspar" --distance 50 --smooth 100 < regions.txt > plot.txt
*/
public class MotifOverlapPlot {
public static void main(String args[]) throws Exception {
int smooth = Args.parseInteger(args,"smooth",100);
int distance = Args.parseInteger(args,"distance",50);
double minpercent = Args.parseDouble(args,"minpercent",.75);
Genome genome = Args.parseGenome(args).cdr();
SequenceGenerator seqgen = new SequenceGenerator(genome);
MarkovBackgroundModel bgModel = null;
String bgmodelname = Args.parseString(args,"bgmodel","whole genome zero order");
BackgroundModelMetadata md = BackgroundModelLoader.getBackgroundModel(bgmodelname,
1,
"MARKOV",
genome.getDBID());
Collection<WeightMatrix> matrices = Args.parseWeightMatrices(args);
if (bgModel == null) {
for (WeightMatrix m : matrices) {
m.toLogOdds();
}
} else {
for (WeightMatrix m : matrices) {
m.toLogOdds(bgModel);
}
}
int numFound = 0;
int found[] = new int[smooth]; // circular buffer for whether or not there was a motif under a region
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String line = null;
int lineno = 0;
while ((line = reader.readLine()) != null) {
Region r = null;
try {
r = Region.fromString(genome, line);
} catch (Exception e) {
System.err.println("Can't parse line " + line);
continue;
}
if (smooth != 0 && lineno >= smooth && lineno % smooth == 0) {
System.out.println(String.format("%d\t%.2f",
(lineno - smooth),
(numFound / (double)smooth)));
}
if (r == null) {
System.err.println("Null region from " + line);
continue;
}
int center = (r.getStart() + r.getEnd()) / 2;
int start = center - distance;
int end = center + distance;
if (start < 1) { start = 1; }
if (end > genome.getChromLength(r.getChrom())) { end = genome.getChromLength(r.getChrom());}
r = new Region(r.getGenome(), r.getChrom(), start, end);
char string[] = seqgen.execute(r).toCharArray();
boolean anyhits = false;
for (WeightMatrix wm : matrices) {
float minscore = (float)(wm.getMaxScore() * minpercent);
WMHit hit = WeightMatrixScanner.scanSequenceBestHit(wm,
minscore,
string);
if (hit != null) {
anyhits = true;
break;
}
}
int newval = anyhits ? 1 : 0;
if (smooth == 0) {
System.out.println(newval);
continue;
}
int oldval = found[lineno % smooth];
if (oldval != 0 && lineno >= smooth) {
numFound--;
}
if (newval != 0) {
numFound++;
}
found[lineno % smooth] = newval;
lineno++;
}
}
}