package org.seqcode.motifs;
import java.io.*;
import java.util.*;
import java.sql.*;
import java.text.ParseException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.seqcode.data.connections.DatabaseException;
import org.seqcode.data.connections.DatabaseFactory;
import org.seqcode.data.connections.UnknownRoleException;
import org.seqcode.data.motifdb.*;
import org.seqcode.genome.Genome;
import org.seqcode.genome.Species;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.genome.sequence.SequenceUtils;
import org.seqcode.gsebricks.verbs.*;
import org.seqcode.gseutils.*;
/**
* Reads a list of Points or Regions on stdin. These should be ordered and will
* be presented along the x-axis of the plot. The command line arguments specify a motif
* and a maximum distance from the point (or center of region) to look for the motif.
* The y-value of the plot is the average distance to a motif over a sliding window
* of n regions/points. The x-y values for the plot are produced on
* stdout.
*
* java MotifDistancePlot --species "$MM;mm9" --wm "Cdx2;Jaspar" [--maxdistance 500] [--smooth 100] [--minpercent .75] < regions.txt > plot.txt
*/
public class MotifDistancePlot {
public static void main(String args[]) throws Exception {
int smooth = Args.parseInteger(args,"smooth",100);
int maxdistance = Args.parseInteger(args,"maxdistance",500);
double minpercent = Args.parseDouble(args,"minpercent",.75);
Genome genome = Args.parseGenome(args).cdr();
SequenceGenerator seqgen = new SequenceGenerator(genome);
MarkovBackgroundModel bgModel = null;
String bgmodelname = Args.parseString(args,"bgmodel","whole genome zero order");
BackgroundModelMetadata md = BackgroundModelLoader.getBackgroundModel(bgmodelname,
1,
"MARKOV",
genome.getDBID());
Collection<WeightMatrix> matrices = Args.parseWeightMatrices(args);
if (bgModel == null) {
for (WeightMatrix m : matrices) {
m.toLogOdds();
}
} else {
for (WeightMatrix m : matrices) {
m.toLogOdds(bgModel);
}
}
int sumdistances = 0;
int distances[] = new int[smooth]; // circular buffer for whether or not there was a motif under a region
int temp[] = new int[smooth];
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String line = null;
int lineno = 0;
while ((line = reader.readLine()) != null) {
Region r = null;
try {
r = Region.fromString(genome, line);
} catch (Exception e) {
System.err.println("Can't parse line " + line);
continue;
}
if (smooth != 0 && lineno >= smooth && lineno % smooth == 0) {
double sum = 0;
for (int j = 0; j < temp.length; j++) {
temp[j] = distances[j];
sum += distances[j];
}
Arrays.sort(temp);
int median = temp[temp.length / 2];
double mean = sum / temp.length;
System.out.println(String.format("%d\t%d\t%.2f",
(lineno - smooth),
median, mean));
}
if (r == null) {
System.err.println("Null region from " + line);
continue;
}
int center = (r.getStart() + r.getEnd()) / 2;
int start = center - maxdistance;
int end = center + maxdistance;
if (start < 1) { start = 1; }
if (end > genome.getChromLength(r.getChrom())) { end = genome.getChromLength(r.getChrom());}
r = new Region(r.getGenome(), r.getChrom(), start, end);
char string[] = seqgen.execute(r).toCharArray();
int mindistance = maxdistance * 2;
for (WeightMatrix wm : matrices) {
float minscore = (float)(wm.getMaxScore() * minpercent);
List<WMHit> hits = WeightMatrixScanner.scanSequence(wm,
minscore,
string);
for (WMHit h : hits) {
int c = start + (h.start + h.end) / 2;
int d = Math.abs(c - center);
if (d < mindistance) {
mindistance = d;
}
}
}
if (smooth == 0) {
System.out.println(mindistance);
continue;
}
int oldval = distances[lineno % smooth];
if (lineno >= smooth) {
sumdistances -= oldval;
}
sumdistances += mindistance;
distances[lineno % smooth] = mindistance;
lineno++;
}
}
}