package org.seqcode.motifs;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.seqcode.data.io.BackgroundModelIO;
import org.seqcode.data.motifdb.CountsBackgroundModel;
import org.seqcode.data.motifdb.MarkovBackgroundModel;
import org.seqcode.data.motifdb.WeightMatrix;
import org.seqcode.genome.Genome;
import org.seqcode.genome.Species;
import org.seqcode.genome.sequence.RandomSequenceGenerator;
import org.seqcode.gsebricks.verbs.motifs.WeightMatrixScoreProfile;
import org.seqcode.gsebricks.verbs.motifs.WeightMatrixScorer;
import org.seqcode.gseutils.ArgParser;
import org.seqcode.gseutils.NotFoundException;
import org.seqcode.gseutils.Pair;
public class MarkovMotifThresholdFinder {
private WeightMatrix motif = null;
private MarkovBackgroundModel back;
private ArrayList<String> seqSet = new ArrayList<String>();
private static int numTest=100000;
private int window=300;
private boolean ROC=false;
private boolean scored=false;
private boolean seqGenerated=false;
private ArrayList<Double> scores;
public static void main(String[] args) throws IOException, ParseException {
MarkovMotifThresholdFinder finder;
ArgParser ap = new ArgParser(args);
if(!ap.hasKey("species") || !ap.hasKey("genome")||(!ap.hasKey("motifname")&&!ap.hasKey("motiffile"))) {
System.err.println("Usage:\n " +
"MarkovMotifThresholdFinder " +
"--species <organism name> " +
"--genome <genome version> "+
"--motifname <weightmatrix name> "+
"--motifversion <weightmatrix version> " +
"--motiffile <file containing motifs> "+
"--back <background Markov model> "+
"--win <window of sequence around positive/negative points> "+
"--num <number of sequences to sample> " +
"--printroc ");
return;
}
String species = ap.getKeyValue("species");
String genome = ap.getKeyValue("genome");
String motifversion=null;
if(ap.hasKey("motifversion")){motifversion = ap.getKeyValue("motifversion");}
String backFile =ap.hasKey("back") ? ap.getKeyValue("back"):null;
int win = ap.hasKey("win") ? new Integer(ap.getKeyValue("win")).intValue():-1;
int numSim = 1000000;
if(ap.hasKey("num")){
numSim = new Integer(ap.getKeyValue("num")).intValue();
}
boolean printROC= ap.hasKey("printroc");
boolean loadFromFile = ap.hasKey("motiffile");
try {
//Load genome
Species currorg = Species.getSpecies(species);
//Genome currgen = currorg.getGenome(genome);
//Load the background model
MarkovBackgroundModel backMod;
if(backFile == null){
backMod = new MarkovBackgroundModel(CountsBackgroundModel.modelFromWholeGenome(Genome.findGenome(genome)));
}else{
backMod = BackgroundModelIO.parseMarkovBackgroundModel(backFile, Genome.findGenome(genome));
}
//Pre-load the random sequences
ArrayList<String> randSeq = new ArrayList<String>();
RandomSequenceGenerator gen = new RandomSequenceGenerator(backMod);
for(int i=0; i<numTest; i++){
randSeq.add(gen.execute(win));
}
//Load motifs
List<WeightMatrix> motifList=new ArrayList<WeightMatrix>();
if(loadFromFile){
String motifFile = ap.getKeyValue("motiffile");
FreqMatrixImport motifImport = new FreqMatrixImport();
motifImport.setBackground(backMod);
motifList.addAll(motifImport.readTransfacMatrices(motifFile));
}else{
String motifname = ap.getKeyValue("motifname");
if (motifname.indexOf(';') != -1) {
String[] pieces = motifname.split(";");
motifname = pieces[0];
motifversion = pieces[1];
}
int wmid = WeightMatrix.getWeightMatrixID(currorg.getDBID(), motifname, motifversion);
motifList.add(WeightMatrix.getWeightMatrix(wmid));
}
if(printROC){
for(WeightMatrix matrix : motifList){
System.out.println("ROC:");
finder = new MarkovMotifThresholdFinder(matrix, backMod, numSim);
if(win >0){finder.setWin(win);}
finder.setRandomSeq(randSeq);
finder.setROC(printROC);
double thres_1 = finder.execute(0.1);
}
}else{
System.out.println("Name\tMin\tMax\tThres0.1\tThres0.05\tThres0.01\tThres0.005\tThres0.001");
for(WeightMatrix matrix : motifList){
//Run the threshold finder
//System.err.println("Initializing the threshold finder");
finder = new MarkovMotifThresholdFinder(matrix, backMod, numSim);
if(win >0){finder.setWin(win);}
finder.setRandomSeq(randSeq);
finder.setROC(printROC);
//System.err.println("Finding the best threshold");
double max = matrix.getMaxScore();
double min = matrix.getMinScore();
double thres_001 = finder.execute(0.001);
double thres_005 = finder.execute(0.005);
double thres_01 = finder.execute(0.01);
double thres_05 = finder.execute(0.05);
double thres_1 = finder.execute(0.1);
System.out.println(matrix.getName()+"\t"+min+"\t"+max+"\t"+thres_1+"\t"+thres_05+"\t"+thres_01+"\t"+thres_005+"\t"+thres_001);
//System.out.println("Threshold for Sp=0.005:\t"+thres_005);
//System.out.println("Threshold for Sp=0.01:\t"+thres_01);
//System.out.println("Threshold for Sp=0.05:\t"+thres_05);
}
}
} catch (NotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
//Constructors
public MarkovMotifThresholdFinder(WeightMatrix wm, MarkovBackgroundModel markov){
this(wm, markov, numTest);
}
public MarkovMotifThresholdFinder(WeightMatrix wm, MarkovBackgroundModel markov, int numSim){
numTest=numSim;
motif=wm;
if(wm==null){System.err.println("No motif specified");System.exit(1);}
back=markov;
}
public void setNumTest(int nt){numTest=nt;}
public void setWin(int w){window=w;}
public void setROC(boolean r){ROC = r;}
public void setRandomSeq(ArrayList<String> rand){if(rand.size()>0){seqSet = rand; seqGenerated=true;}}
//Find the motif-scoring threshold for the given specificity rate
public double execute(double Sp){
if(Sp<0 || Sp>1){System.err.println("Invalid Sp value in MarkovMotifThreshold");System.exit(1);}
WeightMatrixScorer scorer = new WeightMatrixScorer(motif);
double bestThres=0.0;
//Find the scores for the random sequences
if(!scored){
//Generate the sequences first
//Simulate sequences using the markov background
if(!seqGenerated){
RandomSequenceGenerator gen = new RandomSequenceGenerator(back);
for(int i=0; i<numTest; i++){
seqSet.add(gen.execute(window));
}
seqGenerated=true;
}
scores=new ArrayList<Double>();
for(String s : seqSet){
WeightMatrixScoreProfile profiler = scorer.execute(s);
scores.add(new Double(profiler.getMaxScore(profiler.getMaxIndex())));
}
Collections.sort(scores);
scored=true;
}
//Find the score which corresponds to the required Specificity rate
int index = (int)((double)scores.size()*(1-Sp));
bestThres=scores.get(index);
//Print an ROC if required
if(ROC){
System.out.println("i\tThreshold\tPerformance\tSp");
int count=1;
for(Double d : scores){
double currThres = d.doubleValue();
double currSp =(double)count/(double)scores.size();
System.out.println(count+"\t"+currThres+"\t"+currSp);
count++;
}
}
return bestThres;
}
public Score2Sp getMotifROC(){
ArrayList<Pair<Double,Double>> scoreVsSp = new ArrayList<Pair<Double,Double>>();
WeightMatrixScorer scorer = new WeightMatrixScorer(motif);
//Find the scores for the random sequences
if(!scored){
//Generate the sequences first
//Simulate sequences using the markov background
if(!seqGenerated){
RandomSequenceGenerator gen = new RandomSequenceGenerator(back);
for(int i=0; i<numTest; i++){
seqSet.add(gen.execute(window));
}
seqGenerated=true;
}
scores=new ArrayList<Double>();
for(String s : seqSet){
WeightMatrixScoreProfile profiler = scorer.execute(s);
scores.add(new Double(profiler.getMaxScore(profiler.getMaxIndex())));
}
Collections.sort(scores);
scored=true;
}
int count=1;
for(Double d : scores){
double currThres = d.doubleValue();
double currSp =(double)count/(double)scores.size();
scoreVsSp.add(new Pair<Double,Double>(currThres,currSp));
count++;
}
return(new Score2Sp(scoreVsSp));
}
}