package org.seqcode.motifs;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.seqcode.data.io.RegionFileUtilities;
import org.seqcode.data.io.StreamGobbler;
import org.seqcode.data.motifdb.WeightMatrix;
import org.seqcode.genome.Genome;
import org.seqcode.genome.GenomeConfig;
import org.seqcode.genome.Species;
import org.seqcode.genome.location.NamedRegion;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.gsebricks.verbs.location.ChromRegionIterator;
import org.seqcode.gsebricks.verbs.motifs.WeightMatrixScoreProfile;
import org.seqcode.gsebricks.verbs.motifs.WeightMatrixScorer;
import org.seqcode.gseutils.ArgParser;
import org.seqcode.gseutils.Args;
import org.seqcode.gseutils.NotFoundException;
import org.seqcode.gseutils.Pair;
/**
* Utility to run MEME within a java class and from a command line. It also evaluates significance
* of the MEME discovered motifs against randomly picked genomic sequences and calculates ROC score.
*
* Input:
* - Genome
* - MEME path and MEME arguments
* - Window around peaks in which to get sequences
* Output:
* - MEME motif files
* - ROC score for each motif
*
* @author akshay kakumanu
*/
public class MemeER {
protected String MEMEpath;
protected String MEMEargs;
protected static String PWMfile = null; // pwm output file name
protected Float pseudo = (float)0.001;
public static final int MOTIF_FINDING_NEGSEQ=5000;
public static final double MOTIF_FINDING_ALLOWED_REPETITIVE = 0.2;
// public static final double MOTIF_MIN_ROC = 0.70;
public static double MOTIF_MIN_ROC = 0.70;
public MemeER(String path, String args) {
this.MEMEpath = path;
this.MEMEargs = args;
}
// option to set motif minimum ROC
public void setMotifMinROC(double minroc){MOTIF_MIN_ROC = minroc;}
public Pair<List<WeightMatrix>,List<WeightMatrix>> execute(List<String> sequences, File memeOutDirFullName, boolean bestOnly){
List<WeightMatrix> wm = new ArrayList<WeightMatrix>();
List<WeightMatrix> fm = new ArrayList<WeightMatrix>();
String memeOutDir = null;
File workingDir = new File(System.getProperty("user.dir"));
if(memeOutDirFullName == null){
String wDir = System.getProperty("user.dir");
memeOutDir = wDir+"/meme_out";
}else{
memeOutDir = memeOutDirFullName.getAbsolutePath();
}
try {
//Set up the input file
File seqFile= File.createTempFile("seq", ".fa", workingDir);
String seqFilename = seqFile.getCanonicalPath();
FileWriter fout = new FileWriter(seqFile);
int sCount=1;
for(String seq : sequences){
fout.write(">Seq"+sCount+"\n"+seq+"\n");
sCount++;
}
fout.close();
//Test if meme directory exists. If it does, recursively delete contents
File memeOutPath = new File(memeOutDir);
if(memeOutPath.exists())
deleteDirectory(memeOutPath);
//Call MEME
String MEMEcmd = MEMEpath+"/meme ";
Process proc = Runtime.getRuntime().exec(MEMEcmd+" "+seqFilename+" "+MEMEargs +" -o "+memeOutDir);
// any error message?
StreamGobbler errorGobbler = new
StreamGobbler(proc.getErrorStream(), "MEME_ERR", true);
// any output?
StreamGobbler outputGobbler = new
StreamGobbler(proc.getInputStream(), "MEME_OUT", true);
// kick them off
errorGobbler.start();
outputGobbler.start();
// any error???
int exitVal = proc.waitFor();
System.err.println("MEME ExitValue: " + exitVal);
File memeOutFile = new File(memeOutDir+"/meme.txt");
if (!memeOutFile.exists()) {
//Clean up intermediate files (fasta, etc)
if(seqFile.exists())
seqFile.delete();
throw new FileNotFoundException("Can't find file " + memeOutFile.getName());
}else{
BufferedReader memeReader = new BufferedReader(new FileReader(memeOutFile));
Map<String,Double> back = parseMEMEResultsForBackground(memeReader);
memeReader.close();
BufferedReader memeReader2 = new BufferedReader(new FileReader(memeOutFile));
List<Pair<WeightMatrix,Double>> currFM = parseMEMEResultsForFreqMatries(memeReader2);
memeReader2.close();
//Extract best motif?
if(bestOnly){
double minScore = Double.MAX_VALUE;
WeightMatrix bestMotif=null;
for(Pair<WeightMatrix,Double> m : currFM)
if(m.cdr()<minScore){
minScore = m.cdr();
bestMotif = m.car();
}
fm.add(bestMotif);
WeightMatrix wMatrix = WeightMatrix.getLogOddsVersion(bestMotif, back);
wm.add(wMatrix);
//System.out.println(bestMotif.getName()+"\t"+minScore);
//System.out.println(WeightMatrix.printMatrix(bestMotif));
}else{
for(Pair<WeightMatrix,Double> m : currFM){
fm.add(m.car());
WeightMatrix wMatrix = WeightMatrix.getLogOddsVersion(m.car(), back);
wm.add(wMatrix);
}
}
}
//Clean up intermediate files (fasta, etc)
if(seqFile.exists())
seqFile.delete();
proc.destroy();
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
return new Pair<List<WeightMatrix>,List<WeightMatrix>>(wm,fm);
}
protected Map<String,Double> parseMEMEResultsForBackground(BufferedReader memeOut){
Map<String,Double> back = new HashMap<String,Double>();
try {
String line=memeOut.readLine();
int lineno = 1;
while (line!=null && !line.matches(".*Background letter frequencies.*")) {
line = memeOut.readLine();
lineno++;
}
line = memeOut.readLine();
if(line!=null){
try {
String[] pieces = line.split("\\s+");
double A = Double.parseDouble(pieces[1]);
double C = Double.parseDouble(pieces[3]);
double G = Double.parseDouble(pieces[5]);
double T = Double.parseDouble(pieces[7]);
back.put("A", A);
back.put("C", C);
back.put("G", G);
back.put("T", T);
}
catch (NumberFormatException ex) {
System.err.println("At line " + lineno + ": " + line);
ex.printStackTrace();
throw ex;
}
catch (ArrayIndexOutOfBoundsException ex) {
System.err.println("At line " + lineno + ": " + line);
ex.printStackTrace();
throw ex;
}
}
} catch (NumberFormatException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return back;
}
protected List<Pair<WeightMatrix,Double>> parseMEMEResultsForFreqMatries(BufferedReader memeOut){
List<Pair<WeightMatrix,Double>> parsed = new ArrayList<Pair<WeightMatrix,Double>>();
String line;
int lineno = 1;
int motifCount=0;
try {
while((line = memeOut.readLine()) != null){
while (line!=null && !line.matches(".*letter-probability matrix.*")) {
line = memeOut.readLine();
lineno++;
}
if(line!=null){
motifCount++;
String lenStr = line.replaceFirst("^.*w=\\s*", "");
lenStr = lenStr.replaceFirst("\\s*nsites=.*", "");
int length = Integer.parseInt(lenStr);
String EStr = line.replaceFirst("^.*E=\\s*", "");
double Eval = Double.parseDouble(EStr);
WeightMatrix matrix = new WeightMatrix(length);
matrix.setNameVerType("Motif"+motifCount, "freq", "MEME");
for (int i = 0; i < length; i++) {
line = memeOut.readLine().replaceFirst("^\\s*", "");
lineno++;
try {
String[] pieces = line.split("\\s+");
float A = Float.parseFloat(pieces[0])+pseudo;
float C = Float.parseFloat(pieces[1])+pseudo;
float G = Float.parseFloat(pieces[2])+pseudo;
float T = Float.parseFloat(pieces[3])+pseudo;
float total = A+C+G+T;
matrix.matrix[i]['A'] = A/total;
matrix.matrix[i]['C'] = C/total;
matrix.matrix[i]['G'] = G/total;
matrix.matrix[i]['T'] = T/total;
}
catch (NumberFormatException ex) {
System.err.println("At line " + lineno + ": " + line);
ex.printStackTrace();
throw ex;
}
catch (ArrayIndexOutOfBoundsException ex) {
System.err.println("At line " + lineno + ": " + line);
ex.printStackTrace();
throw ex;
}
}
matrix.setLogOdds();
parsed.add(new Pair<WeightMatrix,Double>(matrix, Eval));
}
}
} catch (NumberFormatException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return parsed;
}
public static void main(String[] args) {
try {
ArgParser ap = new ArgParser(args);
String memeargs = Args.parseString(args, "memeargs", " -dna -mod zoops -revcomp -nostatus ");
int MEMEminw = Args.parseInteger(args, "mememinw", 6);
//MEME maxw
int MEMEmaxw = Args.parseInteger(args, "mememaxw", 18);
//MEME nmotifs option
int MEMEnmotifs = Args.parseInteger(args,"memenmotifs", 3);
int WinSize = Args.parseInteger(args, "win", 200);
if (!ap.hasKey("memepath")||!ap.hasKey("seq")||!ap.hasKey("locations")){
System.err.println("Usage:\n " +
"MemeER\n " +
"--geninfo <genome info file> \n " +
"--seq <fasta seq directory> \n " +
"--memepath <path to the meme bin dir (default: meme is in $PATH)>\n " +
"\nOPTIONS:\n " +
"--win <window of sequence to take around peaks(default=200)>\n " +
"--memenmotifs <number of motifs MEME should find for each condition (default=3)>\n " +
"--mememinw <minw arg for MEME (default=6)>\n " +
"--mememaxw <maxw arg for MEME (default=18)>\n " +
"--memeargs <additional args for MEME (default= -dna -mod zoops -revcomp -nostatus)>\n " +
"--out <output file prefix>\n " +
"--printPWM [flag to print PWM]\n " +
"--minROC <min ROC required for pwm output (default=0.7)>\n " +
"");
System.exit(0);
}
String GenPath = ap.getKeyValue("seq");
memeargs = memeargs + " -nmotifs "+MEMEnmotifs + " -minw "+MEMEminw+" -maxw "+MEMEmaxw;
// Multiple input files
String[] points = ap.getKeyValue("locations").split(";");
List<WeightMatrix> selectedMotifs = new ArrayList<WeightMatrix>();
List<Double> selectedMotifsRocs = new ArrayList<Double>();
GenomeConfig gcon = new GenomeConfig(args);
Genome gen = gcon.getGenome();
MemeER meme = new MemeER(Args.parseString(args, "memepath", ""), memeargs);
// if minROC is provided, set the new value
if (ap.hasKey("minROC")){
meme.setMotifMinROC(Args.parseDouble(args, "minROC", 0.7));
}
// specify output directory if provided
String outFolderName = null;
if (ap.hasKey("out")){
outFolderName = ap.getKeyValue("out");
}else{
outFolderName = System.getProperty("user.dir");
}
File outFolder = new File(outFolderName);
outFolder.mkdirs();
PrintWriter writer = null;
// print PWM if specified
if (ap.hasKey("printPWM")){
writer = new PrintWriter(new File(outFolderName+File.separator+"pwm.out"));
}
for(int p=0; p<points.length; p++){
List<Region> search_regs = RegionFileUtilities.loadRegionsFromPeakFile(gen, points[p], WinSize);
SequenceGenerator<Region> seqgen = new SequenceGenerator<Region>();
seqgen.useCache(true);
seqgen.useLocalFiles(true);
seqgen.setGenomePath(GenPath);
SequenceGenerator.setOffRegionCache();
String[] randomSequences=new String[5000];
if (!seqgen.isRegionCached()){
System.err.println("Caching sequences");
List<Region> randomRegions = randomRegionPick(gen, null, MemeER.MOTIF_FINDING_NEGSEQ,WinSize);
randomSequences = seqgen.setupRegionCache(search_regs, randomRegions);
System.err.println("Caching completed");
}
List<String> seqs = new ArrayList<String>();
for(int i=0; i<search_regs.size(); i++){
String currSeq = seqgen.execute(search_regs.get(i));
if(lowercaseFraction(currSeq)<=MOTIF_FINDING_ALLOWED_REPETITIVE){
seqs.add(currSeq);
}
}
// Pair<List<WeightMatrix>,List<WeightMatrix>> matrices = meme.execute(seqs, null, false);
// allowing to specify output directory
Pair<List<WeightMatrix>,List<WeightMatrix>> matrices = meme.execute(seqs, new File(outFolderName+File.separator+"meme_out"), false);
List<WeightMatrix> wm = matrices.car();
List<WeightMatrix> fm = matrices.cdr();
if(wm.size()>0){
//Evaluate the significance of the discovered motifs
double rocScores[] = meme.motifROCScores(wm,seqs,randomSequences);
System.err.println("MEME results for:" );
for(int w=0; w<fm.size(); w++){
if(fm.get(w)!=null){
System.err.println("\t"+fm.get(w).getName()+"\t"+ WeightMatrix.getConsensus(fm.get(w))+"\tROC:"+String.format("%.2f",rocScores[w]));
}
if(rocScores[w] > MemeER.MOTIF_MIN_ROC){
selectedMotifs.add(fm.get(w));
selectedMotifsRocs.add(rocScores[w]);
}
}
}
}
//Printing the selected motifs
for(int m =0; m<selectedMotifs.size(); m++){
// make motif ID consistent between the ROC and PWM
String out = WeightMatrix.printTransfacMatrix(selectedMotifs.get(m),selectedMotifs.get(m).getName());
// String out = WeightMatrix.printTransfacMatrix(selectedMotifs.get(m),"Motif_"+Integer.toString(m));
System.err.println(out);
// optional : print selected motif ROC and PWM to a file
if (writer != null){
writer.println("\t"+selectedMotifs.get(m).getName()+"\t"+ WeightMatrix.getConsensus(selectedMotifs.get(m))+"\tROC:"+String.format("%.2f",selectedMotifsRocs.get(m)));
writer.println(out);
}
}
// close the pwm output file
if (writer != null){writer.close();}
} catch (FileNotFoundException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
}
public static double lowercaseFraction(String seq){
double count = 0;
for (char c:seq.toCharArray())
if (Character.isLowerCase(c) || c=='N')
count++;
return count/(double)seq.length();
}
public static boolean deleteDirectory(File path) {
if( path.exists() ) {
File[] files = path.listFiles();
for(int i=0; i<files.length; i++) {
if(files[i].isDirectory()) {
deleteDirectory(files[i]);
}
else {
files[i].delete();
}
}
}
return( path.delete() );
}
public double[] motifROCScores(List<WeightMatrix> matrices, List<String> posSeqs, String[] negSeqs){
double[] rocScores = new double[matrices.size()];
int m=0;
for(WeightMatrix motif : matrices){
List<Double> posScores = new ArrayList<Double>();
List<Double> negScores = new ArrayList<Double>();
if(motif!=null){
WeightMatrixScorer scorer = new WeightMatrixScorer(motif);
for(String posSeq : posSeqs){
WeightMatrixScoreProfile profiler = scorer.execute(posSeq);
posScores.add(profiler.getMaxScore());
}
for(int s=0; s<negSeqs.length; s++){
WeightMatrixScoreProfile profiler = scorer.execute(negSeqs[s]);
negScores.add(profiler.getMaxScore());
}
}
rocScores[m] = calcROCAUC(posScores, negScores);
m++;
}
return rocScores;
}
protected double calcROCAUC(List<Double> posMaxScores, List<Double> negMaxScores) {
double auc = 0;
if(posMaxScores.size()==0)
return 0;
if(negMaxScores.size()==0)
return 1;
ArrayList<LabeledDouble> data = new ArrayList<LabeledDouble>();
for(Double d : posMaxScores)
data.add(new LabeledDouble(d, 1));
for(Double d : negMaxScores)
data.add(new LabeledDouble(d, 0));
Collections.sort(data);
double pCount = (double)posMaxScores.size();
double nCount = (double)negMaxScores.size();
int x=0;
double possum=0;
double lastsn=0;
double lastfpr=0;
double lastdval = 10000000;
for(LabeledDouble d : data){
possum+=d.label;
if(d.dat!=lastdval){
double sn = possum/pCount;
double fp = (x+1)-possum;
double sp = (nCount-fp)/nCount;
double fpr=1-sp;
if(x>0){
//Rectangle //Triangle
auc += ((fpr-lastfpr)*lastsn) + ((sn-lastsn)*(fpr-lastfpr)/2);
}
lastfpr=fpr;
lastsn = sn;
}
lastdval = d.dat;
x++;
}
return auc;
}
public class LabeledDouble implements Comparable<LabeledDouble>{
public Double dat;
public Integer label;
public LabeledDouble(Double d, Integer i){dat=d; label=i;}
public int compareTo(LabeledDouble ld) {
if(dat > ld.dat){return(-1);}
else if(dat < ld.dat){return(1);}
else{return 0;}
}
}
public static List<Region> randomRegionPick(Genome gen, List<Region> blackList, int numSamples, int sampleSize){
List<Region> regs = new ArrayList<Region>();
Random rand = new Random();
int validSamples=0;
//First see how big the genome is:
int numChroms=0;
long genomeSize=0;
long [] chromoSize = new long[gen.getChromList().size()];
String [] chromoNames = new String[gen.getChromList().size()];
Iterator<NamedRegion> chroms = new ChromRegionIterator(gen);
while (chroms.hasNext()) {
NamedRegion currentChrom = chroms.next();
genomeSize += (double)currentChrom.getWidth();
chromoSize[numChroms]=currentChrom.getWidth();
chromoNames[numChroms]=currentChrom.getChrom();
numChroms++;
}
//Now, iteratively generate random positions and check if they are valid and not overlapping repeats.
while(validSamples<numSamples){
Region potential;
long randPos = (long)(1+(rand.nextDouble()*genomeSize));
//find the chr
boolean found=false;
long total=0;
for(int c=0; c<numChroms && !found; c++){
if(randPos<total+chromoSize[c]){
found=true;
if(randPos+sampleSize<total+chromoSize[c]){
potential = new Region(gen, chromoNames[c], (int)(randPos-total), (int)(randPos+sampleSize-total));
//is this region in the blacklist?
boolean valid=true;
if(blackList!=null){
for(Region r : blackList){
if(potential.overlaps(r)){valid=false;}
}
}
if(valid){
validSamples++;
regs.add(potential);
}
}
}total+=chromoSize[c];
}
}
return(regs);
}
}