package org.seqcode.motifs;
/**
* Reads a list of n motif names on STDIN (each line contains a set of one or more names for the same gene).
* Outputs on STDOUT an n x n matrix of motif presence. The value in each entry (row i column j) is the highest
* score of motif i in the promoter of gene j. An entry of zero indicates that the motif or gene couldn't
* be found or that no motif was found.
*
* Input lines should contain tab-separated aliases for the same gene. Output will be tab separated
*
* cat aliases.txt | java MotifOccurrenceMatrix --species "$MM;mm8" --genes refGene --genes ensGene --upstream 10000 --downstream 2000
*/
import java.io.*;
import java.util.*;
import java.sql.SQLException;
import org.seqcode.data.motifdb.*;
import org.seqcode.genome.Genome;
import org.seqcode.genome.Species;
import org.seqcode.genome.location.ChromosomeInfo;
import org.seqcode.genome.location.Gene;
import org.seqcode.genome.location.Region;
import org.seqcode.genome.location.StrandedRegion;
import org.seqcode.genome.sequence.SequenceGenerator;
import org.seqcode.gsebricks.verbs.location.RefGeneGenerator;
import org.seqcode.gseutils.Args;
import org.seqcode.gseutils.NotFoundException;
import org.seqcode.motifs.WeightMatrixScanner;
public class MotifOccurrenceMatrix {
private List<List<String>> genes;
private List<WeightMatrix> matrices;
private double[][] bestScores;
private int upstream, downstream;
private List<RefGeneGenerator> geneGenerators;
private Genome genome;
private WeightMatrixLoader loader;
public static void main(String args[]) throws Exception {
MotifOccurrenceMatrix mom = new MotifOccurrenceMatrix();
mom.parseArgs(args);
mom.readGenes();
mom.getMatrices();
mom.computeMatrix();
mom.printMatrix();
}
public MotifOccurrenceMatrix() {
loader = new WeightMatrixLoader();
}
public void parseArgs(String args[]) throws IOException, NotFoundException {
geneGenerators = Args.parseGenes(args);
upstream = Args.parseInteger(args,"upstream",10000);
downstream = Args.parseInteger(args,"downstream",2000);
genome = Args.parseGenome(args).getLast();
}
/**
* reads the list of genes from STDIN. Each line is one elemen of genes
* and contains the tab-separated gene names
*/
public void readGenes() throws IOException {
genes = new ArrayList<List<String>>();
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String line = null;
while ((line = reader.readLine()) != null) {
ArrayList<String> aliases = new ArrayList<String>();
String[] split = line.split("\\t");
for (int i = 0; i < split.length; i++) {
aliases.add(split[i]);
}
genes.add(aliases);
}
reader.close();
bestScores = new double[genes.size()][genes.size()];
}
/**
* Finds a WeightMatrix for each gene by searching through the aliases
* in order.
*
* It might eventually make sense to store a set of matrices and
* then score them all and take the highest score for each target gene.
*/
public void getMatrices() {
List<String> types = new ArrayList<String>();
types.add("TRANSFAC");
types.add("MEME");
types.add("TAMO");
types.add(null);
matrices = new ArrayList<WeightMatrix>();
for (int i = 0; i < genes.size(); i++) {
List<String> aliases = genes.get(i);
WeightMatrix matrix = null;
// first try the aliases listed in the input
for (String a : aliases) {
for (String t : types) {
Collection<WeightMatrix> collection = loader.query(a, null, t);
for (WeightMatrix m : collection) {
if (m != null) {
matrix = m;
break;
}
}
if (matrix != null) {break;}
}
if (matrix != null) {break;}
}
if (matrix == null) {
// now see what aliases we have in our database for the gene and try all of those
HashSet<String> morealiases = new HashSet<String>();
for (String a : aliases) {
for (RefGeneGenerator g : geneGenerators) {
Iterator<Gene> iter = g.byName(a);
while (iter.hasNext()) {
Gene gene = iter.next();
morealiases.addAll(gene.getAliases());
}
}
}
for (String a : morealiases) {
for (String t : types) {
Collection<WeightMatrix> collection = loader.query(a, null, t);
for (WeightMatrix m : collection) {
if (m != null) {
matrix = m;
break;
}
}
if (matrix != null) {break;}
}
if (matrix != null) {break;}
}
}
if (matrix == null) {
System.err.println("No Matrix for " + aliases);
}
matrices.add(matrix);
}
}
/**
* Returns the genomic region to search for a given set of aliases.
* Looks through the aliases in order, trying all elements of geneGenerators
* on the first before going on to the second.
*
* Returns null if no promoter region can be found.
*/
public Region getPromoterForGene(List<String> aliases) {
for (String a : aliases) {
for (RefGeneGenerator g : geneGenerators) {
Iterator<Gene> iter = g.byName(a);
if (iter.hasNext()) {
Gene gene = iter.next();
StrandedRegion sr = new StrandedRegion(gene.getGenome(),
gene.getChrom(),
gene.getFivePrime(),
gene.getFivePrime(),
gene.getStrand());
sr = sr.expand(upstream, downstream);
System.err.println(a + " -> " + sr);
return sr;
}
}
}
System.err.println("No promoter for " + aliases);
return null;
}
public void computeMatrix() throws SQLException {
WMHitScoreComparator comparator = new WMHitScoreComparator();
List<Region> promoters = new ArrayList<Region>();
SequenceGenerator<Region> seqgen = new SequenceGenerator<Region>(genome);
for (int j = 0; j < genes.size(); j++) {
Region promoter = getPromoterForGene(genes.get(j));
promoters.add(promoter);
}
for (int i = 0; i < genes.size(); i++) {
WeightMatrix m = matrices.get(i);
if (m == null) {
continue;
}
for (int j = 0; j < genes.size(); j++) {
Region promoter = promoters.get(j);
if (promoter == null) {
continue;
}
String seq = seqgen.execute(promoter);
List<WMHit> hits = WeightMatrixScanner.scanSequence(m,
(float)(m.getMaxScore() * .75),
seq.toCharArray());
Collections.sort(hits,comparator);
if (hits.size() > 0) {
bestScores[i][j] = hits.get(0).score;
} else {
bestScores[i][j] = 0;
}
}
}
}
public void printMatrix () {
for (int i = 0; i < bestScores.length; i++) {
for (int j = 0; j < bestScores[i].length - 1; j++) {
System.out.print(bestScores[i][j] + "\t");
}
System.out.print(bestScores[i][bestScores[i].length - 1] + "\n");
}
}
}
class WMHitScoreComparator implements Comparator<WMHit> {
public int compare(WMHit a, WMHit b) {
return (int)(100 * (b.score - a.score));
}
}