package org.seqcode.motifs;
import java.io.*;
import java.util.*;
import java.sql.*;
import java.text.ParseException;
import org.seqcode.data.*;
import org.seqcode.data.connections.DatabaseException;
import org.seqcode.data.connections.DatabaseFactory;
import org.seqcode.data.connections.UnknownRoleException;
import org.seqcode.data.io.BackgroundModelIO;
import org.seqcode.data.motifdb.CountsBackgroundModel;
import org.seqcode.data.motifdb.MarkovBackgroundModel;
import org.seqcode.data.motifdb.WeightMatrix;
import org.seqcode.data.motifdb.WeightMatrixImport;
import org.seqcode.genome.Genome;
import org.seqcode.gseutils.*;
/* Loads frequency and count matrix motifs to the db.
* The motifs are first converted to log-odds weightmatrices using a background model.
* The background model file can be generated using MakeBackgroundModel or left unspecified, in which case the mononucleotide background is taken from the genome.
Usage is similar to WeightMatrixImport:
java org.seqcode.gse.shaun.FreqMatrixImport --species 'Saccharomyces cerevisiae' --fmname HSF1 --fmversion TRANSFAC11.3 --fmtype TRANFSAC --fmfile HSF1.motif --backfile yeast.back
*/
public class FreqMatrixImport {
private static WeightMatrixImport wmimp = new WeightMatrixImport();
private static MarkovBackgroundModel back = null;
private static int[] indices = { 'A', 'C', 'G', 'T' };
private static int MAX_MOTIF_LEN = 200;
private static float SCALE_FACTOR = (float) 0.1;
public static void main(String args[]) throws IOException, ParseException, NotFoundException {
String species = null;
String fmname = null, fmversion = null, fmtype = null;
String fmfile = null; String backfile = null; String genome=null;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("--species")) {
species = args[++i];
if (species.indexOf(';') != -1) {
String[] pieces = species.split(";");
species = pieces[0];
}
}
if (args[i].equals("--fmname")) {
fmname = args[++i];
if (fmname.indexOf(';') != -1) {
String[] pieces = fmname.split(";");
fmname = pieces[0];
fmversion = pieces[1];
if (pieces.length >= 3) {
fmtype = pieces[2];
}
}
}
if (args[i].equals("--fmversion")) {
fmversion = args[++i];
}
if (args[i].equals("--fmtype")) {
fmtype = args[++i];
}
if (args[i].equals("--backfile")) {
backfile = args[++i];
}if (args[i].equals("--genome")) {
genome = args[++i];
}
if (args[i].equals("--") ||
args[i].equals("--fmfile")) {
fmfile = args[++i];
}
}
if (species == null) {
System.err.println("Must supply a --species"); System.exit(1);
}
if (fmfile == null) {
System.err.println("Must supply a --wmfile"); System.exit(1);
}
if(backfile==null && genome!=null){
back = new MarkovBackgroundModel(CountsBackgroundModel.modelFromWholeGenome(Genome.findGenome(genome)));
}else if(backfile==null){
System.err.println("Must supply --backfile or --genome"); System.exit(1);
}else{
back = BackgroundModelIO.parseMarkovBackgroundModel(backfile, Genome.findGenome(genome));
}
try {
if(fmname==null) {
insertMultiFMFromFile(species,fmversion, fmtype,fmfile);
} else {
if (fmversion == null) {
System.err.println("Must supply a --wmversion"); System.exit(1);
}
insertFMFromFile(species,fmname,fmversion,fmtype,fmfile);
}
} catch (SQLException ex) {
ex.printStackTrace();
} catch (NotFoundException ex) {
ex.printStackTrace();
System.err.println("Must supply a valid species and genome");
} catch (UnknownRoleException ex ){
ex.printStackTrace();
System.err.println("Couldn't connect to role annotations");
} catch (FileNotFoundException ex) {
ex.printStackTrace();
System.err.println("Couldn't find the input file");
} catch (ParseException ex) {
ex.printStackTrace();
} catch (IOException ex) {
ex.printStackTrace();
}
}
public void setBackground(MarkovBackgroundModel b){back=b;}
// Does not convert the freq matrix into a weight matrix; loads multiple motifs
public static List<WeightMatrix> readTransfacMatricesAsFreqMatrices(String freqFile) throws NumberFormatException, IOException{
List<WeightMatrix> matrices = new ArrayList<WeightMatrix>();
BufferedReader br = new BufferedReader(new FileReader(new File(freqFile)));
String line;
WeightMatrix matrix = null;
int motifCount=0;
Vector<float[][]> arrays = new Vector<float[][]>();
Vector<Integer> arrayLens = new Vector<Integer>();
Vector<String> names = new Vector<String>();
Vector<String> versions=new Vector<String>();
//Read in Transfac format first
boolean nameLoaded=false;
int matLen=0;
while((line = br.readLine()) != null) {
line = line.trim();
if(line.length() > 0) {
String[] pieces = line.split("\\s+");
if(pieces[0].equals("DE")){
names.add(pieces[1]);
nameLoaded=true;
arrays.add(new float[MAX_MOTIF_LEN][4]);
matLen=0;
}else if(pieces[0].equals("XX")){
arrayLens.add(matLen);
motifCount++;
}else if(nameLoaded && (pieces.length==5 || pieces.length==6)){
//Load the matrix
for(int i = 1; i <=4 ; i++) {
arrays.get(motifCount)[matLen][i-1] = Float.parseFloat(pieces[i]);
}
matLen++;
}
}
}
for(int m = 0; m<motifCount; m++){
//Make a new WeightMatrix
matrix = new WeightMatrix(arrayLens.get(m));
matrix.name=names.get(m);
//Convert the freq matrix to a weight matrix
for(int i = 0; i < arrayLens.get(m); i++) {
for(int j = 0; j < 4; j++) {
matrix.matrix[i][indices[j]] = arrays.get(m)[i][j];
}
}
matrices.add(matrix);
}
br.close();
return matrices;
}
//Unlike the WeigthMatrixImport version, this one actually loads more than one matrix!
public static LinkedList<WeightMatrix> readTransfacMatrices(String wmfile, String version) throws IOException {
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
BufferedReader br = new BufferedReader(new FileReader(new File(wmfile)));
String line;
WeightMatrix matrix = null;
int motifCount=0;
Vector<float[][]> arrays = new Vector<float[][]>();
Vector<Integer> arrayLens = new Vector<Integer>();
Vector<String> names = new Vector<String>();
Vector<String> versions=new Vector<String>();
//Read in Transfac format first
boolean nameLoaded=false;
int matLen=0;
while((line = br.readLine()) != null) {
line = line.trim();
if(line.length() > 0) {
String[] pieces = line.split("\\s+");
if(pieces[0].equals("DE")){
names.add(pieces[1]);
if(pieces.length>=3){
String v_string =pieces[2];
if(pieces.length>=4){for(int v=3; v<pieces.length; v++){v_string =v_string+","+pieces[v];}}
if(version!=null){versions.add(new String(v_string+","+version));}
else{versions.add(v_string);}
}else{
versions.add(version);
}
nameLoaded=true;
arrays.add(new float[MAX_MOTIF_LEN][4]);
matLen=0;
}else if(pieces[0].equals("XX")){
arrayLens.add(matLen);
motifCount++;
}else if(nameLoaded && (pieces.length==5 || pieces.length==6)){
//Load the matrix
for(int i = 1; i <=4 ; i++) {
arrays.get(motifCount)[matLen][i-1] = Float.parseFloat(pieces[i]);
}
matLen++;
}
}
}
for(int m = 0; m<motifCount; m++){
//Make a new WeightMatrix
matrix = new WeightMatrix(arrayLens.get(m));
matrix.name=names.get(m);
matrix.version=versions.get(m);
//Convert the freq matrix to a weight matrix
for(int i = 0; i < arrayLens.get(m); i++) {
float ttl=0;
for(int j=0; j<4; j++){ttl += arrays.get(m)[i][j];}
float currScale = SCALE_FACTOR*ttl;
for(int j = 0; j < 4; j++) {
matrix.matrix[i][indices[j]] = (float)(Math.log((((arrays.get(m)[i][j] + (currScale*back.getMarkovProb(j, 1)))/(ttl+currScale))/back.getMarkovProb(j, 1)))/Math.log(2));
}
}
matrices.add(matrix);
System.err.println("Added \"" + matrix.name + "\"\t"+matrix.version);
System.err.println("Max score = "+matrix.getMaxScore());
System.err.println(WeightMatrix.printMatrix(matrix));
}
br.close();
return matrices;
}
public static LinkedList<WeightMatrix> readTransfacMatrices(String wmfile, String version, float pseudocountTotal){
SCALE_FACTOR = pseudocountTotal;
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
try {
matrices = readTransfacMatrices(wmfile, version);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}return(matrices);
}
public static LinkedList<WeightMatrix> readTransfacMatrices(String wmfile){
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
try {
matrices = readTransfacMatrices(wmfile, "");
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}return(matrices);
}
public static Set<Integer> insertMultiFMFromFile(String species, String version,String fmtype, String fmfile)
throws IOException, SQLException, NotFoundException {
LinkedList<WeightMatrix> matrices=null;
HashSet<Integer> ids = new HashSet<Integer>();
if(fmtype.matches(".*TRANSFAC.*")) {
matrices = readTransfacMatrices(fmfile, version);
}else if (fmtype.matches(".*SEQ.*")) {
try {
matrices = readAlignedSequenceMatrices(fmfile);
} catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else {
System.err.println("Didn't see a program I recognize in the type. defaulting to TRANSFAC");
matrices = readTransfacMatrices(fmfile);
}
if(matrices!=null){
for(WeightMatrix matrix : matrices) {
matrix.type = fmtype;
matrix.species = species;
matrix.setLogOdds();
ids.add(wmimp.insertMatrixIntoDB(matrix));
}
}System.out.println("Matrices Loaded: "+matrices.size());
return ids;
}
/* reads a freq matrix from the specified file,
inserts it into the database with the specified name and
version, and returns its dbid. This means that the file may contain
only a single weight matrix*/
public static int insertFMFromFile(String species,
String fmname,
String fmversion,
String fmtype,
String fmfile) throws SQLException, NotFoundException, UnknownRoleException, FileNotFoundException, ParseException, IOException {
LinkedList<WeightMatrix> matrices;
if (fmtype.matches(".*TRANSFAC.*")) {
matrices = readTransfacMatrices(fmfile);
} else if (fmtype.matches(".*SEQ.*")) {
matrices = readAlignedSequenceMatrices(fmfile);
} else {
System.err.println("Didn't see a program I recognize in the type. defaulting to TRANSFAC");
matrices = readTransfacMatrices(fmfile);
}
matrices.get(0).name = fmname;
matrices.get(0).version = fmversion;
matrices.get(0).type = fmtype;
matrices.get(0).species = species;
matrices.get(0).setLogOdds();
return wmimp.insertMatrixIntoDB(matrices.get(0));
}
/**
* Constructs a matrix from a set of strings. The strings must all have the same length.
* The WeightMatrix returned has the frequencies of the bases at each position given.
*
* @param strings
* @return
* @throws IOException
* @throws ParseException
*/
public static WeightMatrix buildAlignedSequenceMatrix(Collection<String> strings) throws ParseException {
WeightMatrix wm = null;
int[] counts = null;
for(String line : strings) {
line = line.trim().toUpperCase();
if(line.length() > 0) {
if(wm == null) {
wm = new WeightMatrix(line.length());
counts = new int[line.length()];
for(int i = 0; i < wm.length(); i++) {
counts[i] = 0;
for(int j = 0; j < wm.matrix[i].length; j++) {
wm.matrix[i][j] = (float)0.0;
}
}
}
if(line.length() != wm.length()) {
throw new ParseException("Line \"" + line + "\" was of uneven length (" +
wm.length() + ")", 0);
}
for(int i = 0; i < line.length(); i++) {
char c = line.charAt(i);
if(c != 'N' && c != '-') {
wm.matrix[i][c] += (float)1.0;
counts[i] += 1;
}
}
}
}
for(int i = 0; wm != null && i < wm.length(); i++) {
if(counts[i] > 0) {
for(int j = 0; j < wm.matrix[i].length; j++) {
wm.matrix[i][j] /= (float)counts[i];
}
} else {
for(int j = 0; j < wm.matrix[i].length; j++) {
wm.matrix[i][j] = (float)1.0 / (float)(wm.matrix[i].length);
}
}
}
return wm;
}
/**
* Reads a matrix from a text-file. The file is assumed to have one sequence per line,
* and all lines must be the same length. The WeightMatrix returned has the frequencies
* of the bases at each position given.
*
* @param fname
* @return
* @throws IOException
* @throws ParseException
*/
public static LinkedList<WeightMatrix> readAlignedSequenceMatrices(String fname) throws IOException,ParseException {
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
int motifCount=0;
LinkedList<String> strings = new LinkedList<String>();
String line, currName="Motif";
BufferedReader br = new BufferedReader(new FileReader(new File(fname)));
while((line = br.readLine()) != null) {
line = line.trim();
if(line.contains(">")){
line.replaceAll(">", "");
currName = line;
if(strings.size()>0){
matrices.add(buildAlignedSequenceMatrix(strings));
matrices.get(motifCount).name = currName;
}
strings = new LinkedList<String>();
motifCount++;
}else if(line.length() > 0) {
strings.addLast(line);
}
}
br.close();
if(strings.size()>0){
matrices.add(buildAlignedSequenceMatrix(strings));
matrices.get(motifCount).name = currName;
}
return(matrices);
}
}