/**
*
*/
package org.seqcode.data.io.parsing;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.text.ParseException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.seqcode.data.io.LineByLineFileReader;
import org.seqcode.data.io.Points2RegionsConverter;
import org.seqcode.data.motifdb.WeightMatrix;
import org.seqcode.data.motifdb.WeightMatrixImport;
import org.seqcode.genome.Species;
import org.seqcode.gseutils.NotFoundException;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.impl.DenseDoubleMatrix2D;
/**
* @author rca
* This class parses output files from a variety of common motif discovery
* algorithms to create WeightMatrix objects, and can also parse raw PWMs in a
* variety of formats.
*/
public class PWMParser {
private static Logger logger = Logger.getLogger(PWMParser.class);
/**
* Parse a PWM in which the rows are nucleotides and the columns are positions.
* This method only checks that each row has the expected quantity of number
* tokens, but doesn't make any assumptions about the format of the PWM
* (freq, counts, log-odds, etc...). Thus each row may contain other
* non-number tokens such as the nucleotide letter, etc...
* @param length the number of positions in the pwm
* @param aRow the row for A
* @param cRow the row for C
* @param gRow the row for G
* @param tRow the row for T
* @return a 4 x length DenseDoubleMatrix2D containing the parsed values
* @throws IOException if an IO error occurs tokenizing one of the lines
* @throws ParseException if the quantity of number tokens is not <i>length</i>
*/
public static DenseDoubleMatrix2D parsePWM(int length, String aRow, String cRow, String gRow, String tRow) throws IOException, ParseException {
DenseDoubleMatrix2D rawPWM = new DenseDoubleMatrix2D(4, length);
String[] rows = new String[] {aRow, cRow, gRow, tRow};
for (int i = 0; i < rows.length; i++) {
String row = rows[i];
StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(row));
int valIndex = 0;
boolean done = false;
while (!done && (tokenizer.nextToken() != StreamTokenizer.TT_EOF)) {
if (tokenizer.ttype == StreamTokenizer.TT_NUMBER) {
rawPWM.setQuick(i, valIndex, tokenizer.nval);
valIndex++;
done = (valIndex == length);
}
}
//check that no more number tokens are available
while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
if (tokenizer.nextToken() == StreamTokenizer.TT_NUMBER) {
valIndex++;
}
}
//check whether the right number of tokens were parsed
if (valIndex != length) {
throw new ParseException("Expected " + length + " numbers, but parsed " + valIndex + " from:\n" + row, row.length());
}
}
return rawPWM;
}
/**
* Parse a PWM in which the columns are nucleotides and the rows are positions.
* This method will attempt to handle rows that include a row number as the
* first number token. This method only checks that each column has the four
* number tokens (and a possible row number), but doesn't make any
* assumptions about the format of the PWM (freq, counts, log-odds, etc...).
* Thus each row may contain other non-number tokens such as the nucleotide
* letter, etc...
* @param posRows An array of Strings that are the consecutive rows of the PWM
* @return a 4 x length DenseDoubleMatrix2D containing the parsed values
* @throws IOException if an IO error occurs tokenizing one of the lines
* @throws ParseException if the quantity of number tokens is not 4, or 5
* (including a row number)
*/
public static DenseDoubleMatrix2D parsePWM(String[] posRows) throws IOException, ParseException {
DenseDoubleMatrix2D rawPWM = new DenseDoubleMatrix2D(4, posRows.length);
boolean hasRowNumbers = false;
boolean zeroBased = false;
for (int i = 0; i < posRows.length; i++) {
String row = posRows[i];
StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(row));
int valIndex = 0;
boolean done = false;
boolean skippedHeader = false;
while (!done && (tokenizer.nextToken() != StreamTokenizer.TT_EOF)) {
if (tokenizer.ttype == StreamTokenizer.TT_NUMBER) {
//handle row numbers
if (hasRowNumbers && (valIndex == 0) && !skippedHeader) {
if (zeroBased && (tokenizer.nval == i)) {
skippedHeader = true;
}
else if (!zeroBased && (tokenizer.nval == i+1)) {
skippedHeader = true;
}
else {
throw new ParseException("Unknown PWM format, or incorrect row number:\n" + row, 0);
}
}
else {
rawPWM.setQuick(valIndex, i, tokenizer.nval);
valIndex++;
done = (valIndex == 4);
}
}
}
//check that no more number tokens are available
double temp = Double.NaN;
while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
if (tokenizer.nextToken() == StreamTokenizer.TT_NUMBER) {
temp = tokenizer.nval;
valIndex++;
}
}
//check whether the right number of tokens were parsed
if (valIndex != 4) {
//If the first column may be a row number try to keep going
if ((valIndex == 5) && (i == 0)
&& ((rawPWM.getQuick(0, 0) == 0) || (rawPWM.getQuick(0, 0) == 1))) {
if (rawPWM.getQuick(0, 0) == 0) {
zeroBased = true;
}
else {
zeroBased = false;
}
//adjust the matrix to correct for the row number
for (int j = 0; j < 3; j++) {
rawPWM.setQuick(j, 0, rawPWM.getQuick(j+1, 0));
}
rawPWM.setQuick(3, 0, temp);
hasRowNumbers = true;
}
else {
throw new ParseException("Expected 4 numbers, but parsed " + valIndex + " from:\n" + row, row.length());
}
}
}
return rawPWM;
}
/**
*
* @param priorityFile
* @return
* @throws IOException
*/
public static WeightMatrix parsePriorityBestOutput(String priorityFile) throws IOException, ParseException {
LineByLineFileReader lblfr = null;
try {
lblfr = new LineByLineFileReader();
lblfr.openFile(priorityFile);
char[] rowOrder = { 'A', 'C', 'G', 'T' };
String tfString = "Transcription factor:";
String tfname = null;
DenseDoubleMatrix2D rawPWM = null;
int pwmLength = -1;
boolean done = false;
String currLine = lblfr.readLine();
while (!done && (currLine != null)) {
//read the tf name
if (currLine.startsWith(tfString)) {
tfname = currLine.substring(tfString.length()).trim();
}
else if (currLine.startsWith("Phi:")) {
//read the header row
currLine = lblfr.readLine().trim();
pwmLength = Integer.valueOf(currLine.substring(currLine.lastIndexOf(" ") + 1));
//read the pwm
String aRow = lblfr.readLine();
String cRow = lblfr.readLine();
String gRow = lblfr.readLine();
String tRow = lblfr.readLine();
rawPWM = PWMParser.parsePWM(pwmLength, aRow, cRow, gRow, tRow);
done = true;
}
currLine = lblfr.readLine();
}
WeightMatrix wm = new WeightMatrix(pwmLength);
wm.name = tfname;
for (int i = 0; i < rowOrder.length; i++) {
for (int j = 0; j < pwmLength; j++) {
wm.matrix[j][rowOrder[i]] = (float)rawPWM.getQuick(i, j);
}
}
return wm;
}
finally {
if (lblfr != null) {
lblfr.closeFile();
}
}
}
// public static List<WeightMatrix> parsePriorityTrialOutput(String priorityFile) {
//
// }
//
/////////////////////////////////////////////////////////////////////////////
// old code from WeightMatrixImport
/////////////////////////////////////////////////////////////////////////////
public static LinkedList<WeightMatrix> readTamoMatrices(String wmfile) throws IOException {
int[] indices = { 'A', 'C', 'G', 'T' };
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
BufferedReader br = new BufferedReader(new FileReader(new File(wmfile)));
String line;
String name = null, version = null;
WeightMatrix matrix = null;
Vector<float[]> array = null;
while ((line = br.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
if (name == null) {
int index = line.indexOf("\\s+");
name = line.substring(0, index);
version = line.substring(index, line.length()).trim();
array = new Vector<float[]>();
}
else {
String[] sarray = line.split("\\s+");
float[] col = new float[4];
for (int i = 0; i < col.length; i++) {
col[i] = Float.parseFloat(sarray[i]);
}
array.add(col);
}
}
else {
if (name != null) {
matrix = new WeightMatrix(array.size());
matrix.name = name;
matrix.version = version;
matrix.type = "TAMO";
for (int i = 0; i < array.size(); i++) {
for (int j = 0; j < indices.length; j++) {
matrix.matrix[i][indices[j]] = array.get(i)[j];
}
}
matrices.add(matrix);
// System.err.println("Added \"" + matrix.name + "\"");
}
array = null;
name = null;
version = null;
matrix = null;
}
}
if (name != null) {
matrix = new WeightMatrix(array.size());
matrix.name = name;
matrix.version = version;
for (int i = 0; i < array.size(); i++) {
for (int j = 0; j < indices.length; j++) {
matrix.matrix[i][indices[j]] = array.get(i)[j];
}
}
matrices.add(matrix);
// System.err.println("Added \"" + matrix.name + "\"");
}
br.close();
return matrices;
}
/**
* Parses weight matrices in JASPAR format. These needs the files in, eg,
* psrg/datasets/jaspar-10_12_09/all_data/FlatFileDir. The filename should be
* the name of the matrix_list.txt file; this method will then figure out the
* names of the individual matrix files from that.
*
* The species in matrix_list.txt are the IDs from the NCBI taxonomy database.
* This method has a hard-coded reference to a copy of that DB in AFS since we
* haven't loaded it anywhere else yet.
*
*/
public static List<WeightMatrix> readJASPARFreqMatrices(String wmfile, String wmversion) throws IOException {
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
Map<Integer, String> speciesmap = new HashMap<Integer, String>();
String taxofile = "/afs/csail.mit.edu/group/psrg/datasets/ncbi_taxonomy_nov_09/id_to_name.tsv";
BufferedReader br = new BufferedReader(new FileReader(new File(taxofile)));
String line = null;
while ((line = br.readLine()) != null) {
String pieces[] = line.split("\\t");
speciesmap.put(Integer.parseInt(pieces[0]), pieces[1]);
}
br.close();
File inputfile = new File(wmfile);
br = new BufferedReader(new FileReader(inputfile));
String dirname = inputfile.getParent();
Pattern specpatt = Pattern.compile("species \"(\\d+)\"");
Pattern grouppatt = Pattern.compile("tax_group \"(\\w+)\"");
while ((line = br.readLine()) != null) {
String pieces[] = line.split("\\t");
String matrixfile = dirname + "/" + pieces[0] + ".pfm";
WeightMatrix matrix = null;
try {
matrix = readJASPARFile(matrixfile);
if (matrix == null) {
continue;
}
}
catch (IOException e) {
System.err.println("Couldn't read matrixfile " + e.toString());
continue;
}
matrix.name = pieces[2];
matrix.type = "JASPAR";
matrix.version = wmversion + " " + pieces[0];
Matcher matcher = specpatt.matcher(pieces[4]);
if (matcher.find()) {
int taxospecid = Integer.parseInt(matcher.group(1));
String specname = speciesmap.get(taxospecid);
try {
matrix.speciesid = (new Species(specname)).getDBID();
matrix.species = specname;
}
catch (NotFoundException e) {
System.err.println("Couldn't find species " + specname + " from " + taxospecid + " in " + line);
continue;
// ignore it and move on
}
}
else {
matcher = grouppatt.matcher(pieces[4]);
if (matcher.find() && matcher.group(1).equals("mammals")) {
try {
matrix.speciesid = (new Species("Mus musculus")).getDBID();
}
catch (NotFoundException e) {
continue; // shouldn't happen unless I typoed the name above
}
}
else {
System.err.println("No species in " + line);
continue;
}
}
matrices.add(matrix);
}
br.close();
return matrices;
}
public static WeightMatrix readJASPARFile(String fname) throws IOException {
BufferedReader br = new BufferedReader(new FileReader(new File(fname)));
String line = null;
WeightMatrix matrix = null;
int[] indices = { 'A', 'C', 'G', 'T' };
int lineno = 0;
while ((line = br.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
String pieces[] = line.split("\\s+");
if (matrix == null) {
matrix = new WeightMatrix(pieces.length);
}
for (int i = 0; i < pieces.length; i++) {
matrix.matrix[i][indices[lineno]] = Float.parseFloat(pieces[i]);
}
lineno++;
}
}
br.close();
matrix.normalizeFrequencies();
return matrix;
}
public static WeightMatrix readUniProbeFile(String fname) throws IOException {
BufferedReader br = new BufferedReader(new FileReader(new File(fname)));
String line = null;
WeightMatrix matrix = null;
while((line = br.readLine()) != null) {
line = line.trim();
if(line.length() > 0) {
String pieces[] = line.split("\\s+");
if (pieces[0].matches("[ACTG]:")) {
int matrixlen = pieces.length - 1;
if (matrix == null) {
matrix = new WeightMatrix(matrixlen);
}
for (int i = 1; i < pieces.length; i++) {
matrix.matrix[i-1][pieces[0].charAt(0)] = Float.parseFloat(pieces[i]);
}
}
}
}
return matrix;
}
/**
* Parses in weight matrices in transfac format with counts or frequencies
* (not log odds)
*
* @param wmfile
* @param version
* @return
* @throws IOException
*/
public static List<WeightMatrix> readTRANSFACFreqMatrices(String wmfile, String version) throws IOException {
int[] indices = { 'A', 'C', 'G', 'T' };
LinkedList<WeightMatrix> matrices = new LinkedList<WeightMatrix>();
BufferedReader br = new BufferedReader(new FileReader(new File(wmfile)));
String line;
WeightMatrix matrix = null;
int motifCount = 0;
Vector<float[]> arrays = new Vector<float[]>();
// Read in Transfac format first
Species currentSpecies = null;
String name = null, id = null, accession = null;
Pattern speciesPattern = Pattern.compile(".*Species:.*, (.*)\\.");
while ((line = br.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
String[] pieces = line.split("\\s+");
if (pieces[0].equals("AC")) {
accession = pieces[1];
// System.err.println("read AC " + accession);
}
else if (pieces[0].equals("NA")) {
name = pieces[1];
// System.err.println("read NA " + name);
}
else if (pieces[0].equals("ID")) {
id = pieces[1];
// System.err.println("read ID " + id);
arrays.clear();
name = null;
accession = null;
currentSpecies = null;
}
else if (pieces[0].equals("BF") && currentSpecies == null) {
Matcher matcher = speciesPattern.matcher(line);
if (matcher.matches()) {
String specname = matcher.group(1);
try {
currentSpecies = new Species(specname);
// System.err.println("Got species " + specname);
}
catch (NotFoundException e) {
System.err.println("Couldn't find species " + specname);
// ignore it and move on
}
}
}
else if (pieces[0].equals("DE")) {
name = pieces[1];
if (pieces.length >= 3) {
String v_string = pieces[2];
if (pieces.length >= 4) {
for (int v = 3; v < pieces.length; v++) {
v_string = v_string + "," + pieces[v];
}
}
if (version != null) {
version = v_string + "," + version;
}
else {
version = v_string;
}
}
// initialize id and accession if they're still null
if (id == null) {
id = "";
}
if (accession == null) {
accession = "";
}
}
else if (pieces[0].equals("XX")) {
if (name != null && accession != null && id != null && arrays.size() > 0) {
matrix = new WeightMatrix(arrays.size());
for (int i = 0; i < arrays.size(); i++) {
matrix.matrix[i]['A'] = arrays.get(i)[0];
matrix.matrix[i]['C'] = arrays.get(i)[1];
matrix.matrix[i]['G'] = arrays.get(i)[2];
matrix.matrix[i]['T'] = arrays.get(i)[3];
}
matrix.name = name;
matrix.version = version;
if (id.length() > 0) {
matrix.version = matrix.version + " " + id;
}
if (accession.length() > 0) {
matrix.version = matrix.version + " " + accession;
}
// System.err.println("read version " + matrix.version);
if (currentSpecies != null) {
matrix.speciesid = currentSpecies.getDBID();
matrix.species = currentSpecies.getName();
}
matrix.type = "TRANSFAC";
matrix.normalizeFrequencies();
matrices.add(matrix);
// clean up to prepare to parse next pwm
arrays.clear();
name = null;
id = null;
accession = null;
currentSpecies = null;
}
else {
// System.err.println(String.format("name %s id %s species %s",name,id,currentSpecies
// == null ? "null" : currentSpecies.toString()));
}
}
else if (name != null && (pieces.length == 5 || pieces.length == 6) && Character.isDigit(pieces[0].charAt(0))) {
// Load the matrix
float[] fa = new float[4];
// System.err.println(" adding matrix line");
for (int i = 1; i <= 4; i++) {
fa[i - 1] = Float.parseFloat(pieces[i]);
}
arrays.add(fa);
}
}
}
br.close();
return matrices;
}
/**
* Reads a matrix from a text-file. The file is assumed to have one sequence
* per line, and all lines must be the same length. The WeightMatrix returned
* has the frequencies of the bases at each position given.
*
* @param fname
* @return
* @throws IOException
* @throws ParseException
*/
public static WeightMatrix readAlignedSequenceMatrix(String fname) throws IOException, ParseException {
LinkedList<String> strings = new LinkedList<String>();
String line;
BufferedReader br = new BufferedReader(new FileReader(new File(fname)));
while ((line = br.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
strings.addLast(line);
}
}
br.close();
return WeightMatrixImport.buildAlignedSequenceMatrix(strings);
}
/*
* reads a weight matrix from a file. Takes the filename as input. The file
* must contain the log-odds matrix part of the TAMO output. Anythine else is
* ignored. Returns a WeightMatrix filled in with whatever information is
* available.
*/
public static WeightMatrix readTamoMatrix(String fname) throws ParseException, FileNotFoundException, IOException {
File wmfile = new File(fname);
if (!wmfile.exists()) {
throw new FileNotFoundException("Can't find file " + fname);
}
BufferedReader reader = new BufferedReader(new FileReader(wmfile));
String line = reader.readLine();
if (line == null || line.length() < 1) {
throw new IOException("Got a null or empty line in " + fname + " when looking to skip header");
}
else {
System.err.println("READ " + line);
}
while (line.charAt(0) != '#') {
line = reader.readLine();
System.err.println("READ " + line);
}
String[] positions = line.split("\\s+");
Integer length = Integer.parseInt(positions[positions.length - 1]) + 1;
System.err.println("Length is " + length);
WeightMatrix results = new WeightMatrix(length);
for (int i = 0; i <= 3; i++) {
line = reader.readLine();
if (line == null) {
throw new IOException("Got a null line in " + fname + " when looking for matrix line " + i);
}
int index = -1;
String[] values = line.split("\\s+");
if (values[0].equals("#A")) {
index = 'A';
}
else if (values[0].equals("#C")) {
index = 'C';
}
else if (values[0].equals("#T")) {
index = 'T';
}
else if (values[0].equals("#G")) {
index = 'G';
}
else {
throw new ParseException("Can't parse line " + line, 0);
}
for (int j = 1; j < values.length; j++) {
results.matrix[j - 1][index] = Float.parseFloat(values[j]);
}
}
return results;
}
/*
* reads a weight matrix from a file. Takes the filename as input. The file
* must contain the log-odds matrix part of the MEMO output. Anythine else is
* ignored. Returns a WeightMatrix filled in with whatever information is
* available.
*/
public static WeightMatrix readMemeMatrix(String fname) throws ParseException, FileNotFoundException, IOException {
File wmfile = new File(fname);
if (!wmfile.exists()) {
throw new FileNotFoundException("Can't find file " + fname);
}
BufferedReader reader = new BufferedReader(new FileReader(wmfile));
String line = reader.readLine();
int lineno = 1;
while (!line.matches(".*log-odds matrix.*")) {
line = reader.readLine();
lineno++;
}
line = line.replaceFirst("^.*w=\\s*", "");
line = line.replaceFirst("\\s*n=.*", "");
int length = Integer.parseInt(line);
WeightMatrix results = new WeightMatrix(length);
for (int i = 0; i < length; i++) {
line = reader.readLine().replaceFirst("^\\s*", "");
lineno++;
try {
String[] pieces = line.split("\\s+");
results.matrix[i]['A'] = Float.parseFloat(pieces[0]) / (float) 100.0;
results.matrix[i]['C'] = Float.parseFloat(pieces[1]) / (float) 100.0;
results.matrix[i]['G'] = Float.parseFloat(pieces[2]) / (float) 100.0;
results.matrix[i]['T'] = Float.parseFloat(pieces[3]) / (float) 100.0;
}
catch (NumberFormatException ex) {
System.err.println("At line " + lineno + ": " + line);
ex.printStackTrace();
throw ex;
}
catch (ArrayIndexOutOfBoundsException ex) {
System.err.println("At line " + lineno + ": " + line);
ex.printStackTrace();
throw ex;
}
}
return results;
}
public static List<WeightMatrix> readGimmeMotifsMatrices(String fname, String version, String type) throws FileNotFoundException, IOException {
File wmfile = new File(fname);
if (!wmfile.exists()) {
throw new FileNotFoundException("Can't find file " + fname);
}
BufferedReader reader = new BufferedReader(new FileReader(wmfile));
String line = null;
int lineno = 1;
List<WeightMatrix> output = new ArrayList<WeightMatrix>();
ArrayList<String> lines = new ArrayList<String>();
String name = null;
while ((line = reader.readLine()) != null) {
if (line.matches("^>.*")) {
if (lines.size() > 0 && name != null) {
WeightMatrix m = parseGimmeMotifsFromLines(lines);
System.err.println("Parsed " +name);
m.name = name;
m.version = version;
m.type = type;
output.add(m);
}
name = line.replaceAll(">","");
lines.clear();
} else {
lines.add(line);
}
}
if (lines.size() > 0 && name != null) {
WeightMatrix m = parseGimmeMotifsFromLines(lines);
m.name = name;
m.version = version;
m.type = type;
output.add(m);
}
reader.close();
return output;
}
public static WeightMatrix parseGimmeMotifsFromLines(List<String> lines) {
WeightMatrix m = new WeightMatrix(lines.size());
for (int i = 0; i < lines.size(); i++) {
String line[] = lines.get(i).split("\\s+");
m.matrix[i]['A'] = Float.parseFloat(line[0]);
m.matrix[i]['C'] = Float.parseFloat(line[1]);
m.matrix[i]['G'] = Float.parseFloat(line[2]);
m.matrix[i]['T'] = Float.parseFloat(line[3]);
}
return m;
}
public static void main(String[] args) {
// int length = 6;
// String aLine = " a 1.0000 0.9487 0.5641 0.0000 0.0000 0.7949";
// String cLine = " c 0.0000 0.0513 0.2051 0.3590 1.0000 0.0000";
// String gLine = " g 0.0000 0.0000 0.1538 0.2436 0.0000 0.0641";
// String tLine = " t 0.0000 0.0000 0.0769 0.3974 0.0000 0.1410";
String[] rows = new String[] {
"01 17 22 17 43",
"02 12 41 35 12",
"03 34 9 56 0",
"04 11 48 33 9",
"05 6 80 8 6",
"06 4 39 57 0",
"07 91 0 3 6",
"08 0 0 0 100",
"09 0 0 0 100",
"10 85 0 6 8",
"11 45 13 16 26",
"12 7 37 49 6",
"13 18 24 35 24",
"14 33 33 33 0",
"15 25 25 25 25",
"16 17 17 22 43"
};
try {
// PWMParser.parsePWM(length, aLine, cLine, gLine, tLine);
DoubleMatrix2D rawPWM = PWMParser.parsePWM(rows);
System.out.println(rawPWM.toString());
}
catch (IOException ioex) {
logger.fatal(ioex);
}
catch (ParseException pex) {
logger.fatal(pex);
}
}
}