/*
* The MIT License (MIT)
*
* Copyright (c) 2007-2015 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package org.broad.igv.tools.converters;
import org.apache.commons.math.stat.StatUtils;
import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import java.io.*;
import java.util.Arrays;
/**
* Scales and centers expression data for display in IGV.
* <p/>
* 1. take log2 of data
* 2. compute median and subtract from each log2 probe value (i.e. center on the median)
* 3. compute the MAD (mean absolute deviation) -- http://stat.ethz.ch/R-manual/R-devel/library/stats/html/mad.html
* 4. divide each log2 probe value by the MAD
*
* @author jrobinso
* @date Nov 5, 2010
*/
public class ExpressionFormatter {
private static Logger log = Logger.getLogger(ExpressionFormatter.class);
enum FileType {
GCT, RES, TAB
}
FileType type;
int dataStartColumn;
int probeColumn;
int descriptionColumn = -1;
int nPts;
/**
* Parse the file and output in ".igv" format
*
* @return
*/
public void convert(File inputFile, File outputFile) throws IOException {
convert(inputFile, outputFile, getType(inputFile));
}
void convert(File inputFile, File outputFile, FileType type) throws IOException {
setType(type);
BufferedReader reader = null;
PrintWriter writer = null;
try {
reader = new BufferedReader(new FileReader(inputFile));
writer = new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
String nextLine = null;
// Skip meta data. Note for GCT files this includes the mandatory first line
while ((nextLine = reader.readLine()).startsWith("#") && (nextLine != null)) {
writer.println(nextLine);
}
// This is the first non-meta line
writer.println(nextLine);
// for TAB and RES files the first row contains the column headings.
int nCols = 0;
if (type == FileType.TAB || type == FileType.RES) {
nCols = nextLine.split("\t").length;
}
if (type == FileType.GCT) {
// GCT files. Column headings are 3rd row (read next line)
nextLine = reader.readLine();
nCols = nextLine.split("\t").length;
writer.println(nextLine);
} else if (type == FileType.RES) {
// Res files -- skip lines 2 and 3
writer.println(reader.readLine());
writer.println(reader.readLine());
}
// Compute the # of data points
int columnSkip = 1;
if (type == FileType.RES) {
columnSkip = 2;
nCols++; // <= last call column of a res file is sometimes blank, if not this will get
}
nPts = (nCols - dataStartColumn) / columnSkip;
// Now for the data
while ((nextLine = reader.readLine()) != null) {
String[] tokens = nextLine.split("\t");
for (int i = 0; i < dataStartColumn; i++) {
writer.print(tokens[i] + "\t");
}
DataRow row = new DataRow(tokens, nextLine);
for (int i = 0; i < nPts; i++) {
if (Double.isNaN(row.scaledData[i])) {
writer.print("\t");
} else {
writer.print(row.scaledData[i]);
if (type == FileType.RES) {
writer.print("\t" + row.calls[i]);
}
if (i < nPts - 1) {
writer.print("\t");
}
}
}
writer.println();
}
} finally {
if (reader != null) {
reader.close();
}
if (writer != null) {
writer.close();
}
}
}
/**
* Represents a row of data from an expression file
*/
class DataRow {
private String probe;
private String description;
private double[] data;
private double[] scaledData;
private String[] calls;
private double median;
private double mad;
DataRow(String[] tokens, String line) {
double[] nonNullData = new double[nPts];
data = new double[nPts];
scaledData = new double[nPts];
Arrays.fill(data, Double.NaN);
Arrays.fill(scaledData, Double.NaN);
calls = new String[nPts];
Arrays.fill(calls, "");
probe = tokens[probeColumn];
if (descriptionColumn >= 0) {
description = tokens[descriptionColumn];
}
int skip = type == FileType.RES ? 2 : 1;
int nNonNull = 0;
for (int dataIdx = 0; dataIdx < nPts; dataIdx++) {
int i = dataStartColumn + dataIdx * skip;
if (tokens[i] != null) {
try {
data[dataIdx] = Double.parseDouble(tokens[i]);
if (data[dataIdx] < 0) {
throw new RuntimeException("Negative value detected in input file: " + line);
}
double v = Math.log(data[dataIdx]) / Globals.log2;
scaledData[dataIdx] = v;
nonNullData[nNonNull] = v;
nNonNull++;
} catch (NumberFormatException e) {
}
}
if (type == FileType.RES) {
calls[dataIdx] = tokens[i + 1].trim();
}
}
// Compute median of log values
median = StatUtils.percentile(nonNullData, 0, nNonNull, 50);
// Center data on median
nNonNull = 0;
for (int i = 0; i < scaledData.length; i++) {
if (!Double.isNaN(scaledData[i])) {
scaledData[i] -= median;
nonNullData[nNonNull] = scaledData[i];
nNonNull++;
}
}
// Compute modified MAD (mad based on median)
// TODO -- shouldn't this be zero?
//double mean = StatUtils.mean(nonNullData, 0, nNonNull);
// The median is zero (by definition now)
double[] deviations = new double[nNonNull];
for (int i = 0; i < nNonNull; i++) {
deviations[i] = Math.abs(nonNullData[i] - 0);
}
// MAD, as defined at http://stat.ethz.ch/R-manual/R-devel/library/stats/html/mad.html
mad = 1.4826 * StatUtils.percentile(deviations, 50);
// Scale data by MAD
for (int i = 0; i < scaledData.length; i++) {
if (!Double.isNaN(scaledData[i])) {
scaledData[i] /= mad;
}
}
}
}
private FileType getType(File inputFile) {
String fn = inputFile.getName().toLowerCase();
if (fn.endsWith(".txt") || fn.endsWith(".tab") || fn.endsWith(".xls") || fn.endsWith(".gz")) {
fn = fn.substring(0, fn.lastIndexOf("."));
}
if (fn.endsWith("res")) {
return FileType.RES;
} else if (fn.endsWith("gct")) {
return FileType.GCT;
} else if (fn.endsWith("tab")) {
return FileType.TAB;
} else {
throw new RuntimeException("Unknown file type: " + inputFile);
}
}
private void setType(FileType type) {
this.type = type;
descriptionColumn = -1; // Default - no description column
switch (type) {
case RES:
dataStartColumn = 2;
probeColumn = 1;
descriptionColumn = 0;
break;
case GCT:
dataStartColumn = 2;
probeColumn = 0;
descriptionColumn = 1;
break;
case TAB:
dataStartColumn = 1;
probeColumn = 0;
break;
}
}
public static void main(String[] args) throws IOException {
if (args.length < 2) {
System.out.println("Usage: java -jar ExpressionFormatter <inputFile> <outputFile>");
return;
} else {
File inputFile = new File(args[0]);
File outputFile = new File(args[1]);
(new ExpressionFormatter()).convert(inputFile, outputFile);
}
}
}