/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.amt;
import java.io.*;
import java.util.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.text.ParseException;
import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.filehandler.TabLoader;
import org.fhcrc.cpl.toolbox.statistics.MatrixUtil;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.fhcrc.cpl.toolbox.proteomics.PeptideGenerator;
import org.fhcrc.cpl.toolbox.gui.chart.ScatterPlotDialog;
/**
* A class for writing AMT TSV files.
*/
public class AmtTsvReader
{
static Logger _log = Logger.getLogger(AmtTsvReader.class);
protected static DateFormat dateParser =
new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
/**
* Read an AMT database from a tsv file
* @param file
* @throws IOException
*/
public static AmtDatabase read(File file) throws IOException
{
TabLoader tabLoader = new TabLoader(file, AmtTsvRow.class);
AmtTsvRow[] tsvRows = (AmtTsvRow[]) tabLoader.load();
Map<Integer, List<AmtTsvRow>> runNumberTsvRowMap =
new HashMap<Integer, List<AmtTsvRow>>();
//should we use time values, or scan numbers? Times are preferable,
//if available
boolean useTimesInsteadOfScans = true;
//figure out the hydrophobicity algorithm and version that was used
String hydroAlgorithmString =
(String) tabLoader.getComments().get("HydrophobicityAlgorithm");
String[] hydroAlgPieces = hydroAlgorithmString.split(",");
String hydroAlgorithmName = hydroAlgPieces[0].toLowerCase();
double hydroAlgorithmVersion = Double.parseDouble(hydroAlgPieces[1]);
//check that the specified hydrophobicity algorithm and version
//are known to us. If not, can't normalize, die.
if (!HydrophobicityNormalizer.algorithmVersionKnown(hydroAlgorithmName,
hydroAlgorithmVersion))
throw new RuntimeException("Unknown hydrophobicity algorithm and version: " + hydroAlgorithmName + ", version " + hydroAlgorithmVersion);
//first get all the hydrophobicity values and times/scans for all the rows
for (AmtTsvRow tsvRow : tsvRows)
{
//normalize hydrophobicity
tsvRow.setH(HydrophobicityNormalizer.normalize(tsvRow.getH(),
hydroAlgorithmName,hydroAlgorithmVersion));
//if any row doesn't have retention time, use scan instead
//TODO: for some reason, tabloader sets time to 0 if it's not specified.
//TODO: that isn't all that cool. Rows with time 0 will throw everything off
if (tsvRow.getRetentiontime() <= 0)
useTimesInsteadOfScans = false;
//strip tryptic ends from all rows
tsvRow.setPeptide(stripTrypticEnds(tsvRow.getPeptide()));
//Associate the run numbers with lists of rows
if (!runNumberTsvRowMap.containsKey(tsvRow.getRun()))
{
runNumberTsvRowMap.put(tsvRow.getRun(), new ArrayList<AmtTsvRow>());
}
runNumberTsvRowMap.get(tsvRow.getRun()).add(tsvRow);
}
//System.err.println("***read 2, rows:" + tsvRows.length);
//map run numbers to runs, so we can get at them easily later
Map<Integer, AmtRunEntry> runNumberRunMap = new HashMap<Integer, AmtRunEntry>();
//map variable mod symbols to their associated modifications, within each run
Map<AmtRunEntry, Map<String, MS2Modification>> runSymbolModMap =
new HashMap<AmtRunEntry, Map<String, MS2Modification>>();
//create the actual run entries
for (int runNumber : runNumberTsvRowMap.keySet())
{
AmtRunEntry runEntry = createRunEntry(runNumberTsvRowMap.get(runNumber),
(String) tabLoader.getComments().get("Run" + runNumber),
useTimesInsteadOfScans);
runNumberRunMap.put(runNumber, runEntry);
}
//System.err.println("***read 3, runs: " + runSymbolModMap.keySet().size());
AmtDatabase result = new AmtDatabase();
result.setHydrophobicityAlgorithmName(hydroAlgorithmName);
result.setHydrophobicityAlgorithmVersion(hydroAlgorithmVersion);
//addRunEntry implicitly overrides duplicate modifications found in the run
for (AmtRunEntry runEntry : runNumberRunMap.values())
result.addRunEntry(runEntry);
//map the symbols of modifications to their modifications
for (AmtRunEntry runEntry : runNumberRunMap.values())
{
Map<String, MS2Modification> thisRunSymbolModMap =
new HashMap<String, MS2Modification>();
for (MS2Modification mod : runEntry.getModifications())
{
if (mod.getVariable())
thisRunSymbolModMap.put(mod.getSymbol(), mod);
//System.err.println(" adding mod for char " + mod.getSymbol());
}
runSymbolModMap.put(runEntry, thisRunSymbolModMap);
}
//add the individual observations
for (AmtTsvRow tsvRow : tsvRows)
{
String strippedPeptide = stripPeptide(tsvRow.getPeptide());
AmtRunEntry run = runNumberRunMap.get(tsvRow.getRun());
List<MS2Modification>[] modifications =
identifyAllModifications(strippedPeptide, tsvRow.getPeptide(), run,
runSymbolModMap.get(run));
result.addObservation(strippedPeptide, modifications, 0,
run.convertTimeToHydrophobicity(useTimesInsteadOfScans ?
tsvRow.getRetentiontime() : tsvRow.getScan()), run,
AmtPeptideEntry.AmtPeptideObservation.SPECTRAL_COUNT_UNKNOWN,
useTimesInsteadOfScans ?
tsvRow.getRetentiontime() : tsvRow.getScan());
}
//testing code
double[] origXValues = new double[tsvRows.length];
double[] origYValues = new double[tsvRows.length];
for (int i=0; i<tsvRows.length; i++)
{
origXValues[i] = tsvRows[i].getH();
origYValues[i] = tsvRows[i].getScan();
}
List<Double> dbXValuesList = new ArrayList<Double>();
List<Double> dbYValuesList = new ArrayList<Double>();
for (AmtPeptideEntry entry : result.getEntries())
for (AmtPeptideEntry.AmtPeptideObservation obs : entry.getObservations())
{
dbXValuesList.add(entry.getPredictedHydrophobicity());
dbYValuesList.add(obs.getTimeInRun());
// dbYValuesList.add(obs.getRunEntry().recoverTimeForHydrophobicity(obs.getObservedHydrophobicity()));
}
double[] dbXValues = new double[dbXValuesList.size()];
double[] dbYValues = new double[dbXValuesList.size()];
for (int i=0; i<dbXValues.length; i++)
{
dbXValues[i] = dbXValuesList.get(i);
dbYValues[i] = dbYValuesList.get(i);
}
ScatterPlotDialog chartDialog = new ScatterPlotDialog(origXValues, origYValues, "from tsv");
chartDialog.addData(dbXValues, dbYValues, "from db");
chartDialog.setVisible(true);
return result;
}
/**
* Strip tryptic ends off a raw peptide string
* @param peptideString
* @return
*/
protected static String stripTrypticEnds(String peptideString)
{
if (peptideString.charAt(1) == '.' &&
peptideString.charAt(peptideString.length()-2) == '.')
{
peptideString = peptideString.substring(2, peptideString.length()-2);
}
return peptideString;
}
/**
* Return a version of the peptide string that contains only the residues
* @param peptideWithMods
* @return
*/
protected static String stripPeptide(String peptideWithMods)
{
StringBuffer resultBuf = new StringBuffer();
for (byte charByte : peptideWithMods.getBytes())
{
char thisChar = (char) charByte;
if (Character.isLetter(thisChar))
resultBuf.append(thisChar);
}
return resultBuf.toString();
}
/**
* Figure out what modifications exist in this peptide string
* @param strippedPeptide
* @param peptideWithMods
* @param run
* @param thisRunSymbolModMap
* @return
*/
protected static List<MS2Modification>[]
identifyAllModifications(String strippedPeptide,
String peptideWithMods,
AmtRunEntry run,
Map<String, MS2Modification> thisRunSymbolModMap)
{
//System.err.println("identifyAllMods, pep=" + strippedPeptide + ", wmods=" + peptideWithMods);
List<MS2Modification>[] result =
(List<MS2Modification>[]) new List[strippedPeptide.length()];
int peptideSequenceIndex = -1;
for (byte charByte : peptideWithMods.getBytes())
{
String thisChar = Character.toString((char) charByte);
if (Character.isLetter(thisChar.charAt(0)))
{
peptideSequenceIndex++;
MS2Modification mod = run.getStaticMod(thisChar);
if (mod != null)
{
if (result[peptideSequenceIndex] == null)
result[peptideSequenceIndex] = new ArrayList<MS2Modification>();
result[peptideSequenceIndex].add(mod);
}
}
else
{
if (result[peptideSequenceIndex] == null)
result[peptideSequenceIndex] = new ArrayList<MS2Modification>();
MS2Modification mod = thisRunSymbolModMap.get(thisChar);
if (mod == null)
throw new RuntimeException("AmtTsvReader: row contained a mod with character " + thisChar + ", which is not defined at run level");
result[peptideSequenceIndex].add(mod);
}
}
return result;
}
/**
* Create a run entry from a comment row in the tsv file
* @param amtTsvRowList
* @param commentLineForRun
* @param useTimesInsteadOfScans
* @return
*/
protected static AmtRunEntry createRunEntry(List<AmtTsvRow> amtTsvRowList,
String commentLineForRun,
boolean useTimesInsteadOfScans)
{
double[] elutionValues = new double[amtTsvRowList.size()];
double[] hydrophobicities = new double[amtTsvRowList.size()];
for (int i=0; i<amtTsvRowList.size(); i++)
{
AmtTsvRow tsvRow = amtTsvRowList.get(i);
hydrophobicities[i] = tsvRow.getH();
elutionValues[i] = useTimesInsteadOfScans? tsvRow.getRetentiontime() :
tsvRow.getScan();
}
//System.err.println("useTimes? " + useTimesInsteadOfScans);
double[] timeToHydroArray =
MatrixUtil.linearRegression(elutionValues, hydrophobicities);
//#Run92=Desktop/haloICAT2_30 (mascot)|haloICAT2_30.pep.xml|2006-07-09 08:37:56.234|C'=442.225;C"=450.275
String[] fields = commentLineForRun.split("\\|");
//TODO: do something with description
String description = fields[0];
String pepXmlFilename = fields[1];
String dateString = fields[2];
String modificationsString = fields[3];
//modifications
String[] modificationStrings = modificationsString.split(";");
MS2Modification[] modifications = null;
if (modificationStrings != null && modificationStrings.length>0 &&
!("".equals(modificationStrings[0])))
{
modifications = new MS2Modification[modificationStrings.length];
for (int i=0; i<modificationStrings.length; i++)
{
String modString = modificationStrings[i];
String residueWithPossibleMod =
modString.substring(0, modString.indexOf('='));
String residue = residueWithPossibleMod.substring(0,1);
//System.err.println("mod**** " + modString + "***" + residueWithPossibleMod + "***" + residue);
boolean variable = false;
String symbol = null;
if (residueWithPossibleMod.length() > 1)
{
variable = true;
symbol = residueWithPossibleMod.substring(1,2);
}
double mass = Double.parseDouble(modString.substring(modString.indexOf('=') + 1));
MS2Modification mod = new MS2Modification();
mod.setAminoAcid(residue);
mod.setMass((float) mass);
mod.setMassDiff((float) (mass -
PeptideGenerator.AMINO_ACID_MONOISOTOPIC_MASSES[residue.charAt(0)]));
mod.setVariable(variable);
mod.setSymbol(symbol);
//System.err.println("mod: " + mod.getAminoAcid() + ", " + mod.getMassDiff() + ", var?"+mod.getVariable() + ", sym="+mod.getSymbol());
modifications[i] = mod;
}
}
Date date = new Date();
try
{
dateParser.parse(dateString);
}
catch (ParseException e)
{
_log.error("error parsing date format (" + dateString +
") for TSV run, using current date");
}
//slope, int, slope, int
AmtRunEntry run =new AmtRunEntry(timeToHydroArray, modifications, date);
run.setPepXmlFilename(pepXmlFilename);
return run;
}
/**
* Used by TabLoader to represent one row. Rows have the format:
* run fraction scan retentiontime peptide h
*/
protected static class AmtTsvRow
{
protected int run;
protected int fraction;
protected int scan;
protected double retentiontime;
protected String peptide;
protected double h;
public AmtTsvRow()
{
run=-1; fraction=-1; scan=-1; retentiontime=-1;
peptide="";
h=-1;
}
public int getRun()
{
return run;
}
public void setRun(int run)
{
this.run = run;
}
public int getFraction()
{
return fraction;
}
public void setFraction(int fraction)
{
this.fraction = fraction;
}
public int getScan()
{
return scan;
}
public void setScan(int scan)
{
this.scan = scan;
}
public double getRetentiontime()
{
return retentiontime;
}
public void setRetentiontime(double retentiontime)
{
this.retentiontime = retentiontime;
}
public String getPeptide()
{
return peptide;
}
public void setPeptide(String peptide)
{
this.peptide = peptide;
}
public double getH()
{
return h;
}
public void setH(double h)
{
this.h = h;
}
}
}