/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.amt;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureClusterer;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.ProteomicsRegressionUtilities;
import org.fhcrc.cpl.toolbox.statistics.MatrixUtil;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.proteomics.*;
import org.apache.log4j.Logger;
import java.util.*;
/**
* Utilities related to AMT
*/
public class AmtUtilities
{
private static Logger _log = Logger.getLogger(AmtUtilities.class);
//name and version of the hydrophobicity algorithm currently used.
//Values should be stored in normalized form, but it's still important
//to keep track of what version of the algorithm they came from.
public static final String HYDROPHOBICITY_ALGORITHM_NAME = "krokhin";
public static final double HYDROPHOBICITY_ALGORITHM_VERSION = 3.0;
//the search_engine string to use for pepXML files that contain AMT results
public static final String AMT_SEARCH_ENGINE_CODE_FOR_PEPXML = "msInspect/AMT";
public static void recordHydrophobicities(FeatureSet featureSet,
Map<String,Double> regressionLine,
int scanOrTimeMode)
{
for (Feature feature : featureSet.getFeatures())
AmtExtraInfoDef.setObservedHydrophobicity(feature,
RegressionUtilities.predictYFromX(getSlopeFromRegressionLine(regressionLine),
getInterceptFromRegressionLine(regressionLine),
(scanOrTimeMode== ProteomicsRegressionUtilities.REGRESSION_MODE_SCAN ?
feature.getScan() :
feature.getTime())));
}
public static void recordMS1Hydrophobicities(FeatureSet featureSet,
Map<String,Double> regressionLine,
int scanOrTimeMode)
{
recordHydrophobicities(featureSet, regressionLine, scanOrTimeMode);
}
/**
* Pick numProteins proteins at "random" from a specified list of proteins.
* It would sure be nice if we didn't have to load all the proteins in order to
* pick a random sample, but since FastaLoader just provides an iterator, this is
* about all we've got
* @param fullProteinArray
* @param numProteinsToLoad
* @return
*/
public static ArrayList<Protein> pickRandomProteins(ArrayList<Protein> fullProteinArray,
int numProteinsToLoad)
{
//cowardly refuse to pull out a random subset larger than the original set
int fullProteinArraySize = fullProteinArray.size();
if (fullProteinArraySize < numProteinsToLoad)
return null;
ArrayList<Protein> randomProteinArray = new ArrayList<Protein>(numProteinsToLoad);
Random random = new Random();
int[] chosenIndexes = new int[numProteinsToLoad];
Arrays.fill(chosenIndexes,-1);
for (int i=0; i<numProteinsToLoad; i++)
{
boolean foundOne = false;
int nextIndex = -1;
//prevent duplicate proteins
while (!foundOne)
{
nextIndex = random.nextInt(fullProteinArraySize);
if (Arrays.binarySearch(chosenIndexes,nextIndex) < 0)
foundOne = true;
}
randomProteinArray.add(fullProteinArray.get(nextIndex));
}
return randomProteinArray;
}
public static double[] getTimesForSortedScanArray(MSRun run,
int[] sortedScanArray)
{
double[] result = new double[sortedScanArray.length];
if (run == null)
{
_log.error("Null run, can't calculate times for scans");
return null;
}
int ms2IndexInRun = 0;
MSRun.MSScan[] ms2Scans = run.getMS2Scans();
for (int i = 0; i < sortedScanArray.length; i++)
{
int thisFeatureScanNum = sortedScanArray[i];
//this relies on the fact that featuresForRegression is sorted by scan
MSRun.MSScan thisFeatureScan = null;
while (true)
{
if (ms2IndexInRun >= ms2Scans.length)
{
//This will throw a NullPointerException down below when we
//try to access the retention time.
break;
}
int evalScanNum = ms2Scans[ms2IndexInRun].getNum();
if (evalScanNum == thisFeatureScanNum)
{
thisFeatureScan = ms2Scans[ms2IndexInRun];
break;
}
else if (evalScanNum > thisFeatureScanNum)
{
_log.error("Error! Read scan number " + evalScanNum + " while attempting to find scan number " + thisFeatureScanNum + ".");
_log.error("I.e., scan number " + thisFeatureScanNum + " was not found");
_log.error("Probably the wrong mzXml file was provided for pepXml file.");
break;
}
else
ms2IndexInRun++;
}
assert(thisFeatureScan != null);
result[i] = thisFeatureScan.getDoubleRetentionTime();
}
return result;
}
/**
* Workhorse method for relating time to hydrophobicity using linear regression,
* can be used to relate time in terms of hydrophobicity or vice versa.
*
* optionally use robust regression, which takes a lot longer
*
* @return a hashmap containing three elements, whose keys are defined in constants: the slope,
* intercept and sigma of the regression
*
* Note: this regression predicts hydrophobicity in terms of time. This relationship is
* not reciprocal, due to all sorts of arcane weirdness related to linear regression.
*/
public static Map<String,Double> calculateTimeHydroRelationshipEitherWay(
Feature[] featuresForRegression, int scanOrTimeMode,
boolean predictTimeFromHydro,
boolean robustRegression)
{
Map<String,Double> result = new HashMap<String,Double>();
//clone because we're sorting
Feature[] featuresForRegressionClone =
featuresForRegression.clone();
if (featuresForRegressionClone.length == 0)
{
throw new RuntimeException("AmtUtilities.calculateTimeHydroRelationshipEitherWay: " +
"regression failed, no eligible features to use for regression.");
}
Arrays.sort(featuresForRegressionClone, new Feature.ScanAscComparator());
double[] hydrophobicities = new double[featuresForRegressionClone.length];
double[] scansOrTimes = new double[hydrophobicities.length];
for (int i=0; i<featuresForRegressionClone.length; i++)
{
Feature feature = featuresForRegressionClone[i];
hydrophobicities[i] = calculateNormalizedHydrophobicity(
MS2ExtraInfoDef.getFirstPeptide(feature));
scansOrTimes[i] =
(scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME) ?
feature.getTime() : feature.getScan();
}
try
{
double regressionLineSlope, regressionLineIntercept, sigma;
double[] regressionResult;
if (robustRegression)
regressionResult = RegressionUtilities.robustRegression(
predictTimeFromHydro ? hydrophobicities : scansOrTimes,
predictTimeFromHydro ? scansOrTimes : hydrophobicities);
else
regressionResult = MatrixUtil.linearRegression(
predictTimeFromHydro? hydrophobicities : scansOrTimes,
predictTimeFromHydro? scansOrTimes : hydrophobicities);
regressionLineSlope = regressionResult[1];
regressionLineIntercept = regressionResult[0];
//not currently using sigma for anything, but I have a hunch it'll be crucial
sigma = MatrixUtil.sigma(hydrophobicities, scansOrTimes, regressionResult);
result.put(RegressionUtilities.REGRESSION_SLOPE_KEY, regressionLineSlope);
result.put(RegressionUtilities.REGRESSION_INTERCEPT_KEY, regressionLineIntercept);
result.put(RegressionUtilities.REGRESSION_SIGMA_KEY, sigma);
}
catch (Exception e)
{
_log.error("Failure to calculate hydrophobicity->time prediction",e);
}
return result;
}
/**
* @return a HashMap containing three elements, whose keys are defined in constants: the slope,
* intercept and sigma of the regression
*
* Note: this regression predicts time in terms of hydrophobicity. This relationship is
* not reciprocal, due to all sorts of arcane weirdness related to linear regression.
*/
public static Map<String,Double> calculateScanOrTimeHydrophobicityRelationship(
Feature[] featuresForRegression, int scanOrTimeMode,
boolean robustRegression)
{
return calculateTimeHydroRelationshipEitherWay(featuresForRegression, scanOrTimeMode,
false,
robustRegression);
}
/**
* @return a HashMap containing three elements, whose keys are defined in constants: the slope,
* intercept and sigma of the regression
*
* Note: this regression predicts hydrophobicity in terms of time. This relationship is
* not reciprocal, due to all sorts of arcane weirdness related to linear regression.
*/
public static Map<String,Double> calculateHydrophobicityScanOrTimeRelationship(
Feature[] featuresForRegression, int scanOrTimeMode,
boolean robustRegression)
{
return calculateTimeHydroRelationshipEitherWay(featuresForRegression, scanOrTimeMode,
true,
robustRegression);
}
/**
* Given a peptide, and a line describing the relationship between hydrophobicity and
* scan number, calculate the hydrophobicity of the peptide and return the predicted scan
* for that hydrophobicity
* @param intercept
* @param peptide
* @return
*/
protected static double predictScanOrTime(double slope, double intercept, Peptide peptide)
{
return predictScanOrTime(slope, intercept,
calculateNormalizedHydrophobicity(peptide));
}
/**
* Given a hydrophobicity, and a line describing the relationship between hydrophobicity and
* scan number, return the predicted scan for a given hydrophobicity.
* @param slope
* @param intercept
* @param hydrophobicity
* @return
*/
public static double predictScanOrTime(double slope, double intercept, double hydrophobicity)
{
double predictedScanOrTime = RegressionUtilities.predictYFromX(slope, intercept, hydrophobicity);
predictedScanOrTime = Math.max(predictedScanOrTime,0);
return predictedScanOrTime;
}
public static double getSlopeFromRegressionLine(Map<String,Double> regressionLine)
{
return regressionLine.get(RegressionUtilities.REGRESSION_SLOPE_KEY);
}
public static double getInterceptFromRegressionLine(Map<String,Double> regressionLine)
{
return regressionLine.get(RegressionUtilities.REGRESSION_INTERCEPT_KEY);
}
/**
* Predict the scan or time of a feature (depending on what the regression line represents)
* @param regressionLine
* @param feature
* @return
*/
public static double predictScanOrTime(Map<String,Double> regressionLine,
Feature feature)
{
double slope = getSlopeFromRegressionLine(regressionLine);
double intercept = getInterceptFromRegressionLine(regressionLine);
return predictScanOrTime(slope, intercept, feature);
}
/**
* Predict the scan or time of a feature (depending on what the regression parameters represent)
* @param feature
* @return
*/
public static double predictScanOrTime(double slope, double intercept, Feature feature)
{
//this is lame. Really peptide should have a constructor that doesn't require a protein
Protein fakeProtein =
new Protein("", MS2ExtraInfoDef.getFirstPeptide(feature).getBytes());
Peptide currentPeptide = new Peptide(fakeProtein,0,fakeProtein.getBytes().length);
//predict either the scan or the time, based on hydrophobicity.
//the result will depend on which mode we're in
return predictScanOrTime(slope, intercept, currentPeptide);
}
public static double convertDeltaScanOrTimeToHydro(Map<String,Double> regressionLineMap,
double scanOrTimeValueToConvert)
{
double slope = AmtUtilities.getSlopeFromRegressionLine(regressionLineMap);
double intercept = AmtUtilities.getInterceptFromRegressionLine(regressionLineMap);
return convertDeltaScanOrTimeToHydro(slope, intercept, scanOrTimeValueToConvert);
}
public static double convertDeltaScanOrTimeToHydro(double slope, double intercept,
double deltaScanOrTime)
{
return RegressionUtilities.predictYFromX(slope, intercept, deltaScanOrTime) -
RegressionUtilities.predictYFromX(slope, intercept, 0);
}
/**
* Create a peptide object from a peptide sequence. Useful for finding mass, etc.
* @param peptideSequence
* @return
*/
public static Peptide createPeptideFromSequence(String peptideSequence)
{
//this is lame. Really peptide should have a constructor that doesn't require a protein
Protein fakeProtein = new Protein("",peptideSequence.getBytes());
return new Peptide(fakeProtein,0,fakeProtein.getBytes().length);
}
//Methods related to calculation and normalization of peptide hydrophobicity.
//It's all about convenience.
public static double calculateRawHydrophobicity(String peptideSequence)
{
return calculateRawHydrophobicity(createPeptideFromSequence(peptideSequence));
}
/**
* Cover method for Peptide.getHydrophobicity / getHydrophobicity3.
* Use this to switch between versions of the hydrophobicity calculator
* @param peptide
* @return
*/
public static double calculateRawHydrophobicity(Peptide peptide)
{
//return peptide.getHydrophobicity();
return peptide.getHydrophobicity3();
}
public static double normalizeHydrophobicity(double inputHydrophobicity)
{
return HydrophobicityNormalizer.normalize(inputHydrophobicity,
HYDROPHOBICITY_ALGORITHM_NAME,
HYDROPHOBICITY_ALGORITHM_VERSION);
}
public static double calculateNormalizedHydrophobicity(Peptide peptide)
{
return normalizeHydrophobicity(calculateRawHydrophobicity(peptide));
}
public static double calculateNormalizedHydrophobicity(String peptideSequence)
{
return normalizeHydrophobicity(calculateRawHydrophobicity(peptideSequence));
}
/**
* Warning about this one: if there's more than one peptide associated with
* the feature, it'll take the first
* @param feature
* @return
*/
public static double calculateNormalizedHydrophobicity(Feature feature)
{
return calculateNormalizedHydrophobicity(MS2ExtraInfoDef.getFirstPeptide(feature));
}
/**
* Assumes features have times populated if necessary
*
* Given a line that should predict time from hydrophobicity, indicate by how much
* (in units of scans or seconds) each feature appears to deviate from that line
* @param features
* @param scanOrTimeMode
* @return
*/
public static void addHydrophobicityToFeatures(Feature[] features,
int scanOrTimeMode,
double[] timeToHCoefficients)
{
try
{
for (Feature feature : features)
{
AmtExtraInfoDef.setObservedHydrophobicity(feature,
RegressionUtilities.mapValueUsingCoefficients(timeToHCoefficients,
(scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME) ?
feature.getTime() : feature.getScan()));
}
}
catch (Exception e)
{
ApplicationContext.errorMessage("Error adding hydrophobicity observations to features",e);
}
}
/**
* Clusterable for clustering based on feature mz and hydrophobicity.
* Allows access to the original Feature
*/
public static class FeatureMzHydroClusterable
extends FeatureClusterer.FeatureClusterable
{
public FeatureMzHydroClusterable(Feature feature)
{
super(feature);
}
public double getDimension1Value()
{
return parentFeature.mz;
}
public double getDimension2Value()
{
return AmtExtraInfoDef.getObservedHydrophobicity(parentFeature);
}
}
/**
* Clusterable for clustering based on feature mass and hydrophobicity.
* Allows access to the original Feature
*/
public static class FeatureMassHydroClusterable
extends FeatureClusterer.FeatureClusterable
{
public FeatureMassHydroClusterable(Feature feature)
{
super(feature);
}
public double getDimension1Value()
{
return parentFeature.mass;
}
public double getDimension2Value()
{
return AmtExtraInfoDef.getObservedHydrophobicity(parentFeature);
}
}
}