/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.amt;
import java.util.*;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.matching.ClusteringFeatureSetMatcher;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.MSRun;
import org.fhcrc.cpl.toolbox.proteomics.ProteomicsRegressionUtilities;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.proteomics.Peptide;
import org.fhcrc.cpl.toolbox.proteomics.MS2Modification;
import org.apache.log4j.Logger;
public class PeptideMatcher
{
private static Logger _log = Logger.getLogger(PeptideMatcher.class);
//number of times the standard deviation by which a peptide's intensity must differ from the
//protein's average intensity in order for it to be tossed out
//Not implemented
// protected static final double INTENSITY_OUTLIER_STD_DEV_MULTIPLIER = 1;
//Two different modes for time tolerance when matching peptides.
//Fixed: always use the same time tolerance
//Adaptive:
// For unobserved peptides, UNOBSERVED_PEPTIDE_GRADIENT_PERCENT_TOLERANCE percent of the gradient
// For peptides observed once before, deltaTime
// For peptides observed multiple times before, a multiple of the standard deviation of the observations
public static final int PEPTIDE_TIME_TOLERANCE_MODE_FIXED=0;
public static final int PEPTIDE_TIME_TOLERANCE_MODE_ADAPTIVE=1;
public static final int DEFAULT_PEPTIDE_TIME_TOLERANCE_MODE=PEPTIDE_TIME_TOLERANCE_MODE_FIXED;
public static final int UNOBSERVED_PEPTIDE_GRADIENT_PERCENT_TOLERANCE=5;
public static final Feature.MassAscComparator featureMassAscComp = new Feature.MassAscComparator();
/**
* Calculate the standard deviation of the absolute value of hydrophobicity differences in
* peptides identified between two pepxml files
* @param ms2Features1
* @param ms2Features2
* @param scanOrTimeMode
* @param run1
* @param run2
* @param minPeptideProphet
* @return
*/
public static double calculateHydroDiffStandardDeviation(FeatureSet ms2Features1,
FeatureSet ms2Features2,
int scanOrTimeMode,
MSRun run1, MSRun run2,
double minPeptideProphet,
boolean robustRegression)
{
double[] hydrophobicityDifferences =
calculateHydrophobicityDifferences(ms2Features1, ms2Features2, scanOrTimeMode,
run1, run2, minPeptideProphet,
robustRegression);
//for (double diff : hydrophobicityDifferences) System.err.println(" " + diff);
return BasicStatistics.standardDeviation(hydrophobicityDifferences);
}
/**
* Calculate the median of the absolute value of hydrophobicity differences in
* peptides identified between two pepxml files
* @param ms2Features1
* @param ms2Features2
* @param scanOrTimeMode
* @param run1
* @param run2
* @param minPeptideProphet
* @return
*/
public static double calculateHydroDiffMedian(FeatureSet ms2Features1,
FeatureSet ms2Features2,
int scanOrTimeMode,
MSRun run1, MSRun run2,
double minPeptideProphet,
boolean robustRegression)
{
double[] hydrophobicityDifferences =
calculateHydrophobicityDifferences(ms2Features1, ms2Features2, scanOrTimeMode,
run1, run2, minPeptideProphet, robustRegression);
return BasicStatistics.median(hydrophobicityDifferences);
}
/**
* return an array of the absolute values of differences between hydrophobicities of
* peptides observed in both runs
* @param ms2Features1
* @param ms2Features2
* @param scanOrTimeMode
* @param run1
* @param run2
* @param minPeptideProphet
* @return
*/
public static double[] calculateHydrophobicityDifferences(FeatureSet ms2Features1,
FeatureSet ms2Features2,
int scanOrTimeMode,
MSRun run1, MSRun run2,
double minPeptideProphet,
boolean robustRegression)
{
FeatureSet.FeatureSelector sel = new FeatureSet.FeatureSelector();
sel.setMinPProphet((float)minPeptideProphet);
ms2Features1 = ms2Features1.filter(sel);
ms2Features2 = ms2Features2.filter(sel);
ms2Features1.populateTimesForMS2Features(run1);
ms2Features2.populateTimesForMS2Features(run2);
//Calculate the time-hydro relationships for each run
Map<String, Double> lineMap =
AmtUtilities.calculateScanOrTimeHydrophobicityRelationship(ms2Features1.getFeatures(),
scanOrTimeMode, robustRegression);
double slope1 = lineMap.get(RegressionUtilities.REGRESSION_SLOPE_KEY);
double intercept1 = lineMap.get(RegressionUtilities.REGRESSION_INTERCEPT_KEY);
lineMap = AmtUtilities.calculateScanOrTimeHydrophobicityRelationship(ms2Features2.getFeatures(),
scanOrTimeMode, robustRegression);
double slope2 = lineMap.get(RegressionUtilities.REGRESSION_SLOPE_KEY);
double intercept2 = lineMap.get(RegressionUtilities.REGRESSION_INTERCEPT_KEY);
//Keep track of the peptides found in set 1 and their features
HashMap<String, Feature> peptidesFromFeatureSet1 = new HashMap<String, Feature>();
for (Feature set1Feature : ms2Features1.getFeatures())
peptidesFromFeatureSet1.put(MS2ExtraInfoDef.getFirstPeptide(set1Feature), set1Feature);
ArrayList<Double> hydroDifferencesList = new ArrayList<Double>();
//For features in the two sets with the same peptide, calculate their hydrophobicities
//and store the difference between them
for (Feature set2Feature : ms2Features2.getFeatures())
{
if (peptidesFromFeatureSet1.containsKey(MS2ExtraInfoDef.getFirstPeptide(set2Feature)))
{
//get time/scan for set 1 feature
double time1;
Feature set1Feature = peptidesFromFeatureSet1.get(MS2ExtraInfoDef.getFirstPeptide(set2Feature));
if (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_SCAN)
time1 = set1Feature.getScan();
else
time1 = run1.getMS2Scans()[run1.getIndexForMS2ScanNum(set1Feature.getScan())].getDoubleRetentionTime();
//get time/scan for set 2 feature
double time2;
if (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_SCAN)
time2 = set2Feature.getScan();
else
time2 = run2.getMS2Scans()[run2.getIndexForMS2ScanNum(set2Feature.getScan())].getDoubleRetentionTime();
//translate time to hydrophobicity
double hydro1 =
RegressionUtilities.predictYFromX(
slope1, intercept1, time1);
double hydro2 =
RegressionUtilities.predictYFromX(
slope2, intercept2, time2);
//store the absolute value of the difference
hydroDifferencesList.add(Math.abs(hydro1 - hydro2));
}
}
//Convert the ArrayList to an array. It would sure be nice if there were a better way
double[] hydrophobicityDifferences = new double[hydroDifferencesList.size()];
for (int i = 0; i < hydrophobicityDifferences.length; i++)
{
hydrophobicityDifferences[i] = Math.abs(hydroDifferencesList.get(i));
}
return hydrophobicityDifferences;
}
//Utility methods for getting what we need from a map representing a regression line
/**
* For a given feature, try all possible mass modifications to match against a featureset.
* That is, for each modification, try the low value for _all matching residues_, then
* try the high value. Does _not_ try combinations of low and high.
* So the number of times this method will be run is at most 2^(number of var mods).
* If it finds a match sooner, it'll return.
* The return value is a cloned version of the matched feature, with the peptide added.
* Note: the mass table is altered during execution of this method
* @param feature
* @param currentPeptide
* @param ms1Features
* @param comp
* @param varModArray
* @param startIndex
* @param massTable
* @return a cloned version of the matched feature, with the peptide added
*/
public static Feature recursivelyMatchAllMods(Feature feature, Peptide currentPeptide,
Feature[] ms1Features,
ClusteringFeatureSetMatcher featureSetMatcher,
Comparator<Feature> comp,
MS2Modification[] varModArray,
int startIndex, double[] massTable)
{
if (startIndex >= varModArray.length)
{
feature.setMass((float) currentPeptide.getMass(massTable));
List<Feature> matches = null;// featureSetMatcher.findMatchingFeatures(ms1Features, feature,
// comp);
Feature result = null;
if (matches != null && !matches.isEmpty())
{
//clone the feature and mark it with the peptide
//TODO: here we toss out all matches but the first. Is that ok?
result = (Feature) matches.get(0).clone();
MS2ExtraInfoDef.setSinglePeptide(result,new String(currentPeptide.getChars()));
}
return result;
}
Feature matchedFeature = recursivelyMatchAllMods(feature, currentPeptide, ms1Features,
featureSetMatcher,
comp, varModArray, startIndex+1, massTable);
if (matchedFeature != null)
return matchedFeature;
MS2Modification varMod = varModArray[0];
int indexToModify = (int) varMod.getAminoAcid().charAt(0);
float massDiff = varMod.getMassDiff();
massTable[indexToModify] = massTable[indexToModify] + massDiff;
return recursivelyMatchAllMods(feature, currentPeptide, ms1Features,
featureSetMatcher,
comp, varModArray, startIndex+1, massTable);
}
}