/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot; import org.fhcrc.cpl.toolbox.*; import org.fhcrc.cpl.toolbox.statistics.MatrixUtil; import org.fhcrc.cpl.toolbox.statistics.BasicStatistics; import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities; import org.fhcrc.cpl.toolbox.datastructure.Pair; import java.util.List; import java.util.ArrayList; /** * Utilities for recalibrating and characterizing the mass calibration of sets of * features, based on an approach that uses theoretical peptide mass clusters. * * Peptides should have masses with certain predictable qualities. In particular, the * distances between masses should cluster around multiples of a certain number, called * the "mass wavelength". This number would be 1 (the mass of a proton in a Carbon atom), * except that different elements' nucleon masses are different, due to the effect * of the mass defect. So the clustering isn't perfect, but it is pretty good. You can * determine how the deviation from predicted cluster changes as the distance between * masses increases. * * This deviation can be used to calibrate masses in a given run, and it can be used to * toss out wildly deviating masses from runs known to be calibrated correctly. * * For more information, and for the theoretical basis of pretty much everything in here, * check out: * "Analytical model of peptide mass cluster centres with applications" * Wolski, Farrow, et al. * http://www.proteomesci.com/content/4/1/18 */ public class MassCalibrationUtilities { protected static Logger _log = Logger.getLogger(MassCalibrationUtilities.class); //the weighted average of all peptides' mass defect per dalton, at monoisotopic mass. //That is, for the distribution of elements within the average peptide, //the weighted average of the mass defect, divided by the atomic weight //of the light (common) version of the element // public static final double THEORETICAL_MASS_WAVELENGTH = 1.000482; //public static final double THEORETICAL_MASS_WAVELENGTH = 1.000467; public static final double DEFAULT_THEORETICAL_MASS_WAVELENGTH = 1.000476; //empirically derived //The intercept for the line with slope THEORETICAL_MASS_WAVELENGTH, that //maps mass to mass cluster deviation. //I have no great understanding of why this is not zero, but it is not zero. public static final double THEORETICAL_MASS_DEFECT_INTERCEPT = .034; //public static final double THEORETICAL_MASS_DEFECT_INTERCEPT = 0.013; //public static final double THEORETICAL_MASS_DEFECT_INTERCEPT = .0; //maximum number of pairs for use in leverage calculation, when doing //robust regression to determine a run's characteristics. //This is purely a performance consideration. Higher is better, but much higher //than this default value is really, really expensive public static final int DEFAULT_MAX_PAIRS_FOR_LEVERAGE_CALC = 500000; //Parameters related to robust regression public static final double DEFAULT_MAX_LEVERAGE_NUMERATOR_FOR_REGRESSION = 4.0; public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_REGRESSION = 2.3; //default maximum deviation from theoretical mass cluster, for use in filtering. //This default value is derived from the Wolski et al. paper. I think we could //probably go tighter. public static int DEFAULT_MAX_DEVIATION_PPM = 200; public MassCalibrationUtilities() { } /** * Convenience method for a single feature set * @param masses * @return */ public static PanelWithScatterPlot plotMassDefectDeviation(float[] masses, double theoreticalMassWavelength, boolean shouldDisplayPPMLine, double ppmForLine) { List<float[]> massesList = new ArrayList<float[]>(1); massesList.add(masses); return plotMassDefectDeviation(massesList, theoreticalMassWavelength, shouldDisplayPPMLine, ppmForLine); } /** * Plot the mass defect deviations of multiple arrays of masses, in different colors. * Optionally displays lines indicating deviations of ppmForLine parts per million * from theoretical cluster center. * @param massArrays * @return */ public static PanelWithScatterPlot plotMassDefectDeviation(List<float[]> massArrays, double theoreticalMassWavelength, boolean shouldDisplayPPMLine, double ppmForLine) { PanelWithScatterPlot pwsp = new PanelWithScatterPlot(); int setNum=1; for (float[] masses : massArrays) { double[][] scatterPlotData = new double[2][masses.length]; for (int i=0; i<masses.length; i++) { double mass = masses[i]; double massOffset = (mass - THEORETICAL_MASS_DEFECT_INTERCEPT) % theoreticalMassWavelength; if (massOffset >= 0.5) massOffset -= 1; scatterPlotData[0][i] = (float) (mass); scatterPlotData[1][i] = (float) massOffset; } pwsp.addData(scatterPlotData[0], scatterPlotData[1], "Set " + setNum++); } if (shouldDisplayPPMLine) { double minMass = Double.MAX_VALUE; double maxMass = Double.MIN_VALUE; for (float mass : massArrays.get(0)) { if (mass < minMass) minMass = mass; if (mass > maxMass) maxMass = mass; } int numPoints = (int) (maxMass - minMass + 1); double[] linesXValues = new double[2 * numPoints]; double[] linesYValues = new double[2 * numPoints]; for (int i=0; i<numPoints; i++) { linesXValues[i] = i; linesXValues[numPoints + i] = i; linesYValues[i] = ((double)i) * ppmForLine / 1000000.0; linesYValues[numPoints+i] = -linesYValues[i]; } pwsp.addData(linesXValues, linesYValues, ppmForLine + "PPM Deviation"); } pwsp.setAxisLabels("Mass","Mass Cluster Deviation"); return pwsp; } /** * Calibrate masses in a single partition. * * First we calculate the mass wavelength, and the offset from theoretical mass * intercept. Then we adjust each mass. * @param masses * @param maxPairsForLeverageCalc * @param theoreticalMassWavelength */ public static float[] calibrateMassesSinglePartition(float[] masses, int maxPairsForLeverageCalc, double theoreticalMassWavelength) { Pair<Double,Double> wavelengthAndOffset = calculateWavelengthAndOffset(masses, maxPairsForLeverageCalc, theoreticalMassWavelength); double wavelength = wavelengthAndOffset.first; double offset = wavelengthAndOffset.second; double[] deltaMasses = new double[masses.length]; double[] deltaMassesPPM = new double[masses.length]; float[] result = new float[masses.length]; for (int i=0; i<masses.length; i++) { float mass = masses[i]; float newMass = masses[i]; newMass += mass * (theoreticalMassWavelength - wavelength) - offset; deltaMasses[i] = newMass - mass; deltaMassesPPM[i] = deltaMasses[i] * (1000000.0/mass); result[i] = newMass; } if (_log.isDebugEnabled()) { double meanMassChange = BasicStatistics.mean(deltaMasses); double meanMassChangePPM = BasicStatistics.mean(deltaMassesPPM); for (int i=0; i<deltaMasses.length; i++) deltaMasses[i] = Math.abs(deltaMasses[i]); double meanAbsoluteMassChange = BasicStatistics.mean(deltaMasses); _log.debug("Average mass change: " + meanMassChange + " (" + meanMassChangePPM + " ppm)"); _log.debug("Average _absolute_ mass change: " + meanAbsoluteMassChange); } return result; } /** * Use robust regression techniques to map mass to mass offset from theoretical cluster * @param masses * @param maxPairsForLeverageCalc more is better, but more costs more time * @return a Pair containing slope, intercept of the regression line. */ public static Pair<Double,Double> calculateWavelengthAndOffset(float[] masses, int maxPairsForLeverageCalc, double theoreticalMassWavelength) { //calculate the mass defect wavelength. That is, the distance between clusters. //This can be thought of as the slope of the regression line double massDefectWavelength = calculateMassDefectWavelength(masses, maxPairsForLeverageCalc, theoreticalMassWavelength); //calculate mass defect intercept. Once we've got the wavelength, this is just //a matter of taking the average of (the feature masses (mod wavelength)) and //subtracting the theoretical intercept. //At least according to the Wolski paper double[] massDefectIntercepts = new double[masses.length]; for (int i=0; i<masses.length; i++) { double intercept = (masses[i] % massDefectWavelength) - THEORETICAL_MASS_DEFECT_INTERCEPT; if (intercept >= 0.5) intercept = intercept - 1.0; massDefectIntercepts[i] = intercept; } double meanMassDefectIntercept = BasicStatistics.mean(massDefectIntercepts); _log.debug("wavelength and offset: " + massDefectWavelength + ", " + meanMassDefectIntercept); return new Pair<Double,Double>(massDefectWavelength,meanMassDefectIntercept); } /** * Calculate the wavelength; that is, the distance between mass clusters. I.e., the slope * of the line that maps mass to mass defect. * First, find all pairs of features (in reality, this is capped at maxPairsForLeverageCalc, * due to performance considerations -- the problem is just too large. We ensure a representative * sample). Calculate the distances between each pair, and the wavelength for each pair. * Then, toss out the ones with high leverage. * Then, perform a linear regression to map distance to wavelength. Calculate all the residuals, * and toss out the ones with high studentized residuals. Do the regression again and return * the slope, which is the wavelength. The intercept has no meaning in this context. */ public static double calculateMassDefectWavelength(float[] masses, int maxPairsForLeverageCalc, double theoreticalMassWavelength) { Pair<double[],double[]> distancesAndWavelengthsForRegression = findDistancesAndWavelengthResidualsForRegression(masses, maxPairsForLeverageCalc, theoreticalMassWavelength); _log.debug("assembled distances, wavelengths: " + distancesAndWavelengthsForRegression.first.length); double[] distancesForRegression = distancesAndWavelengthsForRegression.first; double[] wavelengthResidualsForRegression = distancesAndWavelengthsForRegression.second; double[] regressionResult = MatrixUtil.linearRegression(distancesForRegression, wavelengthResidualsForRegression); _log.debug("first reg: wavelength: " + regressionResult[1]); int n = distancesForRegression.length; double[] massDefectResiduals = new double[n]; _log.debug("calculateRegressionLine: Pairs after leverage cutoff: " + n); for (int i=0; i<n; i++) { double predictedMassDefect = RegressionUtilities.predictYFromX( regressionResult[1], regressionResult[0], distancesForRegression[i]); massDefectResiduals[i] = Math.abs(wavelengthResidualsForRegression[i] - predictedMassDefect); } double[] studentizedResiduals = BasicStatistics.studentizedResiduals(distancesForRegression, massDefectResiduals); boolean[] pairsInSecondRegression = new boolean[n]; int numInSecondRegression = 0; for (int i=0; i<n; i++) { //todo: parameterize if (studentizedResiduals[i] < DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_REGRESSION) { pairsInSecondRegression[i] = true; numInSecondRegression++; } else pairsInSecondRegression[i] = false; } double[] distancesForSecondRegression = new double[numInSecondRegression]; double[] wavelengthResidualsForSecondRegression = new double[numInSecondRegression]; int index=0; for (int i=0; i<n; i++) { if (pairsInSecondRegression[i]) { distancesForSecondRegression[index] = distancesForRegression[i]; wavelengthResidualsForSecondRegression[index] = wavelengthResidualsForRegression[i]; index++; } } _log.debug("calculateRegressionLine: pairs after stud.res. cutoff: " + numInSecondRegression); regressionResult = MatrixUtil.linearRegression(distancesForSecondRegression, wavelengthResidualsForSecondRegression); //double[] secondRegressionLineXVals = new double[2000]; //double[] secondRegressionLineYVals = new double[secondRegressionLineXVals.length]; //for (int i=0; i<secondRegressionLineXVals.length; i++) //{ // secondRegressionLineXVals[i] = i; // secondRegressionLineYVals[i] = regressionResult[1] * i + regressionResult[0]; //} //ScatterPlotDialog spd2 = // new ScatterPlotDialog(secondRegressionLineXVals, secondRegressionLineYVals, "Second Regression line"); //spd2.addData(distancesForSecondRegression, wavelengthResidualsForSecondRegression, "points after stud res cutoff"); //spd2.setVisible(true); _log.debug("After second regression, wavelength: " + (theoreticalMassWavelength + regressionResult[1])); return theoreticalMassWavelength + regressionResult[1]; } /** * First, find all pairs of features... or, actually, a representative sample of * maxPairsForLeverageCalc pairs, due to performance considerations -- the problem * is just too large. * We ensure a fairly random sample, by figuring out by what factor there are more * pairs than we can handle (x) and then taking every xth pair. This won't ensure * that we actually get maxPairs... pairs, only that we get at least maxPairs.../2. * Anything more complicated gets too intensive, though, due to the sheer number of * pairs... even choosing, say, 10 million random numbers takes way too long. * * Next, calculate the distances between each pair, and the wavelength for each pair. * * Then, calculate the leverage of each (based on distance) and toss out the ones * with high leverage. * * Return what's left * * @param masses * @param maxPairsForLeverageCalc * @return */ public static Pair<double[],double[]> findDistancesAndWavelengthResidualsForRegression( float[] masses, int maxPairsForLeverageCalc, double theoreticalMassWavelength) { int numFeatures = masses.length; int numPairs = 0; numPairs = (numFeatures*(numFeatures-1))/2; ApplicationContext.infoMessage("Total pairs: " + numPairs); int pairInterval = 1; if (numPairs > maxPairsForLeverageCalc) { ApplicationContext.infoMessage("Too many pairs (" + numPairs + "), reducing..."); pairInterval = (Math.round(numPairs / maxPairsForLeverageCalc)); if (pairInterval == 1) pairInterval = 2; //todo: this is stupid. I should be able to calculate this. Why am I so lazy??! _log.debug("Counting reduced pairs"); int numKeptPairs=0; int allPairsIndex=0; for (int i=0; i<numFeatures; i++) { for (int j=0; j<numFeatures; j++) { if (i<j && (allPairsIndex % pairInterval) == 0) { numKeptPairs++; } allPairsIndex++; } } numPairs = numKeptPairs; _log.debug("Done Counting reduced pairs: " + numPairs); } float[] pairWavelengthResiduals = new float[numPairs]; float[] pairDistances = new float[numPairs]; int allPairsIndex=0; int keptPairsIndex=0; for (int i=0; i<numFeatures; i++) { for (int j=0; j<numFeatures; j++) { if (i<j && (allPairsIndex % pairInterval) == 0) { pairDistances[keptPairsIndex] = Math.abs(masses[i] - masses[j]); double thisPairWavelengthResidual = pairDistances[keptPairsIndex] % theoreticalMassWavelength; if (thisPairWavelengthResidual >= 0.5) thisPairWavelengthResidual -= 1; pairWavelengthResiduals[keptPairsIndex++] = (float) thisPairWavelengthResidual; } allPairsIndex++; } } //ScatterPlotDialog spd = new ScatterPlotDialog(pairDistances, pairWavelengths, "all points"); //spd.setVisible(true); double maxLeverage = DEFAULT_MAX_LEVERAGE_NUMERATOR_FOR_REGRESSION / (double) numPairs; float[] leverages = BasicStatistics.leverages(pairDistances); ApplicationContext.setMessage("Calculated leverages"); List<Integer> pairsInFirstRegression = new ArrayList<Integer>(); int numPairsInRegression = 0; for (int i=0; i<numPairs; i++) { if (leverages[i] < maxLeverage) { pairsInFirstRegression.add(i); numPairsInRegression++; } } double[] regressionDistances = new double[numPairsInRegression]; double[] regressionWavelengthResiduals = new double[numPairsInRegression]; int index=0; for (int pairIndex : pairsInFirstRegression) { regressionDistances[index] = pairDistances[pairIndex]; regressionWavelengthResiduals[index] = pairWavelengthResiduals[pairIndex]; index++; } return new Pair<double[],double[]>(regressionDistances, regressionWavelengthResiduals); } /** * Convenience method for PPM conversion * @param mass * @return */ public static double calculateMassDefectDeviationPPM(float mass, double theoreticalMassWavelength) { double massDefectDaltons = calculateMassDefectDeviation(mass, theoreticalMassWavelength); return ((massDefectDaltons * 1000000.0) / mass); } /** * Calculate the deviation from the closest theoretical mass cluster. * Assumes masses are already calibrated correctly * @param mass * @return */ public static double calculateMassDefectDeviation(float mass, double theoreticalMassWavelength) { //mass offset is the mass mod the weighted average of all peptides' //actual mass per dalton double predictedMassDefectDeviation = (mass - THEORETICAL_MASS_DEFECT_INTERCEPT) % (theoreticalMassWavelength); //massOffset is now divided... half the peptides will be above the average, //and those will be just above 0. The other half will be below the average, //and those will be just below 1. //This step moves the ones just below 1 so they're just below 0 if (predictedMassDefectDeviation >= 0.5) predictedMassDefectDeviation -= 1; return predictedMassDefectDeviation; } }