MassCalibrationUtilities.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.toolbox.proteomics;

import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot;
import org.fhcrc.cpl.toolbox.*;
import org.fhcrc.cpl.toolbox.statistics.MatrixUtil;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.datastructure.Pair;


import java.util.List;
import java.util.ArrayList;


/**
 * Utilities for recalibrating and characterizing the mass calibration of sets of
 * features, based on an approach that uses theoretical peptide mass clusters.
 *
 * Peptides should have masses with certain predictable qualities.  In particular, the
 * distances between masses should cluster around multiples of a certain number, called
 * the "mass wavelength".  This number would be 1 (the mass of a proton in a Carbon atom),
 * except that different elements' nucleon masses are different, due to the effect
 * of the mass defect.  So the clustering isn't perfect, but it is pretty good.  You can
 * determine how the deviation from predicted cluster changes as the distance between
 * masses increases.
 *
 * This deviation can be used to calibrate masses in a given run, and it can be used to
 * toss out wildly deviating masses from runs known to be calibrated correctly.
 *
 * For more information, and for the theoretical basis of pretty much everything in here,
 * check out:
 * "Analytical model of peptide mass cluster centres with applications"
 * Wolski, Farrow, et al.
 * http://www.proteomesci.com/content/4/1/18
 */
public class MassCalibrationUtilities
{
    protected static Logger _log = Logger.getLogger(MassCalibrationUtilities.class);

    //the weighted average of all peptides' mass defect per dalton, at monoisotopic mass.
    //That is, for the distribution of elements within the average peptide,
    //the weighted average of the mass defect, divided by the atomic weight
    //of the light (common) version of the element

//    public static final double THEORETICAL_MASS_WAVELENGTH = 1.000482;
    //public static final double THEORETICAL_MASS_WAVELENGTH = 1.000467;
    public static final double DEFAULT_THEORETICAL_MASS_WAVELENGTH = 1.000476; //empirically derived



    //The intercept for the line with slope THEORETICAL_MASS_WAVELENGTH, that
    //maps mass to mass cluster deviation.
    //I have no great understanding of why this is not zero, but it is not zero.
    public static final double THEORETICAL_MASS_DEFECT_INTERCEPT = .034;
//public static final double THEORETICAL_MASS_DEFECT_INTERCEPT = 0.013;
//public static final double THEORETICAL_MASS_DEFECT_INTERCEPT = .0;


    //maximum number of pairs for use in leverage calculation, when doing
    //robust regression to determine a run's characteristics.
    //This is purely a performance consideration.  Higher is better, but much higher
    //than this default value is really, really expensive
    public static final int DEFAULT_MAX_PAIRS_FOR_LEVERAGE_CALC = 500000;

    //Parameters related to robust regression
    public static final double DEFAULT_MAX_LEVERAGE_NUMERATOR_FOR_REGRESSION = 4.0;
    public static final double DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_REGRESSION = 2.3;

    //default maximum deviation from theoretical mass cluster, for use in filtering.
    //This default value is derived from the Wolski et al. paper.  I think we could
    //probably go tighter.
    public static int DEFAULT_MAX_DEVIATION_PPM = 200;


    public MassCalibrationUtilities()
    {
    }

    /**
     * Convenience method for a single feature set
     * @param masses
     * @return
     */
    public static PanelWithScatterPlot plotMassDefectDeviation(float[] masses,
                                                            double theoreticalMassWavelength,
                                                            boolean shouldDisplayPPMLine,
                                                            double ppmForLine)
    {
        List<float[]> massesList = new ArrayList<float[]>(1);
        massesList.add(masses);
        return plotMassDefectDeviation(massesList, theoreticalMassWavelength,
                shouldDisplayPPMLine, ppmForLine);
    }

    /**
     * Plot the mass defect deviations of multiple arrays of masses, in different colors.
     * Optionally displays lines indicating deviations of ppmForLine parts per million
     * from theoretical cluster center.
     * @param massArrays
     * @return
     */
    public static PanelWithScatterPlot plotMassDefectDeviation(List<float[]> massArrays,
                                                            double theoreticalMassWavelength,
                                                            boolean shouldDisplayPPMLine,
                                                            double ppmForLine)
    {
        PanelWithScatterPlot pwsp = new PanelWithScatterPlot();

        int setNum=1;
        for (float[] masses : massArrays)
        {    
            double[][] scatterPlotData = new double[2][masses.length];

            for (int i=0; i<masses.length; i++)
            {
                double mass = masses[i];
                double massOffset = (mass - THEORETICAL_MASS_DEFECT_INTERCEPT)
                        % theoreticalMassWavelength;
                             
                if (massOffset >= 0.5)
                    massOffset -= 1;
                scatterPlotData[0][i] = (float) (mass);
                scatterPlotData[1][i] = (float) massOffset;
            }
            pwsp.addData(scatterPlotData[0], scatterPlotData[1], "Set " + setNum++);
        }

        if (shouldDisplayPPMLine)
        {
            double minMass = Double.MAX_VALUE;
            double maxMass = Double.MIN_VALUE;
            for (float mass : massArrays.get(0))
            {
                if (mass < minMass)
                    minMass = mass;
                if (mass > maxMass)
                    maxMass = mass;
            }
            int numPoints = (int) (maxMass - minMass + 1);
            double[] linesXValues = new double[2 * numPoints];
            double[] linesYValues = new double[2 * numPoints];
            for (int i=0; i<numPoints; i++)
            {
                linesXValues[i] = i;
                linesXValues[numPoints + i] = i;
                linesYValues[i] = ((double)i) * ppmForLine / 1000000.0;
                linesYValues[numPoints+i] = -linesYValues[i];
            }
            pwsp.addData(linesXValues, linesYValues, ppmForLine + "PPM Deviation");
        }

        pwsp.setAxisLabels("Mass","Mass Cluster Deviation");
        return pwsp;
    }

    /**
     * Calibrate masses in a single partition.
     *
     * First we calculate the mass wavelength, and the offset from theoretical mass
     * intercept. Then we adjust each mass.
     * @param masses
     * @param maxPairsForLeverageCalc
     * @param theoreticalMassWavelength
     */
    public static float[] calibrateMassesSinglePartition(float[] masses,
                                                       int maxPairsForLeverageCalc,
                                                       double theoreticalMassWavelength)
    {
        Pair<Double,Double> wavelengthAndOffset =
                calculateWavelengthAndOffset(masses, maxPairsForLeverageCalc, theoreticalMassWavelength);
        double wavelength = wavelengthAndOffset.first;
        double offset = wavelengthAndOffset.second;
        double[] deltaMasses = new double[masses.length];
        double[] deltaMassesPPM = new double[masses.length];

        float[] result = new float[masses.length];
        for (int i=0; i<masses.length; i++)
        {
            float mass = masses[i];
            float newMass = masses[i];
            newMass += mass * (theoreticalMassWavelength - wavelength) - offset;

            deltaMasses[i] = newMass - mass;
            deltaMassesPPM[i] = deltaMasses[i] * (1000000.0/mass);
            result[i] = newMass;
        }
        if (_log.isDebugEnabled())
        {                       
            double meanMassChange = BasicStatistics.mean(deltaMasses);
            double meanMassChangePPM = BasicStatistics.mean(deltaMassesPPM);

            for (int i=0; i<deltaMasses.length; i++)
                deltaMasses[i] = Math.abs(deltaMasses[i]);
            double meanAbsoluteMassChange = BasicStatistics.mean(deltaMasses);
            _log.debug("Average mass change: " + meanMassChange + " (" + meanMassChangePPM + " ppm)");
            _log.debug("Average _absolute_ mass change: " + meanAbsoluteMassChange);             
        }
        return result;
    }

    /**
     * Use robust regression techniques to map mass to mass offset from theoretical cluster
     * @param masses
     * @param maxPairsForLeverageCalc more is better, but more costs more time
     * @return a Pair containing slope, intercept of the regression line.
     */
    public static Pair<Double,Double> calculateWavelengthAndOffset(float[] masses,
                                                              int maxPairsForLeverageCalc,
                                                              double theoreticalMassWavelength)
    {
        //calculate the mass defect wavelength.  That is, the distance between clusters.
        //This can be thought of as the slope of the regression line
        double massDefectWavelength =
                calculateMassDefectWavelength(masses, maxPairsForLeverageCalc, theoreticalMassWavelength);

        //calculate mass defect intercept.  Once we've got the wavelength, this is just
        //a matter of taking the average of (the feature masses (mod wavelength)) and
        //subtracting the theoretical intercept.
        //At least according to the Wolski paper
        double[] massDefectIntercepts = new double[masses.length];
        for (int i=0; i<masses.length; i++)
        {
            double intercept = (masses[i] % massDefectWavelength) -
                    THEORETICAL_MASS_DEFECT_INTERCEPT;
            if (intercept >= 0.5)
                intercept = intercept - 1.0;
            massDefectIntercepts[i] = intercept;
        }

        double meanMassDefectIntercept = BasicStatistics.mean(massDefectIntercepts);

        _log.debug("wavelength and offset: " + massDefectWavelength + ", " + meanMassDefectIntercept);
        return new Pair<Double,Double>(massDefectWavelength,meanMassDefectIntercept);
    }

    /**
     * Calculate the wavelength; that is, the distance between mass clusters.  I.e., the slope
     * of the line that maps mass to mass defect.  
     * First, find all pairs of features (in reality, this is capped at maxPairsForLeverageCalc,
     * due to performance considerations -- the problem is just too large.  We ensure a representative
     * sample).  Calculate the distances between each pair, and the wavelength for each pair.
     * Then, toss out the ones with high leverage.
     * Then, perform a linear regression to map distance to wavelength.  Calculate all the residuals,
     * and toss out the ones with high studentized residuals.  Do the regression again and return
     * the slope, which is the wavelength.  The intercept has no meaning in this context.
     */
    public static double calculateMassDefectWavelength(float[] masses,
                                                       int maxPairsForLeverageCalc,
                                                       double theoreticalMassWavelength)
    {
        Pair<double[],double[]> distancesAndWavelengthsForRegression =
                findDistancesAndWavelengthResidualsForRegression(masses, maxPairsForLeverageCalc,
                        theoreticalMassWavelength);
        _log.debug("assembled distances, wavelengths: " +
                distancesAndWavelengthsForRegression.first.length);

        double[] distancesForRegression = distancesAndWavelengthsForRegression.first;
        double[] wavelengthResidualsForRegression = distancesAndWavelengthsForRegression.second;

        double[] regressionResult =
                MatrixUtil.linearRegression(distancesForRegression,
                        wavelengthResidualsForRegression);
        _log.debug("first reg: wavelength: " + regressionResult[1]);

        int n = distancesForRegression.length;

        double[] massDefectResiduals = new double[n];

        _log.debug("calculateRegressionLine: Pairs after leverage cutoff: " + n);

        for (int i=0; i<n; i++)
        {
            double predictedMassDefect =
                    RegressionUtilities.predictYFromX(
                            regressionResult[1],
                            regressionResult[0],
                            distancesForRegression[i]);
            massDefectResiduals[i] =
                    Math.abs(wavelengthResidualsForRegression[i] - predictedMassDefect);
        }

        double[] studentizedResiduals =
                BasicStatistics.studentizedResiduals(distancesForRegression,
                        massDefectResiduals);

        boolean[] pairsInSecondRegression = new boolean[n];

        int numInSecondRegression = 0;

        for (int i=0; i<n; i++)
        {
            //todo: parameterize
            if (studentizedResiduals[i] < DEFAULT_MAX_STUDENTIZED_RESIDUAL_FOR_REGRESSION)
            {
                pairsInSecondRegression[i] = true;
                numInSecondRegression++;
            }
            else
                pairsInSecondRegression[i] = false;
        }

        double[] distancesForSecondRegression = new double[numInSecondRegression];
        double[] wavelengthResidualsForSecondRegression = new double[numInSecondRegression];

        int index=0;
        for (int i=0; i<n; i++)
        {
            if (pairsInSecondRegression[i])
            {
                distancesForSecondRegression[index] = distancesForRegression[i];
                wavelengthResidualsForSecondRegression[index] =
                        wavelengthResidualsForRegression[i];
                index++;
            }
        }

        _log.debug("calculateRegressionLine: pairs after stud.res. cutoff: " +
                numInSecondRegression);



        regressionResult =
                MatrixUtil.linearRegression(distancesForSecondRegression,
                        wavelengthResidualsForSecondRegression);

//double[] secondRegressionLineXVals = new double[2000];
//double[] secondRegressionLineYVals = new double[secondRegressionLineXVals.length];
//for (int i=0; i<secondRegressionLineXVals.length; i++)
//{
//    secondRegressionLineXVals[i] = i;
//    secondRegressionLineYVals[i] = regressionResult[1] * i + regressionResult[0];
//}
//ScatterPlotDialog spd2 =
//        new ScatterPlotDialog(secondRegressionLineXVals, secondRegressionLineYVals, "Second Regression line");
//spd2.addData(distancesForSecondRegression, wavelengthResidualsForSecondRegression, "points after stud res cutoff");
//spd2.setVisible(true);

        _log.debug("After second regression, wavelength: " +
                (theoreticalMassWavelength + regressionResult[1]));

        return theoreticalMassWavelength + regressionResult[1];
    }

    /**
     * First, find all pairs of features... or, actually, a representative sample of 
     * maxPairsForLeverageCalc pairs, due to performance considerations -- the problem 
     * is just too large.  
     * We ensure a fairly random sample, by figuring out by what factor there are more 
     * pairs than we can handle (x) and then taking every xth pair.  This won't ensure
     * that we actually get maxPairs... pairs, only that we get at least maxPairs.../2.  
     * Anything more complicated gets too intensive, though, due to the sheer number of
     * pairs... even choosing, say, 10 million random numbers takes way too long.  
     *
     * Next, calculate the distances between each pair, and the wavelength for each pair.
     *
     * Then, calculate the leverage of each (based on distance) and toss out the ones 
     * with high leverage.
     *
     * Return what's left
     *
     * @param masses
     * @param maxPairsForLeverageCalc
     * @return
     */
    public static Pair<double[],double[]> findDistancesAndWavelengthResidualsForRegression(
            float[] masses, int maxPairsForLeverageCalc, double theoreticalMassWavelength)
    {
        int numFeatures = masses.length;

        int numPairs = 0;
        
        numPairs = (numFeatures*(numFeatures-1))/2;

        ApplicationContext.infoMessage("Total pairs: " + numPairs);

        int pairInterval = 1;
        if (numPairs > maxPairsForLeverageCalc)
        {
            ApplicationContext.infoMessage("Too many pairs (" + numPairs + "), reducing...");

            pairInterval = (Math.round(numPairs / maxPairsForLeverageCalc));
            if (pairInterval == 1)
                pairInterval = 2;

            //todo: this is stupid.  I should be able to calculate this.  Why am I so lazy??!
            _log.debug("Counting reduced pairs");
            int numKeptPairs=0;
            int allPairsIndex=0;
            for (int i=0; i<numFeatures; i++)
            {
                for (int j=0; j<numFeatures; j++)
                {
                    if (i<j && (allPairsIndex % pairInterval) == 0)
                    {
                        numKeptPairs++;
                    }
                    allPairsIndex++;
                }
            }

            numPairs = numKeptPairs;
            _log.debug("Done Counting reduced pairs: " + numPairs);
        }

        float[] pairWavelengthResiduals = new float[numPairs];
        float[] pairDistances = new float[numPairs];

        int allPairsIndex=0;
        int keptPairsIndex=0;

        for (int i=0; i<numFeatures; i++)
        {
            for (int j=0; j<numFeatures; j++)
            {
                if (i<j && (allPairsIndex % pairInterval) == 0)
                {
                    pairDistances[keptPairsIndex] =
                            Math.abs(masses[i] - masses[j]);

                    double thisPairWavelengthResidual =
                            pairDistances[keptPairsIndex] %
                                    theoreticalMassWavelength;
                    if (thisPairWavelengthResidual >= 0.5)
                        thisPairWavelengthResidual -= 1;
                    pairWavelengthResiduals[keptPairsIndex++] = (float) thisPairWavelengthResidual;
                }
                allPairsIndex++;
            }
        }

//ScatterPlotDialog spd = new ScatterPlotDialog(pairDistances, pairWavelengths, "all points");
//spd.setVisible(true);

        double maxLeverage =
                DEFAULT_MAX_LEVERAGE_NUMERATOR_FOR_REGRESSION / (double) numPairs;
        float[] leverages = BasicStatistics.leverages(pairDistances);

        ApplicationContext.setMessage("Calculated leverages");

        List<Integer> pairsInFirstRegression = new ArrayList<Integer>();
        int numPairsInRegression = 0;
        for (int i=0; i<numPairs; i++)
        {
            if (leverages[i] < maxLeverage)
            {
                pairsInFirstRegression.add(i);
                numPairsInRegression++;
            }
        }

        double[] regressionDistances = new double[numPairsInRegression];
        double[] regressionWavelengthResiduals = new double[numPairsInRegression];
        int index=0;
        for (int pairIndex : pairsInFirstRegression)
        {
                regressionDistances[index] = pairDistances[pairIndex];
                regressionWavelengthResiduals[index] = pairWavelengthResiduals[pairIndex];
                index++;
        }

        return new Pair<double[],double[]>(regressionDistances, regressionWavelengthResiduals);
    }

    /**
     * Convenience method for PPM conversion
     * @param mass
     * @return
     */
    public static double calculateMassDefectDeviationPPM(float mass,
                                                         double theoreticalMassWavelength)
    {
        double massDefectDaltons = calculateMassDefectDeviation(mass, theoreticalMassWavelength);
        return ((massDefectDaltons * 1000000.0) / mass);
    }

    /**
     * Calculate the deviation from the closest theoretical mass cluster.
     * Assumes masses are already calibrated correctly
     * @param mass
     * @return
     */
    public static double calculateMassDefectDeviation(float mass,
                                                      double theoreticalMassWavelength)
    {

        //mass offset is the mass mod the weighted average of all peptides'
        //actual mass per dalton
        double predictedMassDefectDeviation =
                (mass - THEORETICAL_MASS_DEFECT_INTERCEPT) % (theoreticalMassWavelength);

        //massOffset is now divided... half the peptides will be above the average,
        //and those will be just above 0.  The other half will be below the average,
        //and those will be just below 1.
        //This step moves the ones just below 1 so they're just below 0
        if (predictedMassDefectDeviation >= 0.5)
            predictedMassDefectDeviation -= 1;

        return predictedMassDefectDeviation;
    }

}