AmtUtilities.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.viewer.amt;

import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet;
import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureClusterer;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef;
import org.fhcrc.cpl.toolbox.proteomics.ProteomicsRegressionUtilities;
import org.fhcrc.cpl.toolbox.statistics.MatrixUtil;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import org.fhcrc.cpl.toolbox.proteomics.*;
import org.apache.log4j.Logger;

import java.util.*;

/**
 * Utilities related to AMT
 */
public class AmtUtilities
{
    private static Logger _log = Logger.getLogger(AmtUtilities.class);

    //name and version of the hydrophobicity algorithm currently used.
    //Values should be stored in normalized form, but it's still important
    //to keep track of what version of the algorithm they came from.
    public static final String HYDROPHOBICITY_ALGORITHM_NAME = "krokhin";
    public static final double HYDROPHOBICITY_ALGORITHM_VERSION = 3.0;

    //the search_engine string to use for pepXML files that contain AMT results
    public static final String AMT_SEARCH_ENGINE_CODE_FOR_PEPXML = "msInspect/AMT";

    public static void recordHydrophobicities(FeatureSet featureSet,
                                              Map<String,Double> regressionLine,
                                              int scanOrTimeMode)
    {
        for (Feature feature : featureSet.getFeatures())
            AmtExtraInfoDef.setObservedHydrophobicity(feature,
                    RegressionUtilities.predictYFromX(getSlopeFromRegressionLine(regressionLine),
                                               getInterceptFromRegressionLine(regressionLine),
                                               (scanOrTimeMode== ProteomicsRegressionUtilities.REGRESSION_MODE_SCAN ?
                                                 feature.getScan() :
                                                 feature.getTime())));
    }

    public static void recordMS1Hydrophobicities(FeatureSet featureSet,
                                                 Map<String,Double> regressionLine,
                                                 int scanOrTimeMode)
    {
        recordHydrophobicities(featureSet, regressionLine, scanOrTimeMode);
    }

    /**
     * Pick numProteins proteins at "random" from a specified list of proteins.
     * It would sure be nice if we didn't have to load all the proteins in order to
     * pick a random sample, but since FastaLoader just provides an iterator, this is
     * about all we've got
     * @param fullProteinArray
     * @param numProteinsToLoad
     * @return
     */
    public static ArrayList<Protein> pickRandomProteins(ArrayList<Protein> fullProteinArray,
                                                        int numProteinsToLoad)
    {
        //cowardly refuse to pull out a random subset larger than the original set
        int fullProteinArraySize = fullProteinArray.size();
        if (fullProteinArraySize < numProteinsToLoad)
            return null;
        ArrayList<Protein> randomProteinArray = new ArrayList<Protein>(numProteinsToLoad);

        Random random = new Random();
        int[] chosenIndexes = new int[numProteinsToLoad];
        Arrays.fill(chosenIndexes,-1);
        for (int i=0; i<numProteinsToLoad; i++)
        {
            boolean foundOne = false;
            int nextIndex = -1;
            //prevent duplicate proteins
            while (!foundOne)
            {
                nextIndex = random.nextInt(fullProteinArraySize);
                if (Arrays.binarySearch(chosenIndexes,nextIndex) < 0)
                    foundOne = true;
            }
            randomProteinArray.add(fullProteinArray.get(nextIndex));
        }
        return randomProteinArray;
    }




    public static double[] getTimesForSortedScanArray(MSRun run,
                                                      int[] sortedScanArray)
    {
        double[] result = new double[sortedScanArray.length];
        if (run == null)
        {
            _log.error("Null run, can't calculate times for scans");
            return null;
        }
        int ms2IndexInRun = 0;
        MSRun.MSScan[] ms2Scans = run.getMS2Scans();
        for (int i = 0; i < sortedScanArray.length; i++)
        {
            int thisFeatureScanNum = sortedScanArray[i];
            //this relies on the fact that featuresForRegression is sorted by scan
            MSRun.MSScan thisFeatureScan = null;
            while (true)
            {
                if (ms2IndexInRun >= ms2Scans.length)
                {
                    //This will throw a NullPointerException down below when we
                    //try to access the retention time.
                    break;
                }
                int evalScanNum = ms2Scans[ms2IndexInRun].getNum();
                if (evalScanNum == thisFeatureScanNum)
                {
                    thisFeatureScan = ms2Scans[ms2IndexInRun];
                    break;
                }
                else if (evalScanNum > thisFeatureScanNum)
                {
                    _log.error("Error!  Read scan number " + evalScanNum + " while attempting to find scan number " + thisFeatureScanNum + ".");
                    _log.error("I.e., scan number " + thisFeatureScanNum + " was not found");
                    _log.error("Probably the wrong mzXml file was provided for pepXml file.");
                    break;
                }
                else
                    ms2IndexInRun++;
            }
            assert(thisFeatureScan != null);
            result[i] = thisFeatureScan.getDoubleRetentionTime();
        }
        return result;
    }

    /**
     * Workhorse method for relating time to hydrophobicity using linear regression,
     * can be used to relate time in terms of hydrophobicity or vice versa.
     *
     * optionally use robust regression, which takes a lot longer
     *
     * @return a hashmap containing three elements, whose keys are defined in constants: the slope,
     * intercept and sigma of the regression
     *
     * Note:  this regression predicts hydrophobicity in terms of time.  This relationship is
     * not reciprocal, due to all sorts of arcane weirdness related to linear regression.
     */
    public static Map<String,Double> calculateTimeHydroRelationshipEitherWay(
            Feature[] featuresForRegression, int scanOrTimeMode,
            boolean predictTimeFromHydro,
            boolean robustRegression)
    {


        Map<String,Double> result = new HashMap<String,Double>();

        //clone because we're sorting
        Feature[] featuresForRegressionClone =
                featuresForRegression.clone();

        if (featuresForRegressionClone.length == 0)
        {
            throw new RuntimeException("AmtUtilities.calculateTimeHydroRelationshipEitherWay: " +
            "regression failed, no eligible features to use for regression.");
        }

        Arrays.sort(featuresForRegressionClone, new Feature.ScanAscComparator());
        double[] hydrophobicities = new double[featuresForRegressionClone.length];
        double[] scansOrTimes = new double[hydrophobicities.length];

        for (int i=0; i<featuresForRegressionClone.length; i++)
        {
            Feature feature = featuresForRegressionClone[i];
            hydrophobicities[i] = calculateNormalizedHydrophobicity(
                    MS2ExtraInfoDef.getFirstPeptide(feature));
            scansOrTimes[i] =
                    (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME) ?
                           feature.getTime() : feature.getScan();
        }


        try
        {
            double regressionLineSlope, regressionLineIntercept, sigma;


            double[] regressionResult;

            if (robustRegression)
                regressionResult = RegressionUtilities.robustRegression(
                    predictTimeFromHydro ? hydrophobicities : scansOrTimes,
                    predictTimeFromHydro ? scansOrTimes : hydrophobicities);
            else
                regressionResult = MatrixUtil.linearRegression(
                        predictTimeFromHydro? hydrophobicities : scansOrTimes,
                        predictTimeFromHydro? scansOrTimes : hydrophobicities);

            regressionLineSlope = regressionResult[1];
            regressionLineIntercept = regressionResult[0];
                        //not currently using sigma for anything, but I have a hunch it'll be crucial
            sigma = MatrixUtil.sigma(hydrophobicities, scansOrTimes, regressionResult);
            result.put(RegressionUtilities.REGRESSION_SLOPE_KEY, regressionLineSlope);
            result.put(RegressionUtilities.REGRESSION_INTERCEPT_KEY, regressionLineIntercept);
            result.put(RegressionUtilities.REGRESSION_SIGMA_KEY, sigma);
        }
        catch (Exception e)
        {
            _log.error("Failure to calculate hydrophobicity->time prediction",e);
        }

        return result;
    }

    /**
     * @return a HashMap containing three elements, whose keys are defined in constants: the slope,
     * intercept and sigma of the regression
     *
     * Note:  this regression predicts time in terms of hydrophobicity.  This relationship is
     * not reciprocal, due to all sorts of arcane weirdness related to linear regression.
     */
    public static Map<String,Double> calculateScanOrTimeHydrophobicityRelationship(
            Feature[] featuresForRegression, int scanOrTimeMode,
            boolean robustRegression)
    {
        return calculateTimeHydroRelationshipEitherWay(featuresForRegression, scanOrTimeMode,
                                                       false,
                                                       robustRegression);
    }

    /**
     * @return a HashMap containing three elements, whose keys are defined in constants: the slope,
     * intercept and sigma of the regression
     *
     * Note:  this regression predicts hydrophobicity in terms of time.  This relationship is
     * not reciprocal, due to all sorts of arcane weirdness related to linear regression.
     */
    public static Map<String,Double> calculateHydrophobicityScanOrTimeRelationship(
            Feature[] featuresForRegression, int scanOrTimeMode,
            boolean robustRegression)
    {
        return calculateTimeHydroRelationshipEitherWay(featuresForRegression, scanOrTimeMode,
                                                       true,
                                                       robustRegression);
    }

    /**
     * Given a peptide, and a line describing the relationship between hydrophobicity and
     * scan number, calculate the hydrophobicity of the peptide and return the predicted scan
     * for that hydrophobicity
     * @param intercept
     * @param peptide
     * @return
     */
    protected static double predictScanOrTime(double slope, double intercept, Peptide peptide)
    {
        return predictScanOrTime(slope, intercept,
                                 calculateNormalizedHydrophobicity(peptide));
    }

    /**
     * Given a hydrophobicity, and a line describing the relationship between hydrophobicity and
     * scan number, return the predicted scan for a given hydrophobicity.
     * @param slope
     * @param intercept
     * @param hydrophobicity
     * @return
     */
    public static double predictScanOrTime(double slope, double intercept, double hydrophobicity)
    {
        double predictedScanOrTime = RegressionUtilities.predictYFromX(slope, intercept, hydrophobicity);
        predictedScanOrTime = Math.max(predictedScanOrTime,0);
        return predictedScanOrTime;
    }

    public static double getSlopeFromRegressionLine(Map<String,Double> regressionLine)
    {
        return regressionLine.get(RegressionUtilities.REGRESSION_SLOPE_KEY);
    }

    public static double getInterceptFromRegressionLine(Map<String,Double> regressionLine)
    {
        return regressionLine.get(RegressionUtilities.REGRESSION_INTERCEPT_KEY);
    }

    /**
     * Predict the scan or time of a feature (depending on what the regression line represents)
     * @param regressionLine
     * @param feature
     * @return
     */
    public static double predictScanOrTime(Map<String,Double> regressionLine,
                                           Feature feature)
    {
        double slope = getSlopeFromRegressionLine(regressionLine);
        double intercept = getInterceptFromRegressionLine(regressionLine);
        return predictScanOrTime(slope, intercept, feature);
    }

    /**
     * Predict the scan or time of a feature (depending on what the regression parameters represent)
     * @param feature
     * @return
     */
    public static double predictScanOrTime(double slope, double intercept, Feature feature)
    {
        //this is lame.  Really peptide should have a constructor that doesn't require a protein
        Protein fakeProtein =
                new Protein("", MS2ExtraInfoDef.getFirstPeptide(feature).getBytes());
        Peptide currentPeptide = new Peptide(fakeProtein,0,fakeProtein.getBytes().length);
        //predict either the scan or the time, based on hydrophobicity.
        //the result will depend on which mode we're in
        return predictScanOrTime(slope, intercept, currentPeptide);
    }

    public static double convertDeltaScanOrTimeToHydro(Map<String,Double> regressionLineMap,
                                                       double scanOrTimeValueToConvert)
    {
        double slope = AmtUtilities.getSlopeFromRegressionLine(regressionLineMap);
        double intercept = AmtUtilities.getInterceptFromRegressionLine(regressionLineMap);
        return convertDeltaScanOrTimeToHydro(slope, intercept, scanOrTimeValueToConvert);
    }

    public static double convertDeltaScanOrTimeToHydro(double slope, double intercept,
                                                       double deltaScanOrTime)
    {
        return RegressionUtilities.predictYFromX(slope, intercept, deltaScanOrTime) -
               RegressionUtilities.predictYFromX(slope, intercept, 0);
    }





    /**
     * Create a peptide object from a peptide sequence.  Useful for finding mass, etc.
     * @param peptideSequence
     * @return
     */
    public static Peptide createPeptideFromSequence(String peptideSequence)
    {
        //this is lame.  Really peptide should have a constructor that doesn't require a protein
        Protein fakeProtein = new Protein("",peptideSequence.getBytes());
        return new Peptide(fakeProtein,0,fakeProtein.getBytes().length);
    }



    //Methods related to calculation and normalization of peptide hydrophobicity.
    //It's all about convenience.


    public static double calculateRawHydrophobicity(String peptideSequence)
    {
        return calculateRawHydrophobicity(createPeptideFromSequence(peptideSequence));
    }

    /**
     * Cover method for Peptide.getHydrophobicity / getHydrophobicity3.
     * Use this to switch between versions of the hydrophobicity calculator
     * @param peptide
     * @return
     */
    public static double calculateRawHydrophobicity(Peptide peptide)
    {
        //return peptide.getHydrophobicity();
        return peptide.getHydrophobicity3();
    }

    public static double normalizeHydrophobicity(double inputHydrophobicity)
    {
        return HydrophobicityNormalizer.normalize(inputHydrophobicity,
                                                  HYDROPHOBICITY_ALGORITHM_NAME,
                                                  HYDROPHOBICITY_ALGORITHM_VERSION);
    }

    public static double calculateNormalizedHydrophobicity(Peptide peptide)
    {
        return normalizeHydrophobicity(calculateRawHydrophobicity(peptide));
    }

    public static double calculateNormalizedHydrophobicity(String peptideSequence)
    {
        return normalizeHydrophobicity(calculateRawHydrophobicity(peptideSequence));
    }

    /**
     * Warning about this one: if there's more than one peptide associated with
     * the feature, it'll take the first 
     * @param feature
     * @return
     */
    public static double calculateNormalizedHydrophobicity(Feature feature)
    {
        return calculateNormalizedHydrophobicity(MS2ExtraInfoDef.getFirstPeptide(feature));
    }




    /**
     * Assumes features have times populated if necessary
     *
     * Given a line that should predict time from  hydrophobicity, indicate by how much
     * (in units of scans or seconds) each feature appears to deviate from that line
     * @param features
     * @param scanOrTimeMode
     * @return
     */
    public static void addHydrophobicityToFeatures(Feature[] features,
                                                   int scanOrTimeMode,
                                                   double[] timeToHCoefficients)
    {
        try
        {
            for (Feature feature : features)
            {
                AmtExtraInfoDef.setObservedHydrophobicity(feature,
                        RegressionUtilities.mapValueUsingCoefficients(timeToHCoefficients,
                            (scanOrTimeMode == ProteomicsRegressionUtilities.REGRESSION_MODE_TIME) ?
                                feature.getTime() : feature.getScan()));
            }
        }
        catch (Exception e)
        {
            ApplicationContext.errorMessage("Error adding hydrophobicity observations to features",e);
        }
    }


    /**
     * Clusterable for clustering based on feature mz and hydrophobicity.
     * Allows access to the original Feature
     */
    public static class FeatureMzHydroClusterable
        extends FeatureClusterer.FeatureClusterable
    {
        public FeatureMzHydroClusterable(Feature feature)
        {
            super(feature);
        }

        public double getDimension1Value()
        {
            return parentFeature.mz;
        }
        public double getDimension2Value()
        {
            return AmtExtraInfoDef.getObservedHydrophobicity(parentFeature);
        }
    }

    /**
     * Clusterable for clustering based on feature mass and hydrophobicity.
     * Allows access to the original Feature
     */
    public static class FeatureMassHydroClusterable
        extends FeatureClusterer.FeatureClusterable
    {
        public FeatureMassHydroClusterable(Feature feature)
        {
            super(feature);
        }

        public double getDimension1Value()
        {
            return parentFeature.mass;
        }
        public double getDimension2Value()
        {
            return AmtExtraInfoDef.getObservedHydrophobicity(parentFeature);
        }
    }
}