/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.viewer.feature.extraction;

import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.viewer.feature.extraction.strategy.BaseFeatureStrategy;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot;
import org.apache.log4j.Logger;

import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;

/**
 * Default feature scorer.
 * If we ever implement another one, maybe some machinery can be moved
 * up into a base class.
 *
 * Has side effects on scored features: adjusts m/z, and sets kl, peak count,
 * skippedPeaks, comprised (the matched peaks), and mzPeak0.
 *
 * Not thread-safe when keepStatistics is enabled: the statistics lists are
 * plain ArrayLists mutated during scoring.
 */
public class DefaultFeatureScorer implements FeatureScorer
{
    // maximum peaks per feature to consider
    public static final int DEFAULT_MAX_PEAKS_PER_FEATURE = 10;
    protected int _maxPeaksPerFeature = DEFAULT_MAX_PEAKS_PER_FEATURE;

    private static Logger _log = Logger.getLogger(DefaultFeatureScorer.class);

    // Weights for the 2D sum-squares distance in calcSumSquaresDistance2D().
    // Need to pick units to scale by: 0 means good, 1 is pretty bad...
    public static final float SUMSQUARES_MZ_WEIGHT = 6; // 1/6 of a Da is bad
    public static final float SUMSQUARES_SCALED_INTENSITY_WEIGHT = 4; // 1/4 in scaled intensity is bad

    // When true, scoring accumulates the raw (unweighted) per-peak distances
    // into the lists below so plotStatistics() can chart them later.
    protected boolean keepStatistics = false;
    protected List<Float> unscaledMzDistances = new ArrayList<Float>();
    protected List<Float> unscaledIntensityDistances = new ArrayList<Float>();

    /**
     * Score a candidate feature f, and set the list of peaks that it's comprised of.
     * <p/>
     * Expecting charge=0 features and peaks.
     *
     * Walks up from f.mz in steps of 1/|charge| Da, matching each expected
     * isotope position against the closest resampled peak, then:
     *  - recenters f.mz as an intensity-weighted mean of the matched peaks,
     *  - sets f.kl (KL distance vs. the Poisson isotope model),
     *  - sets peak count, skippedPeaks, comprised, mzPeak0.
     *
     * NOTE(review): if f.charge is 0, invAbsCharge is Infinity and the walk is
     * meaningless — callers apparently guarantee nonzero charge; confirm.
     * NOTE(review): if no peak at all matches, weight stays 0 and f.mz becomes
     * NaN (mean/weight), and peptidePeaks[0] is null so f.mzPeak0 throws NPE —
     * presumably the seed peak always matches itself; confirm.
     *
     * TODO: document this better
     *
     * @param f      candidate feature (mutated in place; see above)
     * @param peaks  all candidate peaks, sorted ascending by m/z
     * @return the 2D sum-squares distance score (lower is better)
     */
    public float scoreFeature(Feature f, Spectrum.Peak[] peaks)
    {
        // Tolerance for "same peak", derived from the resampling bucket width
        double maxResampledDistanceBetweenFeatures =
                DefaultPeakCombiner.DEFAULT_MAX_ABS_DISTANCE_BETWEEN_PEAKS /
                        (SpectrumResampler.getResampleFrequency() - 1);

        int absCharge = Math.abs(f.charge);
        float invAbsCharge = 1.0F / absCharge; // isotope spacing in m/z units
        float mass = 0f;
        // Positive (or zero) charge: subtract the proton mass per charge; negative: don't
        if (f.charge >= 0)
            mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * absCharge;
        else
            mass = f.mz * absCharge;

        // Slot i holds the peak matched to isotope i, or null if missing
        Spectrum.Peak[] peptidePeaks = new Spectrum.Peak[_maxPeaksPerFeature];

        // find start position in peak list
        int p = Arrays.binarySearch(peaks, f, Spectrum.comparePeakMzAsc);
        if (p < 0)
            p = -(p + 1);
        p = Math.max(0, p - 1);

        int pLastFound = p;
        float mzP0 = f.mz;
        float sum = 0.0F;               // running total of the scaled signal (normalizer)
        float intensityLast = 0.0F;     // intensity of the most recently matched peak
        float intensityHighest = 0.0F;  // max intensity seen so far across matched peaks
        boolean skippedPeak = false;    // true if a significant unmatched peak sits between matches
        float distSum = 0;              // running sum of m/z errors of strong peaks...
        float distCount = 0;            // ...used to drift-correct the expected position

        for (int Pi = 0; Pi < peptidePeaks.length; Pi++)
        {
            // Expected m/z of isotope Pi, corrected by the average drift observed so far
            float mzPi = mzP0 + Pi * invAbsCharge + (distCount > 0 ? distSum/distCount : 0);
            p = findClosestPeak(peaks, mzPi, p);
            Spectrum.Peak peakFound = peaks[p];
            float dist = Math.abs(peakFound.mz - mzPi);
            //QUESTION: why 2.5?
            boolean found = !peakFound.excluded && Math.abs(dist) < 2.5 * maxResampledDistanceBetweenFeatures;
            if (!found)
            {
                // miss two in a row, call it quits
                if (Pi > 0 && null == peptidePeaks[Pi-1])
                    break;
            }
            else
            {
                intensityHighest = Math.max(peakFound.intensity, intensityHighest);

                // if peak is too BIG stop:
                // CONSIDER do this at end, use fn of signal v. expected?
                // don't like hard cut-offs
                if (Pi > 2 && peakFound.intensity != intensityHighest &&
                        peakFound.intensity > intensityLast * 1.33 + f.median)
                    break;

                // fine-tune anchor MZ: only strong peaks (> half the max) contribute
                // to the drift correction, to avoid noise pulling the anchor around
                if (peakFound.intensity > intensityHighest/2)
                {
                    distSum += dist;
                    distCount++;
                }

                // Any significant peak lying between the last match and this one
                // marks the feature as having skipped peaks
                for (int s = pLastFound + 1; s < p; s++)
                {
                    Spectrum.Peak skipped = peaks[s];
                    if (skipped.intensity > intensityLast / 2 + f.median)
                        skippedPeak = true;
                }

                // record it
                peptidePeaks[Pi] = peakFound;
                pLastFound = p;
                intensityLast = peakFound.intensity;
            }
        }

        // Adjust MZ for feature: intensity-weighted mean of matched peaks,
        // each shifted back to the monoisotopic position
        float mean=0;
        float weight=0;
        for (int i=0 ; i<peptidePeaks.length ; i++)
        {
            //float mi = m0 + i * invCharge;
            if (null == peptidePeaks[i])
                continue;
            mean += (peptidePeaks[i].mz - i*invAbsCharge) * peptidePeaks[i].intensity;
            weight += peptidePeaks[i].intensity;
        }
        f.mz = mean/weight;

        //
        // Calculate quality measures
        //

        // The 'signal' array will hold values for each peak, scaled to sum to 1.
        // Missing peaks are filled in with intensity 0.1 prior to scaling.
        // Maximum of 6 peaks considered.
        // This block also determines which peaks are "counted" in the Feature.peaks count.
        // Only features with intensity >= 1/50th of the most-intense peaks, and greater than twice
        // the median background noise, are counted
        int signalLength = Math.min(6, _maxPeaksPerFeature);
        float[] signal = new float[signalLength];
        int countPeaks = 0;
        for (int i = 0; i < peptidePeaks.length; i++)
        {
            Spectrum.Peak peak = peptidePeaks[i];
            if (null != peak &&
                    (peak.intensity > intensityHighest/50 && peak.intensity > 2 * f.median))
                countPeaks++;
            if (i < signal.length)
            {
                float v = null == peak ? 0 : peptidePeaks[i].intensity;
                signal[i] = Math.max(0.1F, v);
                sum += signal[i];
            }
        }
        // Normalize the signal to a probability distribution
        for (int i = 0; i < signal.length; i++)
            signal[i] /= sum;

        f.kl = Spectrum.KLPoissonDistance(mass, signal);

        //TODO: Do we need another score?
        //f.score = f.kl >= 0 ? f.intensity * (float)-Math.log(f.kl + Math.exp(-1)) : 0F;

        f.setPeaks(countPeaks);
        f.skippedPeaks = skippedPeak;
        f.comprised = peptidePeaks;
        f.mzPeak0 = peptidePeaks[0].mz;

        //
        // What other metrics would be useful
        //
        float sumDist = calcSumSquaresDistance2D(f, signal, peptidePeaks);
        return sumDist;
    }

    /**
     * Assign a score to a particular feature, given a scaled representation of
     * its peak intensities and the peaks themselves. This is kind of a 2-dimensional
     * sum-squares distance measure: for each present peak, the m/z error (vs. the
     * theoretical isotope position) and the intensity error (vs. the Poisson model)
     * are weighted, squared, summed, and finally scaled by the expected intensity
     * of that peak.
     *
     * Lower is better.
     *
     * NOTE(review): assumes peptidePeaks.length >= scaledIntensityDistribution.length
     * (true for the arrays built in scoreFeature) — confirm for other callers.
     *
     * @param f feature being scored (not mutated here)
     * @param scaledIntensityDistribution observed peak intensities scaled to sum to 1
     * @param peptidePeaks matched peaks, null where an isotope slot is missing
     * @return weighted 2D sum-squares distance (lower is better)
     */
    public float calcSumSquaresDistance2D(Feature f, float[] scaledIntensityDistribution,
                                          Spectrum.Peak[] peptidePeaks)
    {
        // Guess mass. don't actually set f.mass, since this may not be the charge this feature
        // ends up having
        float mass = 0f;
        if (f.charge >= 0)
            mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * f.charge;
        else
            mass = f.mz * -f.charge;

        // Get a scaled poisson distribution for this mass
        float[] expectedDistribution = Spectrum.Poisson(mass);

        // Get the index of the last peak actually present. If that's beyond the end of
        // the expected distribution, move it back
        int lastPeak = 0;
        for (int i=0 ; i<scaledIntensityDistribution.length ; i++)
            if (null != peptidePeaks[i])
                lastPeak = i;
        lastPeak = Math.min(lastPeak, expectedDistribution.length-1);

        float sumDist = 0;
        for (int i=0 ; i<=lastPeak ; i++)
        {
            float dist = 1; // what should I use here....
            // only consider peaks that are actually present.
            // TODO: does this give an unfair advantage to features with missing peaks?
            if (null != peptidePeaks[i])
            {
                // compare observed peaks with theoretical peaks based on Dalton offsets
                // from the base feature m/z
                float theoreticalPeakMz = (f.mz + i / (float) Math.abs(f.charge));
                float distMZ = Math.abs(peptidePeaks[i].mz - theoreticalPeakMz);
                // Remove the effect of resampling error on m/z accuracy
                distMZ = Math.max(0, distMZ - 1/(2*SpectrumResampler.getResampleFrequency()));
                float distIn = Math.abs(scaledIntensityDistribution[i] - expectedDistribution[i]);
                if (keepStatistics)
                {
                    unscaledMzDistances.add(distMZ);
                    unscaledIntensityDistances.add(distIn);
                }
                // scale the dimensions appropriately
                distMZ *= SUMSQUARES_MZ_WEIGHT;
                distIn *= SUMSQUARES_SCALED_INTENSITY_WEIGHT;
                dist = distMZ * distMZ + distIn * distIn;
                // Scale the contribution of this peak by the predicted scaled intensity of the peak
                sumDist += dist * expectedDistribution[i];
                //System.err.println("\t\tmass: " + mass + ", distMZ: " + distMZ + ", distIn: " + distIn + ", first int: " + scaledIntensityDistribution[0] + ", first expected int: " + expectedDistribution[0] + ", sumdist: " + sumDist);
            }
        }

        if (_log.isDebugEnabled() && Float.isNaN(sumDist))
        {
            _log.debug("scorePeakDistribution: returning NaN. mz = " + f.mz + ", charge=" + f.charge);
        }
        return sumDist;
    }

    /**
     * Walk forward from index start while the absolute m/z distance to x keeps
     * shrinking; return the index of the closest peak found.
     *
     * NOTE(review): relies on peaks being sorted ascending by m/z (as in
     * scoreFeature) — a local minimum is the global closest only then.
     *
     * @param peaks peaks sorted ascending by m/z
     * @param x     target m/z
     * @param start index to start searching from (must be < peaks.length)
     * @return index of the peak closest to x at or after start
     */
    protected int findClosestPeak(Spectrum.Peak[] peaks, float x, int start)
    {
        float dist = Math.abs(x - peaks[start].mz);
        int i = start + 1;
        for (; i < peaks.length; i++)
        {
            float d = Math.abs(x - peaks[i].mz);
            if (d > dist)
                break; // distances are monotone around the minimum; stop at first increase
            dist = d;
        }
        return i - 1;
    }

    /** @return the maximum number of isotopic peaks considered per feature */
    public int getMaxPeaksPerFeature()
    {
        return _maxPeaksPerFeature;
    }

    /** @param maxPeaksPerFeature maximum number of isotopic peaks to consider per feature */
    public void setMaxPeaksPerFeature(int maxPeaksPerFeature)
    {
        this._maxPeaksPerFeature = maxPeaksPerFeature;
    }

    /** @return whether per-peak distance statistics are being accumulated during scoring */
    public boolean isKeepStatistics()
    {
        return keepStatistics;
    }

    /** @param keepStatistics true to accumulate per-peak distance statistics for plotStatistics() */
    public void setKeepStatistics(boolean keepStatistics)
    {
        this.keepStatistics = keepStatistics;
    }

    /**
     * Display histograms and a scatterplot of the accumulated unscaled m/z and
     * intensity distances. No-op unless keepStatistics was enabled before scoring.
     */
    public void plotStatistics()
    {
        if (!keepStatistics)
            return;
        PanelWithHistogram pwhMz = new PanelWithHistogram(unscaledMzDistances,"Unscaled M/Z Dist");
        pwhMz.displayInTab();
        PanelWithHistogram pwhInt = new PanelWithHistogram(unscaledIntensityDistances,"Unscaled Int Dist");
        pwhInt.displayInTab();
//        float[] mzDistArray = new float[unscaledMzDistances.size()];
//        float[] intDistArray = new float[unscaledMzDistances.size()];
//        for (int i=0; i<unscaledMzDistances.size(); i++)
//        {
//            mzDistArray[i] = unscaledMzDistances.get(i);
//            intDistArray[i] = unscaledIntensityDistances.get(i);
//        }
        PanelWithScatterPlot pwsp = new PanelWithScatterPlot(unscaledMzDistances, unscaledIntensityDistances, "mz vs int dist");
        pwsp.setPointSize(1);
        pwsp.setAxisLabels("m/z distance","intensity distance");
        pwsp.displayInTab();
    }
}