WaveletPeakExtractor.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fhcrc.cpl.viewer.feature.extraction;

import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.viewer.feature.ExtractMaxima2D;
import org.fhcrc.cpl.viewer.feature.extraction.PeakExtractor;
import org.fhcrc.cpl.viewer.feature.extraction.SmootherCreator;
import org.fhcrc.cpl.toolbox.datastructure.Tree2D;
import org.fhcrc.cpl.toolbox.datastructure.FloatRange;
import org.fhcrc.cpl.toolbox.datastructure.Pair;
import org.fhcrc.cpl.toolbox.proteomics.Scan;
import org.apache.log4j.Logger;

import java.util.*;

/**
 * smooth, ExtractMaxima2D.analyze()
 *   Sort by decreasing intensity
 *   Throw out peaks with < 5 scans
 *   Break up what look like double-humped elution profiles by breaking peaks
 *   at valleys in the profile
 *   For each peak:
 *           -Compute the extents
 *                  -Walking down scans, make sure that the peak's intensity
 *                   is greater than the intensities of the two m/z's around it
 *                      -Make sure that intensity is above threshold
 *                      -If you start going back up in intensity, stop
 *                  -Same for end scan
 *                  -If too short (<5 scans), toss it out
 *            -"Integrate" to determine total intensity: for each scan, add a
 *             block of intensity corresponding to the intensity on that scan
 *             multiplied by half the amount of time between the adjoining scans
 *            -Create a feature to represent the individual peak
 *   Exclude all features initially
 *   Filter features: for each feature:
 *           find all neighbors within 5 scans and 1.1 m/z units
 *           if the m/z's are too close (within 1.5 / _resamplingFrequency, for some reason), no dice
 *           make sure they're lined up, scan-wise
 *           If not enough scans combined in the two features, no dice
 *           If intensities not high enough, no dice
 *           If we got here, unexclude both
 */
public class WaveletPeakExtractor implements PeakExtractor
{
    protected static Logger _log = Logger.getLogger(WaveletPeakExtractor.class);

    protected boolean _peakRidgeWalkSmoothed = DEFAULT_PEAK_RIDGE_WALK_SMOOTHED;

    public static final float DEFAULT_THRESHOLD_OFFSET_WAVELET = 1.0f;
    public static final float DEFAULT_THRESHOLD_OFFSET_SMOOTHED = 0f;

    public static final int DEFAULT_PEAK_LENGTH_REQUIREMENT = 5;

    protected float minRidgeProportionOfMax = .05f;


    //This constant and variable control which level of the wavelet is used.  3, the default, works well for
    //a resampling frequency of 36.
    public static final int DEFAULT_WAVELET_LEVEL = 3;
    protected int waveletLevel = DEFAULT_WAVELET_LEVEL;

    //minimum number of scans
    protected int minPeakScans = DEFAULT_PEAK_LENGTH_REQUIREMENT;

    public Feature[] extractPeakFeatures(Scan[] scans, float[][] spectra,
                                         FloatRange mzRange)
            throws InterruptedException
    {
        // debug counters
        int d_rawPeaks = 0;
        int d_tooShort = 0;
        int d_filtered = 0;

        Thread currentThread = Thread.currentThread();

        int numSpectra = spectra.length;
        int spectrumHeight = spectra[0].length;


        //
        // separate background and signal components
        // we're pretty conservative about what we call background,
        // we can use background and/or median later to filter peaks
        // further.
        //
        BackgroundRemover backgroundRemover = new BackgroundRemover();
        float[][] background = backgroundRemover.removeBackground(spectra);
        float[][] median = backgroundRemover.calculateMedian(spectra);



        float[][] wavelets = extractWavelets(spectra);

        //smooth spectra
        float[][] smoothedSpectra = new float[wavelets.length][];
        for (int s = 0; s < wavelets.length; s++)
            smoothedSpectra[s] = wavelets[s].clone();
        SmootherCreator.getThresholdSmoother().smooth(smoothedSpectra);

        //
        // extract all maxima from a time-smoothed version of the wavelet transformed data
        // we have a lot of versions of the data now, but it's helpful to keep it all around
        //
        Spectrum.Peak[] rawPeaks =
                ExtractMaxima2D.analyze(smoothedSpectra, mzRange.min, SpectrumResampler.getResampleInterval(),
                        null, 0.0F);

        int numRawPeaks = rawPeaks.length;

        // ???? if no raw peaks to process, bail with empty collection to avoid out
        // of bounds errors
        if ( 0 == numRawPeaks )
            return new Feature[0];

        Arrays.sort(rawPeaks, Spectrum.comparePeakIntensityDesc);
        if (_log.isDebugEnabled())
        {
            _log.debug("raw peaks: " + rawPeaks.length);
            if (rawPeaks.length > 0)
                for (int i = 0; i < 10; i++)
                    _log.debug("  " + (i) + ": " +
                            rawPeaks[rawPeaks.length * i / 100].getIntensity());
            if (rawPeaks.length > 0)
                for (int i = 1; i < 10; i++)
                    _log.debug("  " + (i * 10) + ": " +
                            rawPeaks[rawPeaks.length * i / 10].getIntensity());
            _log.debug(" 100: " + rawPeaks[rawPeaks.length - 1].getIntensity());
        }


        //
        // We probably have a LOT of peaks at this point.  95% are extremely small.
        // The size<shortPeak check in this loop will throw out most, and I
        // don't need to be too aggressive here
        //
        // NOTE: this loop also tries to handle "double humped elution profiles"
        // by breaking peaks at "valleys" in the elution profile.  This could perhaps
        // be improved upon with a second pass that specifically handles this case and
        // makes sure that in a region the splitting is consistent.
        //

        float[][] ridge;
        float thresholdOffset;

        if (_peakRidgeWalkSmoothed)
        {
            // Use smoothed peaks for ridge-walking
            ridge = smoothedSpectra;
            thresholdOffset = DEFAULT_THRESHOLD_OFFSET_SMOOTHED;
        }
        else
        {
            // Use wavelet peaks for ridge-walking
            ridge = wavelets;
            thresholdOffset = DEFAULT_THRESHOLD_OFFSET_WAVELET;
        }

        ArrayList<Feature> waveletPeaks = new ArrayList<Feature>();
        for (int i = 0, max = rawPeaks.length; i < max; i++)
        {
            Spectrum.Peak peak = rawPeaks[i];
            int imz = Math.round((peak.mz - mzRange.min) * SpectrumResampler.getResampleFrequency());

            if (imz < 2 || imz >= spectrumHeight - 2)
                continue;
            int scan = peak.scan;
            float peakIn = spectra[peak.scan][imz];
            float peakBg = background[peak.scan][imz];
            float peakMd = median[peak.scan][imz];

            // compute extent of this feature
            // this is complicated by the fact that we may have
            // double-humped peaks we need to separate
            int scanStart = scan;
            int scanEnd = scan;
            //todo: everything in the thresholdIn calculation is questionable and performs rather badly for metabolite data.
            //It's quite easy to get multiple features that overlap, and multiple features for the same m/z that
            //don't really seem to be separated by a valley.
            float thresholdIn = thresholdOffset + 0.5F * background[scan][imz] +
                    2 * median[scan][imz];
            thresholdIn = Math.max(peakIn * minRidgeProportionOfMax, thresholdIn);
            float minIn = Float.MAX_VALUE;
            int scanMinIn = scan;

            for (; scanStart > 0; scanStart--)
            {
                float spectraIn = spectra[scanStart][imz];
                // are we still on the ridge and > 5% of max
                if (spectraIn < thresholdIn ||
                        ridge[scanStart][imz] < ridge[scanStart][imz - 2] ||
                        ridge[scanStart][imz] < ridge[scanStart][imz + 2])
                {
                    scanStart++;
                    break;
                }
                // have we crossed a 'valley'
                float smoothIn = smoothedSpectra[scanStart][imz];
                if (smoothIn < minIn)
                {
                    minIn = spectraIn;
                    scanMinIn = scanStart;
                }
                else if (smoothIn > minIn * 1.1 + thresholdIn) // made up cutoff
                {
                    scanStart = scanMinIn;
                    break;
                }
            }

            minIn = Float.MAX_VALUE;
            scanMinIn = scan;
            for (; scanEnd < numSpectra - 1; scanEnd++)
            {
                float spectraIn = spectra[scanEnd][imz];
                // are we still on the ridge and > 5% of max
                if (spectraIn < thresholdIn ||
                        ridge[scanEnd][imz] < ridge[scanEnd][imz - 2] ||
                        ridge[scanEnd][imz] < ridge[scanEnd][imz + 2])
                {
                    scanEnd--;
                    break;
                }
                // have we crossed a 'valley'
                float smoothIn = smoothedSpectra[scanEnd][imz];
                if (smoothIn < minIn)
                {
                    minIn = spectraIn;
                    scanMinIn = scanEnd;
                }
                else if (smoothIn > minIn * 1.1 + thresholdIn) // made up cutoff
                {
                    scanEnd = scanMinIn;
                    break;
                }
            }


            if (scanEnd - scanStart + 1 < minPeakScans)
            {
                d_tooShort++;
                continue;
            }

            // total intensity calculation
            // "integrate" intensity over time
            float totalIn = 0;
            for (int s = scanStart; s <= scanEnd; s++)
            {
                double in = spectra[s][imz];
                double time = 1.0;
                if (s > 0 && s + 1 < scans.length)
                    time = (scans[s + 1].getDoubleRetentionTime() -
                            scans[s - 1].getDoubleRetentionTime()) /
                            2;
                totalIn += time * in;
            }

            Feature f = new Feature(peak.scan, scanStart, scanEnd,
                    peak.mz, peakIn, 0, -1, 0);
            f.background = peakBg;
            f.median = peakMd;
            f.totalIntensity = totalIn;
            waveletPeaks.add(f);
        }

        if (currentThread.isInterrupted())
            throw new InterruptedException();

        Collections.sort(waveletPeaks, Spectrum.compareFeatureLengthDesc);

        // some stats
        if (_log.isDebugEnabled())
        {
            _log.debug("FEATURE LENGTHS " + waveletPeaks.size() + " peaks");
            if (waveletPeaks.size() > 0)
            {
                for (int i = 0; i < 10; i++)
                    _log.debug("  " + (i) + ": " +
                            rawPeaks[waveletPeaks.size() * i / 100].getIntensity());
            }
            if (waveletPeaks.size() > 0)
            {
                for (int i = 1; i < 10; i++)
                    _log.debug("  " + (i * 10) + ": " +
                            rawPeaks[waveletPeaks.size() * i / 10].getIntensity());
            }
        }

        // OK, up the bar for a feature (min length of at largest peak in the feature)
        minPeakScans++;
        _log.debug("SHORT PEAK " + minPeakScans);

        //
        // Filtering
        //
        // Sticking with the idea of persistance, eliminate peaks
        // that don't seem to have correllated peaks
        //

        Tree2D tree = new Tree2D();
        for (Feature f : waveletPeaks)
        {
            f.excluded = true;
            tree.add(f.scan, f.mz, f);
        }
        Collections.sort(waveletPeaks, Spectrum.compareFeatureLengthDesc);
//QUESTION: so only features with other related features get included... so how do we
//end up with any single-peak features?

        for (Feature f : waveletPeaks)
        {
            int lenF = f.getScanLast() - f.getScanFirst() + 1;
            ArrayList<Feature> l = (ArrayList<Feature>)
                    tree.getPoints(f.scan - 5, f.mz - 1.1F, f.scan + 5, f.mz + 1.1F);
//too much detail
//            _log.debug("Evaluating feature with " + l.size() + " neighbors: " + f);
            for (Feature neighbor : l)
            {
                int lenN = neighbor.getScanLast() - neighbor.getScanFirst() + 1;
                // we want features of different masses
//QUESTION: what is the significance of 1.5?
                if (Math.abs(neighbor.mz - f.mz) < 1.5 / SpectrumResampler.getResampleFrequency())
                {
                    _log.debug("\tneighbor too close in m/z (" + Math.abs(neighbor.mz - f.mz) + ")");
                    continue;
                }
                // are centers aligned
                int s = Math.max(f.getScanFirst(), neighbor.getScanFirst());
                int e = Math.min(f.getScanLast(), neighbor.getScanLast());
                if (f.scan < s || f.scan > e || neighbor.scan < s || neighbor.scan > e)
                {
//too much detail
//                          _log.debug("\tneighbor scans not aligned");
                    continue;
                }
                // if neither is chosen yet, add a small hurdle
                if (f.excluded && neighbor.excluded)
                {
//QUESTION: why checking the total number of scans in both?
                    if (lenF + lenN < minPeakScans)
                    {
//too much detail
//                      _log.debug("\tneighbor combined scans too short");
                        continue;
                    }
                    if (f.intensity < 1 + 0.5 * f.background + 3 * Math.max(1, f.median) &&
                            neighbor.intensity < 1 + 0.5 * f.background +
                                    3 * Math.max(1, neighbor.median))
                    {
//too much detail
//                      _log.debug("\tneighbor too low-intensity");
                        continue;
                    }
                }
                f.excluded = false;
                neighbor.excluded = false;
//too much detail
//              _log.debug("\tSuccess!");
            }
        }
        ArrayList<Feature> copy = new ArrayList<Feature>();
        for (Feature f : waveletPeaks)
        {
            if (!f.excluded)
            {
                copy.add(f);
//too much detail
//              _log.debug("Final list: adding: " + f);
            }
            else
                d_filtered++;
        }
        waveletPeaks = copy;

        Feature[] result =
                waveletPeaks.toArray(new Feature[waveletPeaks.size()]);
        return result;
    }


    /**
     *
     *         the D3 matrix is used by default to locate peaks,
     *         the D3 processing effectively sharpens the raw data, this does a great job of cleaning
     *         up messy data, e.g. wide/overlapping peaks, and stray peaks in centroided data
     *
     *         why D3?  This is based on our default resampling frequency 1Da/36 and our max expected charge
     *         which I'm presuming to be 6.  Level three represents a feature width of 2^3=8, which
     *         is near to 36/6.  A big change in resampling or desired max charge, might require a
     *         change.
     *
     * dhmay changing 20100803 to use waveletLevel rather than always using 3
     * @param spectra
     * @return
     */
    public float[][] extractWavelets(float[][] spectra)
    {
        int numSpectra = spectra.length;

        float[][] wavelets = new float[numSpectra][];
        for (int s = 0; s < spectra.length; s++)
            wavelets[s] = Spectrum.WaveletDX(spectra[s], null, waveletLevel);
        return wavelets;
    }


    /**
     * Assign the extra necessary info to a set of features representing peaks
     * so that they can be written to a file
     * @param peakFeatures
     * @param scans
     */
    public void prettyPeakFeaturesForOutput(Collection<Feature> peakFeatures,
                                            Scan[] scans)
    {
        for (Feature peak : peakFeatures)
        {
            // fix up scan
            Scan scan = scans[peak.scan];
            peak.setTime((float)scan.getDoubleRetentionTime());
            peak.scan = scan.getNum();
            peak.scanFirst = scans[peak.scanFirst].getNum();
            peak.scanLast = scans[peak.scanLast].getNum();
            peak.setPeaks(1); // otherwise these will get filtered by default
        }
    }





    public void setShortPeak(int minPeakScans)
    {
        this.minPeakScans = minPeakScans;
    }

    public int getShortPeak()
    {
        return minPeakScans;
    }

    public boolean isPeakRidgeWalkSmoothed()
    {
        return _peakRidgeWalkSmoothed;
    }

    public void setPeakRidgeWalkSmoothed(boolean peakRidgeWalkSmoothed)
    {
        _peakRidgeWalkSmoothed = peakRidgeWalkSmoothed;
    }

    public int getWaveletLevel() {
        return waveletLevel;
    }

    public void setWaveletLevel(int waveletLevel) {
        this.waveletLevel = waveletLevel;
    }
}