FeatureStrategyUsingWindow.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.viewer.feature;

import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.viewer.feature.FeatureExtractor;
import org.fhcrc.cpl.viewer.feature.extraction.SpectrumResampler;
import org.fhcrc.cpl.toolbox.proteomics.MSRun;
import org.fhcrc.cpl.toolbox.datastructure.Tree2D;
import org.fhcrc.cpl.toolbox.proteomics.Scan;
import org.fhcrc.cpl.toolbox.datastructure.FloatRange;

import java.util.*;

/**
 * User: mbellew
 * Date: Sep 7, 2004
 * Time: 12:10:06 PM
 */
public class FeatureStrategyUsingWindow extends FeatureExtractor
{
    static Logger _log = Logger.getLogger(FeatureStrategyUsingWindow.class);
    static Feature[] _emptyFeatureArray = new Feature[0];

    Scan[] _scans;
    protected double _noise = 1.0;

    static double debugMZ = 0;

    public FeatureStrategyUsingWindow(MSRun run, int scan, int count,
                                      int maxCharge, FloatRange range, double sn)
    {
        super(run, scan, count, maxCharge, range, sn);
        _scans = getScans(run, scan, count);
    }


    public Feature[] _analyze() throws InterruptedException
    {
        return analyzeScanAtATime(_scans);
    }


    protected float[] _smooth(float[] s)
    {
        return Spectrum.FFTsmooth(s, 6.0, false);
    }


    protected Collection<Feature> analyze1D(Scan scan)
    {
        Spectrum.Peak[] peaks = _pickPeaks(scan);

        return ExtractPeptideFeatures(_run, peaks);//, scan);
    }


    protected Spectrum.Peak[] _pickPeaks(Scan scan)
    {
        float[][] rawSpectrum = scan.getSpectrum();
        float[][] spectrum = Spectrum.ResampleSpectrum(rawSpectrum, _mzRange,
                SpectrumResampler.getResampleFrequency(), false);
        spectrum[1] = _smooth(spectrum[1]);

        // not sure if this is technically noise, we've already smoothed
        // and perhaps removed background,
        // but it's what we consider a small peak
        _noise = Spectrum.Noise(spectrum[1], 0, spectrum[0].length);
        _noise = Math.max(_noise, 0.1);

        //
        // Generate Peaks
        //
        Spectrum.Peak[] peaks = Spectrum.PickPeaks(spectrum, _noise);
        for (Spectrum.Peak peak : peaks)
            peak.scan = scan.getNum();
        return peaks;
    }

    /**
     * For each single-peak feature:
     * Consider all features in a small window around it
     *       If multiple have the same m/z, take the one with the lower scan, discard the others
     *       Toss out anything < 1/10 the intensity of the highest-intense peak
     *       Consider all possible charge states:
     *               For each peak, determine the mass distance from the highest-intense peak
     *               If within tolerance, create a new multi-peak feature, make sure it contains
     *               the highest-intensity peak, score it (a complicated process),
     *               add it to a list of candidates
     *       Pick the best charge state and create a feature
     *
     * @param run
     * @param peaksIN
     * @return
     */
    protected ArrayList<Feature> ExtractPeptideFeatures(MSRun run, Spectrum.Peak[] peaksIN)
    {
        _logDebug("ExtractPeptideFeatures(" + peaksIN.length + ")");

        //
        // store peaks in 2-D tree for searching
        //
        Tree2D peaks2D = new Tree2D();
        for (Spectrum.Peak p : peaksIN)
        {
            peaks2D.add(p.scan, p.mz, p);
        }

        // OUTPUT list
        ArrayList<Feature> featureList = new ArrayList<Feature>();

        Spectrum.Peak[] peaksByIntensity = peaksIN.clone();
        Arrays.sort(peaksByIntensity, Spectrum.comparePeakIntensityDesc);

        //
        // UNDONE: PROBLEM WITH VERY LARGE 1+ FEATURES
        // in addition to all the other problems, we've observed a strange
        // phenomenon with very large peaks in 1+ charge features.  There may be an "echo"
        // of the feature at about .5-.7 daltons after the first, and sometimes second peak.
        // Thie echo can be very large (as large as the second "real" peak).  Moreover, the
        // echo seems to shift or compress the previous peak downward in MZ.  If this effect
        // is large enough it can break the corret identification of the main feature.
        //
        // There's probably a name for this phenomenon, but I don't know what it is.
        //
        // TODO: ignore the "echo" feature
        // TODO: correct for the shift in the main peaks
        //

        _logDebug("ExtractPeptideFeatures: using noise = " + _noise);
        for (Spectrum.Peak peakHighest : peaksByIntensity)
        {
            if (peakHighest.excluded)
                continue;

            boolean debugPeak = false;
            if (debugMZ != 0)
            {
                // note the 100-140 check will put the feature in the middle of the detail pane
                if(Math.abs(peakHighest.mz - debugMZ) < 1 && peakHighest.scan > 100 &&
                        peakHighest.scan < 140)
                    debugPeak = true;
            }

            float mzWindowStart = peakHighest.mz - 2.1F;
            float mzWindowEnd = peakHighest.mz + 6.1F;
            float scanStart = peakHighest.scan - 9;
            float scanEnd = peakHighest.scan + 9;

            // get collection of nearby peaks
            Spectrum.Peak[] peaks;
            {
                ArrayList<Spectrum.Peak> peakList =
                        peaks2D.getPoints(scanStart, mzWindowStart, scanEnd, mzWindowEnd);
                for (int p = peakList.size()-1; p>=0 ; p--)
                {
                    if ((peakList.get(p)).excluded)
                        peakList.remove(p);
                }
                // eliminate possible duplicates and get one list of peaks sorted by mz
                Spectrum.Peak[] peaksNear = peakList.toArray(new Spectrum.Peak[peakList.size()]);
                assert peaksNear.length > 0;
                Arrays.sort(peaksNear, Spectrum.comparePeakIntensityDesc);
                Arrays.sort(peaksNear, Spectrum.comparePeakMzAsc);
                int lenPeaks = 0;
                for (int s = 1; s < peaksNear.length; s++)
                {
                    if (peaksNear[lenPeaks].mz == peaksNear[s].mz)
                    {
                        if (Math.abs(peaksNear[s].scan-peakHighest.scan) <
                                Math.abs(peaksNear[lenPeaks].scan-peakHighest.scan))
                            peaksNear[lenPeaks] = peaksNear[s];
                    }
                    else
                    {
                        peaksNear[++lenPeaks] = peaksNear[s];
                    }
                }
                ++lenPeaks;
                peaks = new Spectrum.Peak[lenPeaks];
                System.arraycopy(peaksNear, 0, peaks, 0, lenPeaks);
            }

            if (debugPeak)
            {
                _logDebug("\n\npeaks");
                for (Spectrum.Peak peak : peaks)
                {
                    _logDebug((peak == peakHighest ? " *" : "  ") + peak.toString() +
                            "\t" + (peak.mz - peakHighest.mz));
                }
                _logDebug("scored features");
            }

            //
            // generate candidate features
            //

            ArrayList<Feature> candidates = new ArrayList<Feature>();

            //dhmay -- I don't really understand this parameter, why it has this exact value
            double delta = 1.0 / (SpectrumResampler.getResampleFrequency() - 1);

            for (Spectrum.Peak p : peaks)
            {
                if (p.mz > peakHighest.mz)
                    break;
                if (p.excluded)
                    continue;
                // even at high end of mass range (6400amu) leading peak is not expected
                // to be smaller than about 1/6 highest peak, and typically much larger
                if (p.intensity < peakHighest.intensity / 10.0F)
                    continue;
                double distance = peakHighest.mz - p.mz;
                for (int charge = _maxCharge; charge >= 1; charge--)
                {
                    double r = distanceNearestFraction(distance, charge);
                    if (r < 2 * delta)
                    {
                        Feature f = newFeatureRange(peakHighest, p.mz, charge);
                        ScoreFeature(f, peaks);

                        if (f.peaks == 1)
                            continue;

                        boolean containsPeakHighest = false;
                        for (Spectrum.Peak comprisedPeak : f.comprised)
                        {
                            if (comprisedPeak == peakHighest)
                            {
                                containsPeakHighest = true;
                                break;
                            }
                        }
                        if (!containsPeakHighest)
                            continue;

                        candidates.add(f);
                    }
                }
            }

            Feature bestFeature = BestFeature(candidates);

            if (bestFeature == null)
            {
                // 0+
                bestFeature = newFeatureRange(peakHighest, peakHighest.mz, 0);
                bestFeature.comprised = new Spectrum.Peak[] {peakHighest};
            }

            // CONSIDER: use expected largest peak instead of largest actual peak
            if (peakHighest instanceof Feature)
                bestFeature.totalIntensity = ((Feature)peakHighest).totalIntensity;

            //
            // CONSIDER: back check against found features
            //   could this be part of one of those??
            //

            //
            // UNDONE: quality tests, for now we output everything for evaluation
            //
            QualityTest:
            {
                if (bestFeature.getPeaks() == 1)
                {
                    bestFeature.setCharge(0);
                    bestFeature.setKl(-1);
                }

                // Now that charge is set, compute mass
                bestFeature.updateMass();

                if (debugPeak)
                {
                    _logDebug("bestFeature");
                    _logDebug("  " + bestFeature.toString());
                }

                // Got a good one

                //
                // fix ups
                //
                if (peakHighest instanceof Feature && 0 != ((Feature)peakHighest).getTime())
                    bestFeature.setTime(((Feature)peakHighest).getTime());
                else
                {
                    try
                    {
                        // sometimes we're not handed real scan numbers so catch()
                        int index = run.getIndexForScanNum(peakHighest.scan);
                        if (index < 0)
                            index = -(index+1);
                        bestFeature.setTime((float)run.getScan(index).getDoubleRetentionTime());
                    }
                    catch (Throwable t)
                    {
                    }
                }

                featureList.add(bestFeature);
            }

            //
            // exclude all peaks explained by bestFeature (even if we don't output it)
            //
            if (debugPeak)
                _logDebug("exclude peaks");

            for (Spectrum.Peak p : bestFeature.comprised)
            {
                if (null != p)
                {
                    p.excluded = true;
                    if (debugPeak)
                        _logDebug("  " + p.toString());
                }
            }
            assert peakHighest.excluded;
        }

        return featureList;
    }


    private Feature newFeatureRange(Spectrum.Peak peakHighest, float mz, int charge)
    {
        Feature f;
        if (peakHighest instanceof Feature)
            f = new Feature((Feature)peakHighest);
        else
            f = new Feature(peakHighest);
        f.charge = charge;
        f.mz = mz;
        return f;
    }



    /**
     * I used to do this pairwise, but had problem with weird
     * non-trasitive comparison heurisitics.
     *
     * This could undoubtably be improved, maybe with a combined
     * scoring function fn(charge, peaks, kl)?  or new binary
     * feature comparision BetterFeature(a,b)
     *
     * @param candidates
     * @return
     */
    static Feature BestFeature(ArrayList<Feature> candidates)
    {
        if (candidates.size() == 0)
            return null;

        //
        // Based on very little obvservation, it seems that the distribution
        // of KL scores tends to be bimodal.  There are the terrible scores
        // for guess that are hopelessly wrong, and decent scores for features
        // with the right charge (or factor of the right charge)
        //

        //
        // Toss the bad scores. .5 is arbitrary (and maybe too big).
        // It's hard to choose dynamically as the input set is often small
        //

        double min = candidates.get(0).kl;
        double max = min;
        for (Feature candidate : candidates)
        {
            min = Math.min(candidate.kl, min);
            max= Math.max(candidate.kl, max);
        }
        for (int i=candidates.size()-1 ; i>=0 ; i--)
        {
            Feature candidate = candidates.get(i);
            if (candidate.kl > min + 0.5)
                candidates.remove(i);
        }
        if (candidates.size() == 1)
            return candidates.get(0);

        //
        // other heuristics given a set of "not awful" features
        //

        Set<Feature> prunedSet = new HashSet<Feature>(candidates);
        // NOTE we rely on the candidate list being sorted by MZ ASC,CHARGE DESC,
        // see ExtractPeptideFeatures
        for (int i=0 ; i<candidates.size()-1 ; i++)
        {
            Feature a = candidates.get(i);
            for (int j=i+1 ; j<candidates.size() ; j++)
            {
                Feature b = candidates.get(j);
                assert a.mzPeak0 < b.mzPeak0  || a.charge > b.charge;   // see note above

                // if a "contains" b, remove b
                // CONSIDER: kl difference limiter (relying on filter above for now)
                if (a.charge % b.charge == 0 && a.ContainsPeak(b.comprised[0]))
                {
                    // if the test above is true, this test should be fairly redundant
                    if (a.peaks > b.peaks || a.peaks >= 6)
                        prunedSet.remove(b);
                }
            }
        }
        if (prunedSet.size() == 1)
            return prunedSet.iterator().next();

        //
        // we've still got more than one candidate!
        // probably can't explain all relatively big peaks with just one feature...
        //

        // Here's where we need a good secondary score function
        // How do we weight peaks, kl, sum(intensity)
        // in practice it's usually an easy choice (the peaks penalty only rarely changes result)

        Feature[] finalists = prunedSet.toArray(_emptyFeatureArray);
        Arrays.sort(finalists, new Comparator<Feature>()
        {
            double _score(Feature f)
            {
                return f.peaks * 0.1 - f.kl;
            }
            public int compare(Feature o1, Feature o2)
            {
                double score1 = _score(o1);
                double score2 = _score(o2);
                return score1 < score2 ? 1 : score1 > score2 ? -1 : 0; // sort DESC by score
            }
        });

        for (int i=0 ; i<finalists.length-1 ; i++)
            finalists[i].next = finalists[i+1];
        return finalists[0];
    }


    static int _closest(Spectrum.Peak[] peaks, float x, int start)
    {
        float dist = Math.abs(x - peaks[start].mz);
        int i = start + 1;
        for (; i < peaks.length; i++)
        {
            float d = Math.abs(x - peaks[i].mz);
            if (d > dist)
                break;
            dist = d;
        }
        return i - 1;
    }


    /**
     * score a candidate feature f
     * <p/>
     * Expecting charge=0 features and peaks
     *
     * TODO: document this better
     *
     * @param f
     * @param peaks
     */
    public void ScoreFeature(Feature f, Spectrum.Peak[] peaks)
    {
        int charge = f.charge;
        float invCharge = 1.0F / charge;
        float mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * charge;
        float[] expected = Spectrum.Poisson(mass);

        Spectrum.Peak[] peptidePeaks = new Spectrum.Peak[10];

        // find start position in peak list
        int p = Arrays.binarySearch(peaks, f, Spectrum.comparePeakMzAsc);
        if (p < 0) p = -(p + 1);
        p = Math.max(0, p - 1);
        int pLastFound = p;

        float mzP0 = f.mz;
        float sum = 0.0F;
        float intensityLast = 0.0F;
        float intensityHighest = 0.0F;
        boolean skippedPeak = false;
        float distSum = 0;
        float distCount = 0;

        //dhmay -- I don't really understand this parameter, why it has this exact value
        double delta = 1.0 / (SpectrumResampler.getResampleFrequency() - 1);

        for (int Pi = 0; Pi < peptidePeaks.length; Pi++)
        {
            float mzPi = mzP0 + Pi * invCharge + (distCount > 0 ? distSum/distCount : 0);
            p = _closest(peaks, mzPi, p);
            Spectrum.Peak peakFound = peaks[p];
            float dist = Math.abs(peakFound.mz - mzPi);

            boolean found = !peakFound.excluded && Math.abs(dist) < 2.5 * delta;


            if (!found)
            {
                // miss two in a row, call it quits
                if (Pi > 0 && null == peptidePeaks[Pi-1])
                    break;
            }
            else
            {
                intensityHighest = Math.max(peakFound.intensity, intensityHighest);

                // if peak is too BIG stop:
                // CONSIDER do this at end, use fn of signal v. expected?
                // don't like hard cut-offs
                if (Pi > 2 && peakFound.intensity != intensityHighest &&
                        peakFound.intensity > intensityLast * 1.33 + f.median)
                    break;

                // fine-tune anchor MZ
                if (peakFound.intensity > intensityHighest/2)
                {
                    distSum += dist;
                    distCount++;
                }

                for (int s = pLastFound + 1; s < p; s++)
                {
                    Spectrum.Peak skipped = peaks[s];
                    if (skipped.intensity > intensityLast / 2 + f.median)
                        skippedPeak = true;
                }

                // record it
                peptidePeaks[Pi] = peakFound;
                pLastFound = p;
                intensityLast = peakFound.intensity;
            }
        }

        // Adjust MZ for feature
        float mean=0;
        float weight=0;
        for (int i=0 ; i<peptidePeaks.length ; i++)
        {
            //float mi = m0 + i * invCharge;
            if (null == peptidePeaks[i]) continue;
            mean += (peptidePeaks[i].mz - i*invCharge) * peptidePeaks[i].intensity;
            weight += peptidePeaks[i].intensity;
        }
        f.mz = mean/weight;

        //
        // Calculate quality measures
        //

        float[] signal = new float[6];
        int countPeaks = 0;
        for (int i = 0; i < peptidePeaks.length; i++)
        {
            Spectrum.Peak peak = peptidePeaks[i];
            if (null != peak && (peak.intensity > intensityHighest/50 &&
                    peak.intensity > 2 * f.median))
                countPeaks++;
            if (i < signal.length)
            {
                float v = null == peak ? 0 : peptidePeaks[i].intensity;
                signal[i] = Math.max(0.1F, v);
                sum += signal[i];
            }
        }
        for (int i = 0; i < signal.length; i++)
            signal[i] /= sum;

        f.kl = Spectrum.KLPoissonDistance(mass, signal);
        //TODO: Do we need another score?
        //f.score = f.kl >= 0 ? f.intensity * (float)-Math.log(f.kl + Math.exp(-1)) : 0F;
        f.setPeaks(countPeaks);
        f.skippedPeaks = skippedPeak;
        f.comprised = peptidePeaks;
        f.mzPeak0 = peptidePeaks[0].mz;

        //
        // What other metrics would be useful
        //

        // some sort of sumSquares distance...
        // NOT TESTED YET, EXPERIMENTAL
        int lastPeak = 0;
        for (int i=0 ; i<signal.length ; i++)
            if (null != peptidePeaks[i])
                lastPeak = i;
        float sumDist = 0;
        for (int i=0 ; i<=lastPeak ; i++)
        {
            float dist = 1; // what should I use here....
            if (null != peptidePeaks[i])
            {
                float distMZ = Math.abs((f.mz + i*invCharge) - peptidePeaks[i].mz);
                distMZ = Math.max(0, distMZ-1/(2* SpectrumResampler.getResampleFrequency()));
                float distIn = Math.abs(signal[i] - expected[i]);
                // need to pick units to scale by, 0 means good 1 is pretty bad...
                distMZ *= 6; // 1/6 of a Da is bad
                distIn *= 4; // 1/4 in scaled intesity is bad
                dist = distMZ * distMZ + distIn * distIn;
                sumDist += dist * expected[i];
            }
        }
        f.dist = sumDist;
    }


    static double distanceNearestFraction(double x, double denom)
    {
        // scale so we can do distance to nearest integer
        double t = x * denom;
        double delta = Math.abs(t - Math.round(t));
        return delta / denom;
    }

    protected static void _logDebug(String s)
    {
        _log.debug(s);
        //System.err.println(s);
    }
}