/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.feature.extraction; import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.viewer.feature.ExtractMaxima2D; import org.fhcrc.cpl.viewer.feature.extraction.PeakExtractor; import org.fhcrc.cpl.viewer.feature.extraction.SmootherCreator; import org.fhcrc.cpl.toolbox.datastructure.Tree2D; import org.fhcrc.cpl.toolbox.datastructure.FloatRange; import org.fhcrc.cpl.toolbox.datastructure.Pair; import org.fhcrc.cpl.toolbox.proteomics.Scan; import org.apache.log4j.Logger; import java.util.*; /** * smooth, ExtractMaxima2D.analyze() * Sort by decreasing intensity * Throw out peaks with < 5 scans * Break up what look like double-humped elution profiles by breaking peaks * at valleys in the profile * For each peak: * -Compute the extents * -Walking down scans, make sure that the peak's intensity * is greater than the intensities of the two m/z's around it * -Make sure that intensity is above threshold * -If you start going back up in intensity, stop * -Same for end scan * -If too short (<5 scans), toss it out * -"Integrate" to determine total intensity: for each scan, add a * block of intensity corresponding to the intensity on that scan * multiplied by half the amount of time between the adjoining scans * -Create a feature to represent the individual peak * Exclude all features initially * Filter features: for each feature: * find all neighbors within 5 scans and 1.1 m/z units * if the m/z's are too close (within 1.5 / _resamplingFrequency, for some reason), no dice * make sure they're lined up, scan-wise * If not enough scans combined in the two features, no dice * If intensities not high enough, no dice * If we got here, unexclude both */ public class WaveletPeakExtractor implements PeakExtractor { protected static Logger _log = Logger.getLogger(WaveletPeakExtractor.class); protected boolean _peakRidgeWalkSmoothed = DEFAULT_PEAK_RIDGE_WALK_SMOOTHED; public static final float DEFAULT_THRESHOLD_OFFSET_WAVELET = 1.0f; public static final float DEFAULT_THRESHOLD_OFFSET_SMOOTHED = 0f; public static final int DEFAULT_PEAK_LENGTH_REQUIREMENT = 5; protected float minRidgeProportionOfMax = .05f; //This constant and variable control which level of the wavelet is used. 3, the default, works well for //a resampling frequency of 36. public static final int DEFAULT_WAVELET_LEVEL = 3; protected int waveletLevel = DEFAULT_WAVELET_LEVEL; //minimum number of scans protected int minPeakScans = DEFAULT_PEAK_LENGTH_REQUIREMENT; public Feature[] extractPeakFeatures(Scan[] scans, float[][] spectra, FloatRange mzRange) throws InterruptedException { // debug counters int d_rawPeaks = 0; int d_tooShort = 0; int d_filtered = 0; Thread currentThread = Thread.currentThread(); int numSpectra = spectra.length; int spectrumHeight = spectra[0].length; // // separate background and signal components // we're pretty conservative about what we call background, // we can use background and/or median later to filter peaks // further. // BackgroundRemover backgroundRemover = new BackgroundRemover(); float[][] background = backgroundRemover.removeBackground(spectra); float[][] median = backgroundRemover.calculateMedian(spectra); float[][] wavelets = extractWavelets(spectra); //smooth spectra float[][] smoothedSpectra = new float[wavelets.length][]; for (int s = 0; s < wavelets.length; s++) smoothedSpectra[s] = wavelets[s].clone(); SmootherCreator.getThresholdSmoother().smooth(smoothedSpectra); // // extract all maxima from a time-smoothed version of the wavelet transformed data // we have a lot of versions of the data now, but it's helpful to keep it all around // Spectrum.Peak[] rawPeaks = ExtractMaxima2D.analyze(smoothedSpectra, mzRange.min, SpectrumResampler.getResampleInterval(), null, 0.0F); int numRawPeaks = rawPeaks.length; // ???? if no raw peaks to process, bail with empty collection to avoid out // of bounds errors if ( 0 == numRawPeaks ) return new Feature[0]; Arrays.sort(rawPeaks, Spectrum.comparePeakIntensityDesc); if (_log.isDebugEnabled()) { _log.debug("raw peaks: " + rawPeaks.length); if (rawPeaks.length > 0) for (int i = 0; i < 10; i++) _log.debug(" " + (i) + ": " + rawPeaks[rawPeaks.length * i / 100].getIntensity()); if (rawPeaks.length > 0) for (int i = 1; i < 10; i++) _log.debug(" " + (i * 10) + ": " + rawPeaks[rawPeaks.length * i / 10].getIntensity()); _log.debug(" 100: " + rawPeaks[rawPeaks.length - 1].getIntensity()); } // // We probably have a LOT of peaks at this point. 95% are extremely small. // The size<shortPeak check in this loop will throw out most, and I // don't need to be too aggressive here // // NOTE: this loop also tries to handle "double humped elution profiles" // by breaking peaks at "valleys" in the elution profile. This could perhaps // be improved upon with a second pass that specifically handles this case and // makes sure that in a region the splitting is consistent. // float[][] ridge; float thresholdOffset; if (_peakRidgeWalkSmoothed) { // Use smoothed peaks for ridge-walking ridge = smoothedSpectra; thresholdOffset = DEFAULT_THRESHOLD_OFFSET_SMOOTHED; } else { // Use wavelet peaks for ridge-walking ridge = wavelets; thresholdOffset = DEFAULT_THRESHOLD_OFFSET_WAVELET; } ArrayList<Feature> waveletPeaks = new ArrayList<Feature>(); for (int i = 0, max = rawPeaks.length; i < max; i++) { Spectrum.Peak peak = rawPeaks[i]; int imz = Math.round((peak.mz - mzRange.min) * SpectrumResampler.getResampleFrequency()); if (imz < 2 || imz >= spectrumHeight - 2) continue; int scan = peak.scan; float peakIn = spectra[peak.scan][imz]; float peakBg = background[peak.scan][imz]; float peakMd = median[peak.scan][imz]; // compute extent of this feature // this is complicated by the fact that we may have // double-humped peaks we need to separate int scanStart = scan; int scanEnd = scan; //todo: everything in the thresholdIn calculation is questionable and performs rather badly for metabolite data. //It's quite easy to get multiple features that overlap, and multiple features for the same m/z that //don't really seem to be separated by a valley. float thresholdIn = thresholdOffset + 0.5F * background[scan][imz] + 2 * median[scan][imz]; thresholdIn = Math.max(peakIn * minRidgeProportionOfMax, thresholdIn); float minIn = Float.MAX_VALUE; int scanMinIn = scan; for (; scanStart > 0; scanStart--) { float spectraIn = spectra[scanStart][imz]; // are we still on the ridge and > 5% of max if (spectraIn < thresholdIn || ridge[scanStart][imz] < ridge[scanStart][imz - 2] || ridge[scanStart][imz] < ridge[scanStart][imz + 2]) { scanStart++; break; } // have we crossed a 'valley' float smoothIn = smoothedSpectra[scanStart][imz]; if (smoothIn < minIn) { minIn = spectraIn; scanMinIn = scanStart; } else if (smoothIn > minIn * 1.1 + thresholdIn) // made up cutoff { scanStart = scanMinIn; break; } } minIn = Float.MAX_VALUE; scanMinIn = scan; for (; scanEnd < numSpectra - 1; scanEnd++) { float spectraIn = spectra[scanEnd][imz]; // are we still on the ridge and > 5% of max if (spectraIn < thresholdIn || ridge[scanEnd][imz] < ridge[scanEnd][imz - 2] || ridge[scanEnd][imz] < ridge[scanEnd][imz + 2]) { scanEnd--; break; } // have we crossed a 'valley' float smoothIn = smoothedSpectra[scanEnd][imz]; if (smoothIn < minIn) { minIn = spectraIn; scanMinIn = scanEnd; } else if (smoothIn > minIn * 1.1 + thresholdIn) // made up cutoff { scanEnd = scanMinIn; break; } } if (scanEnd - scanStart + 1 < minPeakScans) { d_tooShort++; continue; } // total intensity calculation // "integrate" intensity over time float totalIn = 0; for (int s = scanStart; s <= scanEnd; s++) { double in = spectra[s][imz]; double time = 1.0; if (s > 0 && s + 1 < scans.length) time = (scans[s + 1].getDoubleRetentionTime() - scans[s - 1].getDoubleRetentionTime()) / 2; totalIn += time * in; } Feature f = new Feature(peak.scan, scanStart, scanEnd, peak.mz, peakIn, 0, -1, 0); f.background = peakBg; f.median = peakMd; f.totalIntensity = totalIn; waveletPeaks.add(f); } if (currentThread.isInterrupted()) throw new InterruptedException(); Collections.sort(waveletPeaks, Spectrum.compareFeatureLengthDesc); // some stats if (_log.isDebugEnabled()) { _log.debug("FEATURE LENGTHS " + waveletPeaks.size() + " peaks"); if (waveletPeaks.size() > 0) { for (int i = 0; i < 10; i++) _log.debug(" " + (i) + ": " + rawPeaks[waveletPeaks.size() * i / 100].getIntensity()); } if (waveletPeaks.size() > 0) { for (int i = 1; i < 10; i++) _log.debug(" " + (i * 10) + ": " + rawPeaks[waveletPeaks.size() * i / 10].getIntensity()); } } // OK, up the bar for a feature (min length of at largest peak in the feature) minPeakScans++; _log.debug("SHORT PEAK " + minPeakScans); // // Filtering // // Sticking with the idea of persistance, eliminate peaks // that don't seem to have correllated peaks // Tree2D tree = new Tree2D(); for (Feature f : waveletPeaks) { f.excluded = true; tree.add(f.scan, f.mz, f); } Collections.sort(waveletPeaks, Spectrum.compareFeatureLengthDesc); //QUESTION: so only features with other related features get included... so how do we //end up with any single-peak features? for (Feature f : waveletPeaks) { int lenF = f.getScanLast() - f.getScanFirst() + 1; ArrayList<Feature> l = (ArrayList<Feature>) tree.getPoints(f.scan - 5, f.mz - 1.1F, f.scan + 5, f.mz + 1.1F); //too much detail // _log.debug("Evaluating feature with " + l.size() + " neighbors: " + f); for (Feature neighbor : l) { int lenN = neighbor.getScanLast() - neighbor.getScanFirst() + 1; // we want features of different masses //QUESTION: what is the significance of 1.5? if (Math.abs(neighbor.mz - f.mz) < 1.5 / SpectrumResampler.getResampleFrequency()) { _log.debug("\tneighbor too close in m/z (" + Math.abs(neighbor.mz - f.mz) + ")"); continue; } // are centers aligned int s = Math.max(f.getScanFirst(), neighbor.getScanFirst()); int e = Math.min(f.getScanLast(), neighbor.getScanLast()); if (f.scan < s || f.scan > e || neighbor.scan < s || neighbor.scan > e) { //too much detail // _log.debug("\tneighbor scans not aligned"); continue; } // if neither is chosen yet, add a small hurdle if (f.excluded && neighbor.excluded) { //QUESTION: why checking the total number of scans in both? if (lenF + lenN < minPeakScans) { //too much detail // _log.debug("\tneighbor combined scans too short"); continue; } if (f.intensity < 1 + 0.5 * f.background + 3 * Math.max(1, f.median) && neighbor.intensity < 1 + 0.5 * f.background + 3 * Math.max(1, neighbor.median)) { //too much detail // _log.debug("\tneighbor too low-intensity"); continue; } } f.excluded = false; neighbor.excluded = false; //too much detail // _log.debug("\tSuccess!"); } } ArrayList<Feature> copy = new ArrayList<Feature>(); for (Feature f : waveletPeaks) { if (!f.excluded) { copy.add(f); //too much detail // _log.debug("Final list: adding: " + f); } else d_filtered++; } waveletPeaks = copy; Feature[] result = waveletPeaks.toArray(new Feature[waveletPeaks.size()]); return result; } /** * * the D3 matrix is used by default to locate peaks, * the D3 processing effectively sharpens the raw data, this does a great job of cleaning * up messy data, e.g. wide/overlapping peaks, and stray peaks in centroided data * * why D3? This is based on our default resampling frequency 1Da/36 and our max expected charge * which I'm presuming to be 6. Level three represents a feature width of 2^3=8, which * is near to 36/6. A big change in resampling or desired max charge, might require a * change. * * dhmay changing 20100803 to use waveletLevel rather than always using 3 * @param spectra * @return */ public float[][] extractWavelets(float[][] spectra) { int numSpectra = spectra.length; float[][] wavelets = new float[numSpectra][]; for (int s = 0; s < spectra.length; s++) wavelets[s] = Spectrum.WaveletDX(spectra[s], null, waveletLevel); return wavelets; } /** * Assign the extra necessary info to a set of features representing peaks * so that they can be written to a file * @param peakFeatures * @param scans */ public void prettyPeakFeaturesForOutput(Collection<Feature> peakFeatures, Scan[] scans) { for (Feature peak : peakFeatures) { // fix up scan Scan scan = scans[peak.scan]; peak.setTime((float)scan.getDoubleRetentionTime()); peak.scan = scan.getNum(); peak.scanFirst = scans[peak.scanFirst].getNum(); peak.scanLast = scans[peak.scanLast].getNum(); peak.setPeaks(1); // otherwise these will get filtered by default } } public void setShortPeak(int minPeakScans) { this.minPeakScans = minPeakScans; } public int getShortPeak() { return minPeakScans; } public boolean isPeakRidgeWalkSmoothed() { return _peakRidgeWalkSmoothed; } public void setPeakRidgeWalkSmoothed(boolean peakRidgeWalkSmoothed) { _peakRidgeWalkSmoothed = peakRidgeWalkSmoothed; } public int getWaveletLevel() { return waveletLevel; } public void setWaveletLevel(int waveletLevel) { this.waveletLevel = waveletLevel; } }