/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.feature; import org.apache.log4j.Logger; import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.viewer.feature.FeatureExtractor; import org.fhcrc.cpl.viewer.feature.extraction.SpectrumResampler; import org.fhcrc.cpl.toolbox.proteomics.MSRun; import org.fhcrc.cpl.toolbox.datastructure.Tree2D; import org.fhcrc.cpl.toolbox.proteomics.Scan; import org.fhcrc.cpl.toolbox.datastructure.FloatRange; import java.util.*; /** * User: mbellew * Date: Sep 7, 2004 * Time: 12:10:06 PM */ public class FeatureStrategyUsingWindow extends FeatureExtractor { static Logger _log = Logger.getLogger(FeatureStrategyUsingWindow.class); static Feature[] _emptyFeatureArray = new Feature[0]; Scan[] _scans; protected double _noise = 1.0; static double debugMZ = 0; public FeatureStrategyUsingWindow(MSRun run, int scan, int count, int maxCharge, FloatRange range, double sn) { super(run, scan, count, maxCharge, range, sn); _scans = getScans(run, scan, count); } public Feature[] _analyze() throws InterruptedException { return analyzeScanAtATime(_scans); } protected float[] _smooth(float[] s) { return Spectrum.FFTsmooth(s, 6.0, false); } protected Collection<Feature> analyze1D(Scan scan) { Spectrum.Peak[] peaks = _pickPeaks(scan); return ExtractPeptideFeatures(_run, peaks);//, scan); } protected Spectrum.Peak[] _pickPeaks(Scan scan) { float[][] rawSpectrum = scan.getSpectrum(); float[][] spectrum = Spectrum.ResampleSpectrum(rawSpectrum, _mzRange, SpectrumResampler.getResampleFrequency(), false); spectrum[1] = _smooth(spectrum[1]); // not sure if this is technically noise, we've already smoothed // and perhaps removed background, // but it's what we consider a small peak _noise = Spectrum.Noise(spectrum[1], 0, spectrum[0].length); _noise = Math.max(_noise, 0.1); // // Generate Peaks // Spectrum.Peak[] peaks = Spectrum.PickPeaks(spectrum, _noise); for (Spectrum.Peak peak : peaks) peak.scan = scan.getNum(); return peaks; } /** * For each single-peak feature: * Consider all features in a small window around it * If multiple have the same m/z, take the one with the lower scan, discard the others * Toss out anything < 1/10 the intensity of the highest-intense peak * Consider all possible charge states: * For each peak, determine the mass distance from the highest-intense peak * If within tolerance, create a new multi-peak feature, make sure it contains * the highest-intensity peak, score it (a complicated process), * add it to a list of candidates * Pick the best charge state and create a feature * * @param run * @param peaksIN * @return */ protected ArrayList<Feature> ExtractPeptideFeatures(MSRun run, Spectrum.Peak[] peaksIN) { _logDebug("ExtractPeptideFeatures(" + peaksIN.length + ")"); // // store peaks in 2-D tree for searching // Tree2D peaks2D = new Tree2D(); for (Spectrum.Peak p : peaksIN) { peaks2D.add(p.scan, p.mz, p); } // OUTPUT list ArrayList<Feature> featureList = new ArrayList<Feature>(); Spectrum.Peak[] peaksByIntensity = peaksIN.clone(); Arrays.sort(peaksByIntensity, Spectrum.comparePeakIntensityDesc); // // UNDONE: PROBLEM WITH VERY LARGE 1+ FEATURES // in addition to all the other problems, we've observed a strange // phenomenon with very large peaks in 1+ charge features. There may be an "echo" // of the feature at about .5-.7 daltons after the first, and sometimes second peak. // Thie echo can be very large (as large as the second "real" peak). Moreover, the // echo seems to shift or compress the previous peak downward in MZ. If this effect // is large enough it can break the corret identification of the main feature. // // There's probably a name for this phenomenon, but I don't know what it is. // // TODO: ignore the "echo" feature // TODO: correct for the shift in the main peaks // _logDebug("ExtractPeptideFeatures: using noise = " + _noise); for (Spectrum.Peak peakHighest : peaksByIntensity) { if (peakHighest.excluded) continue; boolean debugPeak = false; if (debugMZ != 0) { // note the 100-140 check will put the feature in the middle of the detail pane if(Math.abs(peakHighest.mz - debugMZ) < 1 && peakHighest.scan > 100 && peakHighest.scan < 140) debugPeak = true; } float mzWindowStart = peakHighest.mz - 2.1F; float mzWindowEnd = peakHighest.mz + 6.1F; float scanStart = peakHighest.scan - 9; float scanEnd = peakHighest.scan + 9; // get collection of nearby peaks Spectrum.Peak[] peaks; { ArrayList<Spectrum.Peak> peakList = peaks2D.getPoints(scanStart, mzWindowStart, scanEnd, mzWindowEnd); for (int p = peakList.size()-1; p>=0 ; p--) { if ((peakList.get(p)).excluded) peakList.remove(p); } // eliminate possible duplicates and get one list of peaks sorted by mz Spectrum.Peak[] peaksNear = peakList.toArray(new Spectrum.Peak[peakList.size()]); assert peaksNear.length > 0; Arrays.sort(peaksNear, Spectrum.comparePeakIntensityDesc); Arrays.sort(peaksNear, Spectrum.comparePeakMzAsc); int lenPeaks = 0; for (int s = 1; s < peaksNear.length; s++) { if (peaksNear[lenPeaks].mz == peaksNear[s].mz) { if (Math.abs(peaksNear[s].scan-peakHighest.scan) < Math.abs(peaksNear[lenPeaks].scan-peakHighest.scan)) peaksNear[lenPeaks] = peaksNear[s]; } else { peaksNear[++lenPeaks] = peaksNear[s]; } } ++lenPeaks; peaks = new Spectrum.Peak[lenPeaks]; System.arraycopy(peaksNear, 0, peaks, 0, lenPeaks); } if (debugPeak) { _logDebug("\n\npeaks"); for (Spectrum.Peak peak : peaks) { _logDebug((peak == peakHighest ? " *" : " ") + peak.toString() + "\t" + (peak.mz - peakHighest.mz)); } _logDebug("scored features"); } // // generate candidate features // ArrayList<Feature> candidates = new ArrayList<Feature>(); //dhmay -- I don't really understand this parameter, why it has this exact value double delta = 1.0 / (SpectrumResampler.getResampleFrequency() - 1); for (Spectrum.Peak p : peaks) { if (p.mz > peakHighest.mz) break; if (p.excluded) continue; // even at high end of mass range (6400amu) leading peak is not expected // to be smaller than about 1/6 highest peak, and typically much larger if (p.intensity < peakHighest.intensity / 10.0F) continue; double distance = peakHighest.mz - p.mz; for (int charge = _maxCharge; charge >= 1; charge--) { double r = distanceNearestFraction(distance, charge); if (r < 2 * delta) { Feature f = newFeatureRange(peakHighest, p.mz, charge); ScoreFeature(f, peaks); if (f.peaks == 1) continue; boolean containsPeakHighest = false; for (Spectrum.Peak comprisedPeak : f.comprised) { if (comprisedPeak == peakHighest) { containsPeakHighest = true; break; } } if (!containsPeakHighest) continue; candidates.add(f); } } } Feature bestFeature = BestFeature(candidates); if (bestFeature == null) { // 0+ bestFeature = newFeatureRange(peakHighest, peakHighest.mz, 0); bestFeature.comprised = new Spectrum.Peak[] {peakHighest}; } // CONSIDER: use expected largest peak instead of largest actual peak if (peakHighest instanceof Feature) bestFeature.totalIntensity = ((Feature)peakHighest).totalIntensity; // // CONSIDER: back check against found features // could this be part of one of those?? // // // UNDONE: quality tests, for now we output everything for evaluation // QualityTest: { if (bestFeature.getPeaks() == 1) { bestFeature.setCharge(0); bestFeature.setKl(-1); } // Now that charge is set, compute mass bestFeature.updateMass(); if (debugPeak) { _logDebug("bestFeature"); _logDebug(" " + bestFeature.toString()); } // Got a good one // // fix ups // if (peakHighest instanceof Feature && 0 != ((Feature)peakHighest).getTime()) bestFeature.setTime(((Feature)peakHighest).getTime()); else { try { // sometimes we're not handed real scan numbers so catch() int index = run.getIndexForScanNum(peakHighest.scan); if (index < 0) index = -(index+1); bestFeature.setTime((float)run.getScan(index).getDoubleRetentionTime()); } catch (Throwable t) { } } featureList.add(bestFeature); } // // exclude all peaks explained by bestFeature (even if we don't output it) // if (debugPeak) _logDebug("exclude peaks"); for (Spectrum.Peak p : bestFeature.comprised) { if (null != p) { p.excluded = true; if (debugPeak) _logDebug(" " + p.toString()); } } assert peakHighest.excluded; } return featureList; } private Feature newFeatureRange(Spectrum.Peak peakHighest, float mz, int charge) { Feature f; if (peakHighest instanceof Feature) f = new Feature((Feature)peakHighest); else f = new Feature(peakHighest); f.charge = charge; f.mz = mz; return f; } /** * I used to do this pairwise, but had problem with weird * non-trasitive comparison heurisitics. * * This could undoubtably be improved, maybe with a combined * scoring function fn(charge, peaks, kl)? or new binary * feature comparision BetterFeature(a,b) * * @param candidates * @return */ static Feature BestFeature(ArrayList<Feature> candidates) { if (candidates.size() == 0) return null; // // Based on very little obvservation, it seems that the distribution // of KL scores tends to be bimodal. There are the terrible scores // for guess that are hopelessly wrong, and decent scores for features // with the right charge (or factor of the right charge) // // // Toss the bad scores. .5 is arbitrary (and maybe too big). // It's hard to choose dynamically as the input set is often small // double min = candidates.get(0).kl; double max = min; for (Feature candidate : candidates) { min = Math.min(candidate.kl, min); max= Math.max(candidate.kl, max); } for (int i=candidates.size()-1 ; i>=0 ; i--) { Feature candidate = candidates.get(i); if (candidate.kl > min + 0.5) candidates.remove(i); } if (candidates.size() == 1) return candidates.get(0); // // other heuristics given a set of "not awful" features // Set<Feature> prunedSet = new HashSet<Feature>(candidates); // NOTE we rely on the candidate list being sorted by MZ ASC,CHARGE DESC, // see ExtractPeptideFeatures for (int i=0 ; i<candidates.size()-1 ; i++) { Feature a = candidates.get(i); for (int j=i+1 ; j<candidates.size() ; j++) { Feature b = candidates.get(j); assert a.mzPeak0 < b.mzPeak0 || a.charge > b.charge; // see note above // if a "contains" b, remove b // CONSIDER: kl difference limiter (relying on filter above for now) if (a.charge % b.charge == 0 && a.ContainsPeak(b.comprised[0])) { // if the test above is true, this test should be fairly redundant if (a.peaks > b.peaks || a.peaks >= 6) prunedSet.remove(b); } } } if (prunedSet.size() == 1) return prunedSet.iterator().next(); // // we've still got more than one candidate! // probably can't explain all relatively big peaks with just one feature... // // Here's where we need a good secondary score function // How do we weight peaks, kl, sum(intensity) // in practice it's usually an easy choice (the peaks penalty only rarely changes result) Feature[] finalists = prunedSet.toArray(_emptyFeatureArray); Arrays.sort(finalists, new Comparator<Feature>() { double _score(Feature f) { return f.peaks * 0.1 - f.kl; } public int compare(Feature o1, Feature o2) { double score1 = _score(o1); double score2 = _score(o2); return score1 < score2 ? 1 : score1 > score2 ? -1 : 0; // sort DESC by score } }); for (int i=0 ; i<finalists.length-1 ; i++) finalists[i].next = finalists[i+1]; return finalists[0]; } static int _closest(Spectrum.Peak[] peaks, float x, int start) { float dist = Math.abs(x - peaks[start].mz); int i = start + 1; for (; i < peaks.length; i++) { float d = Math.abs(x - peaks[i].mz); if (d > dist) break; dist = d; } return i - 1; } /** * score a candidate feature f * <p/> * Expecting charge=0 features and peaks * * TODO: document this better * * @param f * @param peaks */ public void ScoreFeature(Feature f, Spectrum.Peak[] peaks) { int charge = f.charge; float invCharge = 1.0F / charge; float mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * charge; float[] expected = Spectrum.Poisson(mass); Spectrum.Peak[] peptidePeaks = new Spectrum.Peak[10]; // find start position in peak list int p = Arrays.binarySearch(peaks, f, Spectrum.comparePeakMzAsc); if (p < 0) p = -(p + 1); p = Math.max(0, p - 1); int pLastFound = p; float mzP0 = f.mz; float sum = 0.0F; float intensityLast = 0.0F; float intensityHighest = 0.0F; boolean skippedPeak = false; float distSum = 0; float distCount = 0; //dhmay -- I don't really understand this parameter, why it has this exact value double delta = 1.0 / (SpectrumResampler.getResampleFrequency() - 1); for (int Pi = 0; Pi < peptidePeaks.length; Pi++) { float mzPi = mzP0 + Pi * invCharge + (distCount > 0 ? distSum/distCount : 0); p = _closest(peaks, mzPi, p); Spectrum.Peak peakFound = peaks[p]; float dist = Math.abs(peakFound.mz - mzPi); boolean found = !peakFound.excluded && Math.abs(dist) < 2.5 * delta; if (!found) { // miss two in a row, call it quits if (Pi > 0 && null == peptidePeaks[Pi-1]) break; } else { intensityHighest = Math.max(peakFound.intensity, intensityHighest); // if peak is too BIG stop: // CONSIDER do this at end, use fn of signal v. expected? // don't like hard cut-offs if (Pi > 2 && peakFound.intensity != intensityHighest && peakFound.intensity > intensityLast * 1.33 + f.median) break; // fine-tune anchor MZ if (peakFound.intensity > intensityHighest/2) { distSum += dist; distCount++; } for (int s = pLastFound + 1; s < p; s++) { Spectrum.Peak skipped = peaks[s]; if (skipped.intensity > intensityLast / 2 + f.median) skippedPeak = true; } // record it peptidePeaks[Pi] = peakFound; pLastFound = p; intensityLast = peakFound.intensity; } } // Adjust MZ for feature float mean=0; float weight=0; for (int i=0 ; i<peptidePeaks.length ; i++) { //float mi = m0 + i * invCharge; if (null == peptidePeaks[i]) continue; mean += (peptidePeaks[i].mz - i*invCharge) * peptidePeaks[i].intensity; weight += peptidePeaks[i].intensity; } f.mz = mean/weight; // // Calculate quality measures // float[] signal = new float[6]; int countPeaks = 0; for (int i = 0; i < peptidePeaks.length; i++) { Spectrum.Peak peak = peptidePeaks[i]; if (null != peak && (peak.intensity > intensityHighest/50 && peak.intensity > 2 * f.median)) countPeaks++; if (i < signal.length) { float v = null == peak ? 0 : peptidePeaks[i].intensity; signal[i] = Math.max(0.1F, v); sum += signal[i]; } } for (int i = 0; i < signal.length; i++) signal[i] /= sum; f.kl = Spectrum.KLPoissonDistance(mass, signal); //TODO: Do we need another score? //f.score = f.kl >= 0 ? f.intensity * (float)-Math.log(f.kl + Math.exp(-1)) : 0F; f.setPeaks(countPeaks); f.skippedPeaks = skippedPeak; f.comprised = peptidePeaks; f.mzPeak0 = peptidePeaks[0].mz; // // What other metrics would be useful // // some sort of sumSquares distance... // NOT TESTED YET, EXPERIMENTAL int lastPeak = 0; for (int i=0 ; i<signal.length ; i++) if (null != peptidePeaks[i]) lastPeak = i; float sumDist = 0; for (int i=0 ; i<=lastPeak ; i++) { float dist = 1; // what should I use here.... if (null != peptidePeaks[i]) { float distMZ = Math.abs((f.mz + i*invCharge) - peptidePeaks[i].mz); distMZ = Math.max(0, distMZ-1/(2* SpectrumResampler.getResampleFrequency())); float distIn = Math.abs(signal[i] - expected[i]); // need to pick units to scale by, 0 means good 1 is pretty bad... distMZ *= 6; // 1/6 of a Da is bad distIn *= 4; // 1/4 in scaled intesity is bad dist = distMZ * distMZ + distIn * distIn; sumDist += dist * expected[i]; } } f.dist = sumDist; } static double distanceNearestFraction(double x, double denom) { // scale so we can do distance to nearest integer double t = x * denom; double delta = Math.abs(t - Math.round(t)); return delta / denom; } protected static void _logDebug(String s) { _log.debug(s); //System.err.println(s); } }