/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.feature.extraction; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum; import org.fhcrc.cpl.toolbox.proteomics.MSRun; import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram; import org.fhcrc.cpl.toolbox.datastructure.Tree2D; import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities; import org.fhcrc.cpl.viewer.quant.PeakOverlapCorrection; import org.apache.log4j.Logger; import java.util.*; /** * class for creating Features from Peaks, specific to small-molecule analysis, e.g., metabolites. * * Differs from FeatureStrategyPeakClusters in that: * -hardcoded maximum charge of 2, maximum m/z range 3 Thompsons * -most-intense peak is assumed to be monoisotope * -"k/l" calculation is actually a simple comparison of the log-ratio of the first two peaks with a theoretical * mass-logratio relationship derived from analyzing the HMDB databases * -Determination of best feature is purely a decision between charge states. Based on number of * peaks (more is better), then KL * -hard filter on candidate features if any peak intensity is more than MAX_ANY_FIRST_PEAK_PROPORTION of first * -Single peak features allowed, assume charge 1, given KL -1 * -No missing peaks allowed. comprised array has no null values * */ public class SmallMoleculePeakCombiner extends BasePeakCombiner { private static Logger _log = Logger.getLogger(SmallMoleculePeakCombiner.class); public static final double DEFAULT_MAX_ABS_DISTANCE_BETWEEN_PEAKS = 1.0; //default is positively charged ions protected boolean negativeChargeMode = false; protected double _maxAbsDistanceBetweenPeaks = DEFAULT_MAX_ABS_DISTANCE_BETWEEN_PEAKS; //This should be higher if considering chlorine-containing ions public static final float MAX_ANY_FIRST_PEAK_PROPORTION = 0.5f; //maximum distance between peaks to be considered tied together protected double _maxResampledDistanceBetweenFeatures = DEFAULT_MAX_ABS_DISTANCE_BETWEEN_PEAKS / (SpectrumResampler.getResampleFrequency() - 1); protected boolean keepStatistics = false; //5 peaks possible with chlorine boosting the 3rd & beyond, but there are so few chlorine-containing peaks //it's too damaging to consider them protected int maxPeaks = 4; //maximum distance in scans from the scan range of the base peak to the apex scan of any other peak protected int maxScanRangeDist = 1; //maximum distance from apex scan of first peak to apex scan of any other peak protected int maxApexScanDist = 6; //coefficients for the equation relating metabolite mass to the log-ratio of peak 1 intensity / peak 2 intensity //Empirically derived public static final double[] MASS_2PEAK_LOGRATIO_COEFFS = new double[] { 4.260321140289307, -0.015644630417227745, 3.08442504319828E-5, -2.9751459962312765E-8, 1.0629329881550742E-11 }; //Standard deviation of error around regression above, from HMDB database. Empirically derived public static final double MASS_2PEAK_LOGRATIO_ERROR_SD = 0.40649617; // protected List<Integer> allCandidatePeakCounts = new ArrayList<Integer>(); //track the number of candidates evaluated for each feature public List<Float> numCandidatesList = new ArrayList<Float>(); // public List<Float> klScoresList = new ArrayList<Float>(); public SmallMoleculePeakCombiner() { _maxCharge = 2; } /** * For each single-peak feature: * Consider all features in a small window around it * If multiple have the same m/z, take the one with the lower scan, discard the others * Toss out anything < 1/10 the intensity of the highest-intense peak * Consider all possible charge states: * For each peak, determine the mass distance from the highest-intense peak * If within tolerance, create a new multi-peak feature, make sure it contains * the highest-intensity peak, score it (a complicated process), * add it to a list of candidates * Pick the best charge state and create a feature * * @param run * @param peaksIN * @return */ public Feature[] createFeaturesFromPeaks(MSRun run, Spectrum.Peak[] peaksIN) { _maxCharge = 2; _log.debug("ExtractPeptideFeatures(" + peaksIN.length + ")"); //TODO: really should set some parameters in FeatureScorer here (maxDist and resamplingFreq), but not sure if we want to surface those in interface // // store peaks in 2-D tree for searching // Tree2D peaks2D = new Tree2D(); for (Spectrum.Peak p : peaksIN) { peaks2D.add(p.scan, p.mz, p); } // OUTPUT list List<Feature> featureList = new ArrayList<Feature>(); Spectrum.Peak[] peaksByIntensity = peaksIN.clone(); Arrays.sort(peaksByIntensity, Spectrum.comparePeakIntensityDesc); //List<Integer> scanDeviations = new ArrayList<Integer>(); for (Spectrum.Peak peakHighest : peaksByIntensity) { if (peakHighest.excluded) continue; //dhmay I don't know why we get 0-intensity peaks, but we do. Below noise? At any rate, for small //molecules, we don't want 'em if (peakHighest.intensity == 0) { peakHighest.excluded = true; continue; } boolean debugPeak = false; float mzWindowStart = peakHighest.mz - 1.01F; float mzWindowEnd = peakHighest.mz + (maxPeaks-1) + .1F; float scanStart = ((Feature) peakHighest).scanFirst - maxApexScanDist; float scanEnd = peakHighest.scan + maxApexScanDist; // get collection of nearby peaks Spectrum.Peak[] peaks; ArrayList<Spectrum.Peak> peakList = peaks2D.getPoints(scanStart, mzWindowStart, scanEnd, mzWindowEnd); for (int p = peakList.size()-1; p>=0 ; p--) { if ((peakList.get(p)).excluded) peakList.remove(p); } // eliminate possible duplicates and get one list of peaks sorted by mz Spectrum.Peak[] peaksNear = peakList.toArray(new Spectrum.Peak[peakList.size()]); assert peaksNear.length > 0; Arrays.sort(peaksNear, Spectrum.comparePeakIntensityDesc); Arrays.sort(peaksNear, Spectrum.comparePeakMzAsc); int lenPeaks = 0; for (int s = 1; s < peaksNear.length; s++) { if (peaksNear[lenPeaks].mz == peaksNear[s].mz) { if (Math.abs(peaksNear[s].scan-peakHighest.scan) < Math.abs(peaksNear[lenPeaks].scan-peakHighest.scan)) peaksNear[lenPeaks] = peaksNear[s]; } else { peaksNear[++lenPeaks] = peaksNear[s]; } } ++lenPeaks; //lensPeaks.add(lenPeaks); peaks = new Spectrum.Peak[lenPeaks]; System.arraycopy(peaksNear, 0, peaks, 0, lenPeaks); if (debugPeak) { _log.debug("\n\npeaks=" + peaks.length); for (Spectrum.Peak peak : peaks) { _log.debug((peak == peakHighest ? " *" : " ") + peak.toString() + "\t" + (peak.mz - peakHighest.mz)); } _log.debug("scored features"); } // // generate candidate features // ArrayList<Feature> candidates = new ArrayList<Feature>(); for (int absCharge = Math.abs(_maxCharge); absCharge >= 1; absCharge--) { Feature f = newFeatureRange(peakHighest, peakHighest.mz, negativeChargeMode ? -absCharge : absCharge); fleshOutFeature(f, peaks); //if we've only got one peak, forget it -- just use the charge-1 version, no matter how many peaks it has if (absCharge > 1 && f.peaks == 1) continue; //if (absCharge > 1) // System.err.println("charge" + f.charge + ", peaks=" + f.peaks + ", kl=" + f.kl); //for (int i=0; i<f.comprised.length; i++) if (f.comprised[i] == null) System.err.println("NULL!!! " + i + " / " + (f.comprised.length-1)); //dhmay commenting out the requirement that we have 2+ peaks. BIG CHANGE!!! // if (f.peaks == 1) // continue; if (f.comprised[0] == null)// || f.comprised[1] == null) continue; //add no candidates with any peak with intensity more than MAX_ANY_FIRST_PEAK_PROPORTION //times the first peak intensity. //Also, add no candidates with any missing peaks boolean shouldAdd = true; for (int i=1; i<f.comprised.length; i++) { // if (hasMissing && f.comprised[i] != null) // hasPresentAfterMissing = true; if (Math.round((f.comprised[i].getMz() - f.getMz()) * (float) absCharge) != i) { //if (absCharge > 1) System.err.println("fail, ch=" + absCharge + ", " + f.comprised[i].getMz() + ", " + f.getMz() + ", " + i + ", " + (f.comprised[i].getMz() - f.getMz())); shouldAdd = false; break; } if (f.comprised[i].intensity > MAX_ANY_FIRST_PEAK_PROPORTION * f.comprised[0].intensity) { shouldAdd = false; break; } } if (shouldAdd) { //if (absCharge > 1) System.err.println("\tAdding"); candidates.add(f); //allCandidatePeakCounts.add(f.peaks); } } Feature bestFeature = determineBestFeature(candidates); //if (bestFeature != null && bestFeature.comprised.length > 1) System.err.println("MULTI!"); if (bestFeature == null) { // 0+ bestFeature = newFeatureRange(peakHighest, peakHighest.mz, 1); bestFeature.comprised = new Spectrum.Peak[] {peakHighest}; } // CONSIDER: use expected largest peak instead of largest actual peak if (peakHighest instanceof Feature) bestFeature.totalIntensity = ((Feature)peakHighest).totalIntensity; //if (bestFeature.peaks > 1) //{ // int devScans = 0; // int peak2Scan = bestFeature.comprised[1].scan; // if (peak2Scan > bestFeature.getScanLast()) // devScans = peak2Scan - bestFeature.getScanLast(); // if (peak2Scan < bestFeature.getScanFirst()) // devScans = bestFeature.getScanFirst() - peak2Scan; // scanDeviations.add(devScans); //} if (keepStatistics) { numCandidatesList.add((float) candidates.size()); } //boolean wasCharge2 = bestFeature.charge == 2; //if (bestFeature.charge == 2) //{ //System.err.println("charge2, kl=" + bestFeature.kl); //} QualityTest: { if (bestFeature.getPeaks() == 1 ) { bestFeature.setCharge(negativeChargeMode ? -1 : 1); bestFeature.setKl(-1); } // Now that charge is set, compute mass bestFeature.updateMass(); if (debugPeak) { _log.debug("bestFeature"); _log.debug(" " + bestFeature.toString()); } // Got a good one // // fix ups // if (peakHighest instanceof Feature && 0 != ((Feature)peakHighest).getTime()) bestFeature.setTime(((Feature)peakHighest).getTime()); else { try { // sometimes we're not handed real scan numbers so catch() int index = run.getIndexForScanNum(peakHighest.scan); if (index < 0) index = -(index+1); bestFeature.setTime((float)run.getScan(index).getDoubleRetentionTime()); } catch (Throwable t) { } } //if ("1chlorine".equals(bestFeature.getDescription())) // System.err.println("chlor mass=" + bestFeature.mass); annotateFeatureDescription(bestFeature); featureList.add(bestFeature); } // // exclude all peaks explained by bestFeature (even if we don't output it) // if (debugPeak) _log.debug("exclude peaks"); for (Spectrum.Peak p : bestFeature.comprised) { if (null != p) { p.excluded = true; if (debugPeak) _log.debug(" " + p.toString()); } } assert peakHighest.excluded; } //new PanelWithHistogram(scanDeviations, "scan deviations").displayInTab(); //new PanelWithHistogram(allCandidatePeakCounts, "peaks").displayInTab(); //new PanelWithHistogram(lensPeaks, "peaks").displayInTab(); //int numMulti = 0; // List<Float> atts = new ArrayList<Float>(); //for (Feature feature : featureList) //{ // if (feature.comprised.length > 1) {numMulti++; atts.add(feature.kl); System.err.println(feature.kl);} //} //System.err.println("Multis: " + numMulti); //new PanelWithHistogram(atts, "atts").displayInTab(); return featureList.toArray(new Feature[featureList.size()]); } protected void annotateFeatureDescription(Feature feature) { Spectrum.Peak[] peaks = feature.comprised; // if (peaks.length > 2) System.err.println(peaks[2].intensity/peaks[1].intensity); if (peaks.length > 2 && peaks[2].intensity > peaks[1].intensity * 1.2) { // if (peaks[2] > peaks[1]) System.err.println("Better! mass=" + mass + ", peaks: " + peaks[0] + ", " +peaks[1] + ", " + peaks[2]); feature.setDescription("1Cl"); } } public void fleshOutFeature(Feature f, Spectrum.Peak[] peaks) { int absCharge = Math.abs(f.charge); float invAbsCharge = 1.0F / absCharge; float mass = 0f; if (f.charge >= 0) mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * absCharge; else mass = f.mz * absCharge; Spectrum.Peak[] peptidePeaks = new Spectrum.Peak[maxPeaks]; // find start position in peak list int p = Arrays.binarySearch(peaks, f, Spectrum.comparePeakMzAsc); if (p < 0) p = -(p + 1); p = Math.max(0, p - 1); int pLastFound = p; int lastPeptidePeakIndexFound = 0; float mzP0 = f.mz; float sum = 0.0F; float intensityLast = 0.0F; float intensityHighest = 0.0F; boolean skippedPeak = false; float distSum = 0; float distCount = 0; float lastPeakPairScanOffset = 0; for (int Pi = 0; Pi < peptidePeaks.length; Pi++) { float mzPi = mzP0 + Pi * invAbsCharge + (distCount > 0 ? distSum/distCount : 0); p = findClosestPeak(peaks, mzPi, p); Spectrum.Peak peakFound = peaks[p]; float dist = Math.abs(peakFound.mz - mzPi); //QUESTION: why 2.5? boolean found = !peakFound.excluded && Math.abs(dist) < 2.5 * _maxResampledDistanceBetweenFeatures; //dhmay adding check to make sure this new peak is within 1 scan of orig peak boundaries //This is specific to low-mass stuff: first peak will dominate, should elute as long //or longer than second if (Pi > 0) { int scanDist = 0; if (peakFound.scan > f.scanLast) scanDist = peakFound.scan - f.scanLast; if (peakFound.scan < f.scanFirst) scanDist = f.scanFirst - peakFound.scan; if (Math.abs(scanDist) > maxScanRangeDist) found = false; } //dhmay adding check for peak offset. From the third peak onward, do not add a peak if //the absolute difference in scans between it and base peak is greater than the previous difference //by 4 or more /or/ it's greater by 2 or more and double the previous difference if (Pi > 1) { float currentPeakPairScanOffset = Math.abs(peakFound.scan - f.scan); if (currentPeakPairScanOffset >= lastPeakPairScanOffset + 4 || ((currentPeakPairScanOffset >= lastPeakPairScanOffset * 2)) && (currentPeakPairScanOffset >= lastPeakPairScanOffset + 2)) found = false; } if (!found) { //dhmay changing this for SmallMolecule. NO missing peaks allowed // miss one, call it quits if (Pi > 0) break; } else { //if this peak intensity is really really small, don't add it, and stop right here if (Pi > 0 && (peakFound.intensity < intensityHighest/200f || peakFound.intensity < 2 * f.median)) break; // if peak is too BIG stop: // CONSIDER do this at end, use fn of signal v. expected? // don't like hard cut-offs //dhmay: this is fine for small molecules, only checks if peak is second, or fourth & higher if ((Pi == 1 || Pi >= 3) && peakFound.intensity > intensityLast * 1.33 + f.median) break; intensityHighest = Math.max(peakFound.intensity, intensityHighest); // fine-tune anchor MZ if (peakFound.intensity > intensityHighest/2) { distSum += dist; distCount++; } for (int s = pLastFound + 1; s < p; s++) { Spectrum.Peak skipped = peaks[s]; if (skipped.intensity > intensityLast / 2 + f.median) skippedPeak = true; } // record it peptidePeaks[Pi] = peakFound; pLastFound = p; lastPeptidePeakIndexFound = Pi; intensityLast = peakFound.intensity; if (Pi > 0) { lastPeakPairScanOffset = Math.abs(peptidePeaks[Pi].scan - f.scan); } } } if (lastPeptidePeakIndexFound < maxPeaks) { Spectrum.Peak[] peaksNew = new Spectrum.Peak[lastPeptidePeakIndexFound+1]; System.arraycopy(peptidePeaks, 0, peaksNew, 0,lastPeptidePeakIndexFound+1 ); peptidePeaks = peaksNew; //System.err.println(peaksNew.length + " of " + maxPeaks); } //if (peptidePeaks.length == 4) System.err.println(peptidePeaks[0].mz + ", " + peptidePeaks[1].mz + ", " +peptidePeaks[2].mz + ", " +peptidePeaks[3].mz ); // Adjust MZ for feature float mean=0; float weight=0; for (int i=0 ; i<peptidePeaks.length ; i++) { //float mi = m0 + i * invCharge; if (null == peptidePeaks[i]) continue; mean += (peptidePeaks[i].mz - i*invAbsCharge) * peptidePeaks[i].intensity; weight += peptidePeaks[i].intensity; } f.mz = mean/weight; int signalLength = 2; float[] signal = new float[Math.min(signalLength, peptidePeaks.length)]; for (int i = 0; i < signal.length; i++) { signal[i] = peptidePeaks[i].intensity; sum += signal[i]; } for (int i = 0; i < signal.length; i++) signal[i] /= sum; //if (Float.isNaN(signal[0])) System.err.println("***" + peptidePeaks[0].intensity + ", " + sum); //dhmay implementing check for chlorine f.kl = calcKlSmallMolecule(mass, signal); //TODO: Do we need another score? //f.score = f.kl >= 0 ? f.intensity * (float)-Math.log(f.kl + Math.exp(-1)) : 0F; f.setPeaks(peptidePeaks.length); //skippedPeaks seems to mean that a peak in the array was skipped, not that there's a peak missing f.skippedPeaks = skippedPeak; f.comprised = peptidePeaks; f.mzPeak0 = peptidePeaks[0].mz; } /** * Calculate a "KL" score based on averages from the HMDB database * * Side effect: set description of suspected chlorine-containing features * @param mass * @param peaks * @return */ protected float calcKlSmallMolecule(float mass, float[] peaks) { if (peaks.length == 1) return -1; double idealLogPeakRatio = RegressionUtilities.mapValueUsingCoefficients(MASS_2PEAK_LOGRATIO_COEFFS, mass); float actualLogPeakRatio = (float) Math.log(peaks[0] / peaks[1]); float kl = (float) Math.abs(actualLogPeakRatio - idealLogPeakRatio) / (float) MASS_2PEAK_LOGRATIO_ERROR_SD; return kl; } protected int findClosestPeak(Spectrum.Peak[] peaks, float x, int start) { float dist = Math.abs(x - peaks[start].mz); int i = start + 1; for (; i < peaks.length; i++) { float d = Math.abs(x - peaks[i].mz); if (d > dist) break; dist = d; } return i - 1; } static double distanceNearestFraction(double x, double denom) { // scale so we can do distance to nearest integer double t = x * denom; double delta = Math.abs(t - Math.round(t)); return delta / denom; } /** * I used to do this pairwise, but had problem with weird * non-trasitive comparison heurisitics. * * This could undoubtably be improved, maybe with a combined * scoring function fn(charge, peaks, kl)? or new binary * feature comparision BetterFeature(a,b) * * @param candidates * @return */ static Feature determineBestFeature(List<Feature> candidates) { if (candidates.size() == 0) return null; if (candidates.size() == 1) return candidates.get(0); //If there are any features with multiple peaks, don't consider single-peak features List<Feature> candidatesWithMultiplePeaks = new ArrayList<Feature>(); for (Feature feature : candidates) { if (feature.peaks > 1) candidatesWithMultiplePeaks.add(feature); } if (!candidatesWithMultiplePeaks.isEmpty()) { candidates = candidatesWithMultiplePeaks; } //sort by peaks descending, up to 3, then by kl descending. Which basically means sort by KL Collections.sort(candidates, new Comparator<Feature>() { public int compare(Feature o1, Feature o2) { int klCompare = _compareDesc(Math.min(o1.peaks,3), Math.min(o2.peaks,3)); if (klCompare == 0) return _compareAsc(o1.kl, o2.kl); return klCompare; } }); return candidates.get(0); } static int _compareAsc(float a, float b) { return a == b ? 0 : a < b ? -1 : 1; } static int _compareDesc(float a, float b) { return a == b ? 0 : a < b ? 1 : -1; } /** * Create a new feature representing this peak at a given charge * @param peakHighest * @param mz * @param charge * @return */ protected Feature newFeatureRange(Spectrum.Peak peakHighest, float mz, int charge) { Feature f; if (peakHighest instanceof Feature) f = new Feature((Feature)peakHighest); else f = new Feature(peakHighest); f.charge = charge; f.mz = mz; return f; } /** * Side effects: sets resampling freq in SpectrumResampler and sets max resampled distance between features * @param resamplingFrequency */ public void setResamplingFrequency(int resamplingFrequency) { SpectrumResampler.setResampleFrequency(resamplingFrequency); _maxResampledDistanceBetweenFeatures = _maxAbsDistanceBetweenPeaks / (resamplingFrequency - 1); } public double getMaxAbsDistanceBetweenPeaks() { return _maxAbsDistanceBetweenPeaks; } public boolean isKeepStatistics() { return keepStatistics; } public void setKeepStatistics(boolean keepStatistics) { this.keepStatistics = keepStatistics; } public void plotStatistics() { if (keepStatistics) { PanelWithHistogram pwhNumCandidates = new PanelWithHistogram(numCandidatesList, "#Candidate Features"); pwhNumCandidates.displayInTab(); // PanelWithScatterPlot pwsp = // new PanelWithScatterPlot(numCandidatesList, klScoresList, "KL vs. #Candidates"); // pwsp.setPointSize(2); // pwsp.displayInTab(); } } public boolean isNegativeChargeMode() { return negativeChargeMode; } public void setNegativeChargeMode(boolean negativeChargeMode) { this.negativeChargeMode = negativeChargeMode; } }