/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.feature.extraction;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.viewer.feature.extraction.strategy.BaseFeatureStrategy;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithHistogram;
import org.fhcrc.cpl.toolbox.gui.chart.PanelWithScatterPlot;
import org.apache.log4j.Logger;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
/**
* Default feature scorer.
* If we ever implement another one, maybe some machinery can be moved
* up into a base class*
*
* Has side effects on scored features: adjusts m/z, and sets kl, etc.
*
*/
public class DefaultFeatureScorer implements FeatureScorer
{
//maximum peaks per feature to consider
public static final int DEFAULT_MAX_PEAKS_PER_FEATURE = 10;
protected int _maxPeaksPerFeature = DEFAULT_MAX_PEAKS_PER_FEATURE;
private static Logger _log = Logger.getLogger(DefaultFeatureScorer.class);
// need to pick units to scale by, 0 means good 1 is pretty bad...
public static final float SUMSQUARES_MZ_WEIGHT = 6; // 1/6 of a Da is bad
public static final float SUMSQUARES_SCALED_INTENSITY_WEIGHT = 4; // 1/4 in scaled intesity is bad
protected boolean keepStatistics = false;
protected List<Float> unscaledMzDistances = new ArrayList<Float>();
protected List<Float> unscaledIntensityDistances = new ArrayList<Float>();
/**
* score a candidate feature f, and set the list of peaks that it's comprised of
* <p/>
* Expecting charge=0 features and peaks
*
* TODO: document this better
*
* @param f
* @param peaks
*/
public float scoreFeature(Feature f, Spectrum.Peak[] peaks)
{
double maxResampledDistanceBetweenFeatures =
DefaultPeakCombiner.DEFAULT_MAX_ABS_DISTANCE_BETWEEN_PEAKS /
(SpectrumResampler.getResampleFrequency() - 1);
int absCharge = Math.abs(f.charge);
float invAbsCharge = 1.0F / absCharge;
float mass = 0f;
if (f.charge >= 0)
mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * absCharge;
else mass = f.mz * absCharge;
Spectrum.Peak[] peptidePeaks = new Spectrum.Peak[_maxPeaksPerFeature];
// find start position in peak list
int p = Arrays.binarySearch(peaks, f, Spectrum.comparePeakMzAsc);
if (p < 0) p = -(p + 1);
p = Math.max(0, p - 1);
int pLastFound = p;
float mzP0 = f.mz;
float sum = 0.0F;
float intensityLast = 0.0F;
float intensityHighest = 0.0F;
boolean skippedPeak = false;
float distSum = 0;
float distCount = 0;
for (int Pi = 0; Pi < peptidePeaks.length; Pi++)
{
float mzPi = mzP0 + Pi * invAbsCharge + (distCount > 0 ? distSum/distCount : 0);
p = findClosestPeak(peaks, mzPi, p);
Spectrum.Peak peakFound = peaks[p];
float dist = Math.abs(peakFound.mz - mzPi);
//QUESTION: why 2.5?
boolean found =
!peakFound.excluded &&
Math.abs(dist) < 2.5 * maxResampledDistanceBetweenFeatures;
if (!found)
{
// miss two in a row, call it quits
if (Pi > 0 && null == peptidePeaks[Pi-1])
break;
}
else
{
intensityHighest = Math.max(peakFound.intensity, intensityHighest);
// if peak is too BIG stop:
// CONSIDER do this at end, use fn of signal v. expected?
// don't like hard cut-offs
if (Pi > 2 && peakFound.intensity != intensityHighest &&
peakFound.intensity > intensityLast * 1.33 + f.median)
break;
// fine-tune anchor MZ
if (peakFound.intensity > intensityHighest/2)
{
distSum += dist;
distCount++;
}
for (int s = pLastFound + 1; s < p; s++)
{
Spectrum.Peak skipped = peaks[s];
if (skipped.intensity > intensityLast / 2 + f.median)
skippedPeak = true;
}
// record it
peptidePeaks[Pi] = peakFound;
pLastFound = p;
intensityLast = peakFound.intensity;
}
}
// Adjust MZ for feature
float mean=0;
float weight=0;
for (int i=0 ; i<peptidePeaks.length ; i++)
{
//float mi = m0 + i * invCharge;
if (null == peptidePeaks[i]) continue;
mean += (peptidePeaks[i].mz - i*invAbsCharge) * peptidePeaks[i].intensity;
weight += peptidePeaks[i].intensity;
}
f.mz = mean/weight;
//
// Calculate quality measures
//
//The 'signal' array will hold values for each peak, scaled to sum to 1.
//Missing peaks are filled in with intensity 0.1 prior to scaling.
//Maximum of 6 peaks considered.
//This block also determines which peaks are "counted" in the Feature.peaks count.
//Only features with intensity >= 1/50th of the most-intense peaks, and greater than twice
//the median background noise, are counted
int signalLength = Math.min(6, _maxPeaksPerFeature);
float[] signal = new float[signalLength];
int countPeaks = 0;
for (int i = 0; i < peptidePeaks.length; i++)
{
Spectrum.Peak peak = peptidePeaks[i];
if (null != peak && (peak.intensity > intensityHighest/50 &&
peak.intensity > 2 * f.median))
countPeaks++;
if (i < signal.length)
{
float v = null == peak ? 0 : peptidePeaks[i].intensity;
signal[i] = Math.max(0.1F, v);
sum += signal[i];
}
}
for (int i = 0; i < signal.length; i++)
signal[i] /= sum;
f.kl = Spectrum.KLPoissonDistance(mass, signal);
//TODO: Do we need another score?
//f.score = f.kl >= 0 ? f.intensity * (float)-Math.log(f.kl + Math.exp(-1)) : 0F;
f.setPeaks(countPeaks);
f.skippedPeaks = skippedPeak;
f.comprised = peptidePeaks;
f.mzPeak0 = peptidePeaks[0].mz;
//
// What other metrics would be useful
//
float sumDist = calcSumSquaresDistance2D(f, signal, peptidePeaks);
return sumDist;
}
/**
* Assign a score to a particular feature, given a scaled representation of
* its peak intensities and the peaks themselves. This is kind of a 2-dimensional
* sum-squares distance measure.
*
* Lower is better
* @param f
* @param scaledIntensityDistribution
* @param peptidePeaks
* @return
*/
public float calcSumSquaresDistance2D(Feature f,
float[] scaledIntensityDistribution,
Spectrum.Peak[] peptidePeaks)
{
//Guess mass. don't actually set f.mass, since this may not be the charge this feature
//ends up having
float mass = 0f;
if (f.charge >= 0)
mass = (f.mz-Spectrum.HYDROGEN_ION_MASS) * f.charge;
else mass = f.mz * -f.charge;
//Get a scaled poisson distribution for this mass
float[] expectedDistribution = Spectrum.Poisson(mass);
//Get the index of the last peak actually present. If that's beyond the end of the expected distribution,
//move it back
int lastPeak = 0;
for (int i=0 ; i<scaledIntensityDistribution.length ; i++)
if (null != peptidePeaks[i])
lastPeak = i;
lastPeak = Math.min(lastPeak, expectedDistribution.length-1);
float sumDist = 0;
for (int i=0 ; i<=lastPeak ; i++)
{
float dist = 1; // what should I use here....
//only consider peaks that are actually present.
//TODO: does this give an unfair advantage to features with missing peaks?
if (null != peptidePeaks[i])
{
//compare observed peaks with theoretical peaks based on Dalton offsets from the base feature m/z
float theoreticalPeakMz = (f.mz + i / (float) Math.abs(f.charge));
float distMZ = Math.abs(peptidePeaks[i].mz - theoreticalPeakMz);
//Remove the effect of resampling error on m/z accuracy
distMZ = Math.max(0, distMZ - 1/(2*SpectrumResampler.getResampleFrequency()));
float distIn =
Math.abs(scaledIntensityDistribution[i] - expectedDistribution[i]);
if (keepStatistics)
{
unscaledMzDistances.add(distMZ);
unscaledIntensityDistances.add(distIn);
}
//scale the dimensions appropriately
distMZ *= SUMSQUARES_MZ_WEIGHT;
distIn *= SUMSQUARES_SCALED_INTENSITY_WEIGHT;
dist = distMZ * distMZ + distIn * distIn;
//Scale the contribution of this peak by the predicted scaled intensity of the peak
sumDist += dist * expectedDistribution[i];
//System.err.println("\t\tmass: " + mass + ", distMZ: " + distMZ + ", distIn: " + distIn + ", first int: " + scaledIntensityDistribution[0] + ", first expected int: " + expectedDistribution[0] + ", sumdist: " + sumDist);
}
}
if (_log.isDebugEnabled() && Float.isNaN(sumDist))
{
_log.debug("scorePeakDistribution: returning NaN. mz = " + f.mz + ", charge=" + f.charge);
}
return sumDist;
}
protected int findClosestPeak(Spectrum.Peak[] peaks, float x, int start)
{
float dist = Math.abs(x - peaks[start].mz);
int i = start + 1;
for (; i < peaks.length; i++)
{
float d = Math.abs(x - peaks[i].mz);
if (d > dist)
break;
dist = d;
}
return i - 1;
}
public int getMaxPeaksPerFeature()
{
return _maxPeaksPerFeature;
}
public void setMaxPeaksPerFeature(int maxPeaksPerFeature)
{
this._maxPeaksPerFeature = maxPeaksPerFeature;
}
public boolean isKeepStatistics()
{
return keepStatistics;
}
public void setKeepStatistics(boolean keepStatistics)
{
this.keepStatistics = keepStatistics;
}
public void plotStatistics()
{
if (!keepStatistics)
return;
PanelWithHistogram pwhMz = new PanelWithHistogram(unscaledMzDistances,"Unscaled M/Z Dist");
pwhMz.displayInTab();
PanelWithHistogram pwhInt = new PanelWithHistogram(unscaledIntensityDistances,"Unscaled Int Dist");
pwhInt.displayInTab();
// float[] mzDistArray = new float[unscaledMzDistances.size()];
// float[] intDistArray = new float[unscaledMzDistances.size()];
// for (int i=0; i<unscaledMzDistances.size(); i++)
// {
// mzDistArray[i] = unscaledMzDistances.get(i);
// intDistArray[i] = unscaledIntensityDistances.get(i);
// }
PanelWithScatterPlot pwsp = new PanelWithScatterPlot(unscaledMzDistances, unscaledIntensityDistances,
"mz vs int dist");
pwsp.setPointSize(1);
pwsp.setAxisLabels("m/z distance","intensity distance");
pwsp.displayInTab();
}
}