/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1210 $ by $Author: glycoslave $ on $Date:: 2009-06-12 #$
*/
package org.eurocarbdb.application.glycoworkbench.plugin.peakpicker;
import org.eurocarbdb.application.glycanbuilder.*;
import org.eurocarbdb.application.glycoworkbench.*;
/**
Estimates the signal/noise (S/N) ratio of each data point in a scan by using the median (histogram based)
For each datapoint in the given scan, we collect a range of data points around it (param: <i>WinLen</i>).
The noise for a datapoint is estimated to be the median of the intensities of the current window.
If the number of elements in the current window is not sufficient (param: <i>MinReqElements</i>),
the noise level is set to a default value (param: <i>NoiseForEmptyWindow</i>).
The whole computation is histogram based, so the user will need to supply a number of bins (param: <i>BinCount</i>), which determines
the level of error and runtime. The maximal intensity for a datapoint to be included in the histogram can be either determined
automatically (params: <i>AutoMaxIntensity</i>, <i>AutoMode</i>) by two different methods or can be set directly by the user (param: <i>MaxIntensity</i>).
If the (estimated) <i>MaxIntensity</i> value is too low and the median is found to be in the last (&highest) bin, a warning to std:err will be given. In this case you should increase
<i>MaxIntensity</i> (and optionally the <i>BinCount</i>).
Changing any of the parameters will invalidate the S/N values (which will invoke a recomputation on the next request).
@note
Warning to *stderr* if sparse_window_percent > 20
- percent of windows that have less than <i>MinReqElements</i> of elements
(noise estimates in those windows are simply a constant <i>NoiseForEmptyWindow</i>)
.
Warning to *stderr* if histogram_oob_percent (oob=_out_of_bounds) > 1
- percentage of median estimations that had to rely on the last(=rightmost) bin
which gives an unreliable result
.
@ref SignalToNoiseEstimatorMedian_Parameters are explained on a separate page.
@ingroup Filtering
@subpage SignalToNoiseEstimatorMedian_Parameters are explained on a seperate page
*/
import java.util.*;
public class SignalToNoiseEstimatorMedian extends SignalToNoiseEstimator {
/// method to use for estimating the maximal intensity that is used for histogram calculation
public static final int MANUAL = -1;
public static final int AUTOMAXBYSTDEV = 0;
public static final int AUTOMAXBYPERCENT = 1;
// members
/// maximal intensity considered during binning (values above get discarded)
private double max_intensity_;
/// parameter for initial automatic estimation of "max_intensity_": a stdev multiplier
private double auto_max_stdev_factor_;
/// parameter for initial automatic estimation of "max_intensity_" percentile or a stdev
private double auto_max_percentile_;
/// determines which method shall be used for estimating "max_intensity_". valid are MANUAL=-1, AUTOMAXBYSTDEV=0 or AUTOMAXBYPERCENT=1
private int auto_mode_;
/// range of data points which belong to a window in Thomson
private double win_len_;
/// number of bins in the histogram
private int bin_count_;
/// minimal number of elements a window needs to cover to be used
private int min_required_elements_;
/// used as noise value for windows which cover less than "min_required_elements_"
/// use a very high value if you want to get a low S/N result
private double noise_for_empty_window_;
/// default constructor
public SignalToNoiseEstimatorMedian()
{
setName("SignalToNoiseEstimatorMedian");
defaults_.setValue("MaxIntensity", -1., "maximal intensity considered for histogram construction. By default, it will be calculated automatically (see AutoMode)." +
" Only provide this parameter if you know what you are doing (and change 'AutoMode' to '-1')!" +
" All intensities EQUAL/ABOVE 'MaxIntensity' will be added to the LAST histogram bin." +
" If you choose 'MaxIntensity' too small, the noise estimate might be too small as well. " +
" If chosen too big, the bins become quite large (which you could counter by increasing 'BinCount', which increases runtime)." +
" In general, the Median-S/N estimator is more robust to a manual MaxIntensity than the MeanIterative-S/N.");
defaults_.setValue("AutoMaxStdevFactor", 3.0, "parameter for 'MaxIntensity' estimation (if 'AutoMode' == 0): mean + 'AutoMaxStdevFactor' * stdev");
defaults_.setValue("AutoMaxPercentile", 95, "parameter for 'MaxIntensity' estimation (if 'AutoMode' == 1): AutoMaxPercentile th percentile");
defaults_.setValue("AutoMode", 0, "method to use to determine maximal intensity: -1 --> use 'MaxIntensity'; 0 --> 'AutoMaxStdevFactor' method (default); 1 --> 'AutoMaxPercentile' method");
defaults_.setValue("WinLen", 200.0, "window length in Thomson");
defaults_.setValue("BinCount", 30, "number of bins used for histogram");
defaults_.setValue("MinRequiredElements", 10, "minimum number of elements required in a window (otherwise it is considered sparse)");
defaults_.setValue("NoiseForEmptyWindow", Math.pow(10.0,20), "noise value used for sparse windows");
defaultsToParam_();
}
protected void updateMembers_()
{
max_intensity_ = (Double)param_.getValue("MaxIntensity");
auto_max_stdev_factor_ = (Double)param_.getValue("AutoMaxStdevFactor");
auto_max_percentile_ = (Integer)param_.getValue("AutoMaxPercentile");
auto_mode_ = (Integer)param_.getValue("AutoMode");
win_len_ = (Double)param_.getValue("WinLen");
bin_count_ = (Integer)param_.getValue("BinCount");
min_required_elements_ = (Integer)param_.getValue("MinRequiredElements");
noise_for_empty_window_= (Double)param_.getValue("NoiseForEmptyWindow");
is_result_valid_ = false;
}
//-----------------------
// Accessors
/// Non-mutable access to the maximal intensity that is included in the histogram (higher values get discarded)
public double getMaxIntensity() {
return max_intensity_;
}
/// Mutable access to the maximal intensity that is included in the histogram (higher values get discarded)
public void setMaxIntensity(double max_intensity)
{
max_intensity_ = max_intensity;
param_.setValue("MaxIntensity", max_intensity_);
}
/// Non-Mutable access to the AutoMaxStdevFactor-Param, which holds a factor for stddev (only used if autoMode=1)
public double getAutoMaxStdevFactor() {
return auto_max_stdev_factor_;
}
/// Mutable access to the AutoMaxStdevFactor-Param, which holds a factor for stddev (only used if autoMode=1)
public void setAutoMaxStdevFactor(double value)
{
auto_max_stdev_factor_ = value;
param_.setValue("AutoMaxStdevFactor", auto_max_stdev_factor_);
}
/// get the AutoMaxPercentile-Param, which holds a percentile (only used if autoMode=2)
public double getAutoMaxPercentile() {
return auto_max_percentile_;
}
/// Mutable access to the AutoMaxPercentile-Param, which holds a percentile (only used if autoMode=2)
public void setAutoMaxPercentile(double value)
{
auto_max_percentile_ = value;
param_.setValue("AutoMaxPercentile", auto_max_percentile_);
}
/// @brief -1 will disable it. 0 is default. 1 is alternative method
/// Non-mutable access to AutoMode, which determines the heuristic to find MaxIntensity. See Class description.
public int getAutoMode() {
return auto_mode_;
}
/// @brief -1 will disable it. 0 is default. 1 is alternative method
/// Mutable access to AutoMode, which determines the heuristic to find MaxIntensity. See Class description.
public void setAutoMode(int auto_mode)
{
auto_mode_ = auto_mode;
param_.setValue("AutoMode", auto_mode_);
}
/// Non-mutable access to the window length (in Thomson)
public double getWinLen() {
return win_len_;
}
/// Mutable access to the window length (in Thomson)
public void setWinLen(double win_len)
{
win_len_ = win_len;
param_.setValue("WinLen", win_len_);
}
/// Non-mutable access to the number of bins used for the histogram (the more bins, the better the approximation, but longer runtime)
public int getBinCount() {
return bin_count_;
}
/// Mutable access to the number of bins used for the histogram
public void setBinCount(int bin_count)
{
bin_count_ = bin_count;
param_.setValue("BinCount", bin_count_);
}
/// Non-mutable access to the minimum required elements in a window, to be evaluated.
public int getMinReqElements() {
return min_required_elements_;
}
/// Mutable access to the minimum required elements in a window, to be evaluated.
public void setMinReqElements(int min_required_elements)
{
min_required_elements_ = min_required_elements;
param_.setValue("MinRequiredElements", min_required_elements_);
}
/// Non-mutable access to the noise value that is used if a window contains not enough elements
public double getNoiseForEmptyWindow() {
return noise_for_empty_window_;
}
/// Mutable access to the noise value that is used if a window contains not enough elements
public void setNoiseForEmptyWindow(double noise_for_empty_window)
{
noise_for_empty_window_ = noise_for_empty_window;
param_.setValue("NoiseForEmptyWindow", noise_for_empty_window_);
}
// ------------------
/// calculate StN values for all datapoints given, by using a sliding window approach
/// @param scan_first_ first element in the scan
/// @param scan_last_ last element in the scan (disregarded)
protected void computeSTN_(Peak[] data_, int scan_first_, int scan_last_) throws Exception
{
// reset counter for sparse windows
double sparse_window_percent = 0;
// reset counter for histogram overflow
double histogram_oob_percent = 0;
// reset the results
stn_estimates_.clear();
// maximal range of histogram needs to be calculated first
if (auto_mode_ == AUTOMAXBYSTDEV) {
// use MEAN+auto_max_intensity_*STDEV as threshold
GaussianEstimate gauss_global = estimate_(data_, scan_first_, scan_last_);
max_intensity_ = gauss_global.mean + Math.sqrt(gauss_global.variance)*auto_max_stdev_factor_;
}
else if (auto_mode_ == AUTOMAXBYPERCENT) {
// get value at "auto_max_percentile_"th percentile
// we use a histogram approach here as well.
if ((auto_max_percentile_ < 0) || (auto_max_percentile_ > 100)) {
String s = "" + auto_max_percentile_;
throw new Exception("AutoMode is on AUTOMAXBYPERCENT! AutoMaxPercentile is not in [0,100]. Use setAutoMaxPercentile(<value>) to change it!");
}
int[] histogram_auto = new int[100];
Arrays.fill(histogram_auto,0);
// find maximum of current scan
int size = 0;
double maxInt = 0;
int run = scan_first_;
while( run != scan_last_ ) {
maxInt = Math.max(maxInt, data_[run].getIntensity());
++size;
++run;
}
double bin_size = maxInt / 100;
// fill histogram
run = scan_first_;
while (run != scan_last_) {
++histogram_auto[(int) ((data_[run].getIntensity()-1) / bin_size)];
++run;
}
// add up element counts in histogram until ?th percentile is reached
int elements_below_percentile = (int) (auto_max_percentile_ * size / 100);
int elements_seen = 0;
int i = -1;
run = scan_first_;
while (run != scan_last_ && elements_seen < elements_below_percentile) {
++i;
elements_seen += histogram_auto[i];
++run;
}
max_intensity_ = (((double)i) + 0.5) * bin_size;
}
else { //if (auto_mode_ == MANUAL)
if (max_intensity_<=0) {
String s = "" + max_intensity_;
throw new Exception("AutoMode is on MANUAL! MaxIntensity is <=0. Needs to be positive! Use setMaxIntensity(<value>) or enable AutoMode!");
}
}
if (max_intensity_ <= 0) {
System.err.println( "TODO SignalToNoiseEstimatorMedian: the max_intensity_ value should be positive! " + max_intensity_ );
return;
}
int window_pos_center = scan_first_;
int window_pos_borderleft = scan_first_;
int window_pos_borderright = scan_first_;
double window_half_size = win_len_ / 2;
double bin_size = max_intensity_ / bin_count_;
int bin_count_minus_1 = bin_count_ - 1;
int[] histogram = new int[bin_count_];
Arrays.fill(histogram,0);
double[] bin_value = new double[bin_count_];
Arrays.fill(bin_value,0.);
// calculate average intensity that is represented by a bin
for (int bin=0; bin<bin_count_; bin++) {
histogram[bin] = 0;
bin_value[bin] = (bin + 0.5) * bin_size;
}
// bin in which a datapoint would fall
int to_bin = 0;
// index of bin where the median is located
int median_bin = 0;
// additive number of elements from left to x in histogram
int element_inc_count = 0;
// tracks elements in current window, which may vary because of uneven spaced data
int elements_in_window = 0;
// number of windows
int window_count = 0;
// number of elements where we find the median
int element_in_window_half = 0;
double noise; // noise value of a datapoint
// determine how many elements we need to estimate (for progress estimation)
int windows_overall = 0;
int run = scan_first_;
while (run != scan_last_) {
++windows_overall;
++run;
}
// MAIN LOOP
while (window_pos_center != scan_last_) {
// erase all elements from histogram that will leave the window on the LEFT side
while( data_[window_pos_borderleft].getMZ() < data_[window_pos_center].getMZ() - window_half_size ) {
to_bin = Math.min((int) ((data_[window_pos_borderleft].getIntensity()) / bin_size), bin_count_minus_1);
--histogram[to_bin];
--elements_in_window;
++window_pos_borderleft;
}
// add all elements to histogram that will enter the window on the RIGHT side
while ( (window_pos_borderright != scan_last_)
&&(data_[window_pos_borderright].getMZ() <= data_[window_pos_center].getMZ() + window_half_size ) ) {
to_bin = Math.min((int) ((data_[window_pos_borderright].getIntensity()) / bin_size), bin_count_minus_1);
++histogram[to_bin];
++elements_in_window;
++window_pos_borderright;
}
if (elements_in_window < min_required_elements_) {
noise = noise_for_empty_window_;
++sparse_window_percent;
}
else {
// find bin i where ceil[elements_in_window/2] <= sum_c(0..i){ histogram[c] }
median_bin = -1;
element_inc_count = 0;
element_in_window_half = (elements_in_window+1) / 2;
while (median_bin < bin_count_minus_1 && element_inc_count < element_in_window_half) {
++median_bin;
element_inc_count += histogram[median_bin];
}
// increase the error count
if (median_bin == bin_count_minus_1)
++histogram_oob_percent;
// just avoid division by 0
noise = Math.max(1.0, bin_value[median_bin]);
}
// store result
stn_estimates_.put(data_[window_pos_center], data_[window_pos_center].getIntensity() / noise);
// advance the window center by one datapoint
++window_pos_center;
++window_count;
} // end while
sparse_window_percent = sparse_window_percent *100 / window_count;
histogram_oob_percent = histogram_oob_percent *100 / window_count;
// warn if percentage of sparse windows is above 20%
if (sparse_window_percent > 20) {
System.err.println( "WARNING in SignalToNoiseEstimatorMedian: "
+ sparse_window_percent
+ "% of all windows were sparse. You should consider increasing WindowLength or decreasing MinReqElementsInWindow" );
}
// warn if percentage of possibly wrong median estimates is above 1%
if (histogram_oob_percent > 1) {
System.err.println( "WARNING in SignalToNoiseEstimatorMedian: "
+ histogram_oob_percent
+ "% of all Signal-to-Noise estimates are too high, because the median was found in the rightmost histogram-bin. "
+ "You should consider increasing MaxIntensity (and maybe BinCount with it, to keep bin width reasonable)");
}
} // end of shiftWindow_
}