/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.toolbox.proteomics;
import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.AmtExtraInfoDef;
import org.fhcrc.cpl.toolbox.gui.chart.ScatterPlotDialog;
import org.fhcrc.cpl.toolbox.statistics.BasicStatistics;
import org.fhcrc.cpl.toolbox.statistics.MatrixUtil;
import org.fhcrc.cpl.toolbox.statistics.RegressionUtilities;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
/**
* Utilities for regression analysis specific to msInspect -- mostly making use of Feature.
* These utilities were originally in the class that became toolbox.RegressionUtilities
*/
public class ProteomicsRegressionUtilities
{
protected static Logger _log = Logger.getLogger(ProteomicsRegressionUtilities.class);
//modes for matching features by scan or by time
public static final int REGRESSION_MODE_SCAN = 0;
public static final int REGRESSION_MODE_TIME = 1;
//time-based matching has been shown to be far superior
public static final int DEFAULT_REGRESSION_MODE = REGRESSION_MODE_TIME;
public static double[] modalRegressionIterateDegree(double[] xset, double[] yset,
int maxDegree, double leverageNumerator,
double maxStudentizedResidual,
boolean showCharts)
throws IOException
{
Feature[] dummyFeatures = new Feature[xset.length];
double[] xsetOrig = xset;
double[] ysetOrig = yset;
for (int j=0; j<xset.length; j++)
{
dummyFeatures[j] = new Feature();
dummyFeatures[j].setTime((float) xset[j]);
AmtExtraInfoDef.setObservedHydrophobicity(dummyFeatures[j], yset[j]);
}
for (int degree=1; degree<maxDegree; degree++)
{
_log.debug("Applying degree " + degree);
boolean modal = true;
if (degree==1) modal = false;
dummyFeatures = selectFeaturesWithLowLeverageAndStudentizedResidual(
dummyFeatures,
xset, yset,
leverageNumerator,
maxStudentizedResidual, modal, degree, showCharts, false);
xset = new double[dummyFeatures.length];
yset = new double[dummyFeatures.length];
for (int j=0; j<dummyFeatures.length; j++)
{
xset[j] = dummyFeatures[j].getTime();
yset[j] = AmtExtraInfoDef.getObservedHydrophobicity(dummyFeatures[j]);
}
_log.debug("Remaining pairs: " + xset.length);
}
double[] regressionResult = RegressionUtilities.modalRegression(xset, yset, maxDegree);
if (showCharts)
{
int maxX = 0;
int minX = Integer.MAX_VALUE;
for (double x : xset)
{
if (x > maxX)
maxX = (int) x;
if (x < minX)
minX = (int) x;
}
ScatterPlotDialog spd = new ScatterPlotDialog();
int numDotsOnChart = (maxX-minX+1) / 2;
double[] chartXVals = new double[numDotsOnChart];
double[] chartYVals = new double[numDotsOnChart];
for (int j=0; j<numDotsOnChart; j++)
{
chartXVals[j] = minX + (2 * j);
chartYVals[j] =
RegressionUtilities.mapValueUsingCoefficients(regressionResult, chartXVals[j]);
}
spd.addData(xset, yset,
"Matches with low stud. res.");
spd.addData(xsetOrig, ysetOrig, "all mass matches");
spd.addData(chartXVals, chartYVals, "regression function");
spd.setAxisLabels("X","Y");
spd.setVisible(true);
}
return regressionResult;
}
/**
* return an array containing the subset of the allFeatures array in which all
* features have leverage < leverageNumerator/n
*
* Used in AmtDatabaseBuilder, and possibly also in selecting MS1 features with
* no embedded MS2 for matching
* @param allFeatures
* @param leverageNumerator
* @param scanOrTimeMode
* @return
*/
public static Feature[] selectFeaturesWithLowLeverage(Feature[] allFeatures,
double leverageNumerator,
int scanOrTimeMode)
{
int n = allFeatures.length;
double[] timeValues = new double[n];
for (int i=0; i<n; i++)
timeValues[i] = scanOrTimeMode == REGRESSION_MODE_TIME ?
allFeatures[i].getTime() : allFeatures[i].getScan();
double[] leverages = BasicStatistics.leverages(timeValues);
double maxLeverage = leverageNumerator / (double) n;
//System.err.println("max leverage: " + maxLeverage + ", n: " + n);
int passed=0;
return selectFeaturesWithLowAbsoluteSomething(allFeatures, leverages, maxLeverage);
}
public static Feature[] selectFeaturesWithLowAbsoluteSomething(Feature[] allFeatures,
double[] somethings,
double maxSomething)
{
int[] indexes = RegressionUtilities.selectIndexesWithLowAbsoluteSomething(somethings, maxSomething);
Feature[] result = new Feature[indexes.length];
for (int i=0; i<indexes.length; i++)
result[i] = allFeatures[indexes[i]];
return result;
}
/**
*
* @param features
* @param xValues
* @param yValues
* @param maxStudentizedResidual
* @param modalRegression
* @param degree
* @param showCharts
* @param assumePositiveCorr Assume positive correlation between x and y? If so, we can try to adjust for bias
* @return
*/
public static Feature[] selectFeaturesWithLowStudentizedResidual(
Feature[] features, double[] xValues, double[] yValues,
double maxStudentizedResidual, boolean modalRegression, int degree, boolean showCharts,
boolean assumePositiveCorr)
{
Feature[] result = selectFeaturesWithLowLeverageAndStudentizedResidual(
features, xValues, yValues, Integer.MAX_VALUE, maxStudentizedResidual,
modalRegression, degree, showCharts, assumePositiveCorr);
return result;
}
/**
* Select low-leverage features, where leverage is defined by mass
* @param features
* @param maxLeverage
* @return
*/
public static List<Feature> selectFeaturesWithLowAbsMassLeverage(Feature[] features, float maxLeverage)
{
double[] masses = new double[features.length];
for (int i=0; i<features.length; i++)
masses[i] = features[i].getMass();
double[] leverages = BasicStatistics.leverages(masses);
List<Feature> result = new ArrayList<Feature>();
for (int i=0; i<features.length; i++)
if (Math.abs(leverages[i]) < maxLeverage)
result.add(features[i]);
return result;
}
/**
* Calculate studentized residuals
* @param xValues
* @param yValues
* @return
*/
public static double[] calculateStudentizedResiduals(double[] xValues, double[] yValues)
{
double[] regressionResult = MatrixUtil.linearRegression(xValues, yValues);
double[] leverages = BasicStatistics.leverages(xValues);
double[] residuals = new double[xValues.length];
for (int i=0; i<xValues.length; i++)
{
double predictedValue =
RegressionUtilities.mapValueUsingCoefficients(regressionResult,
xValues[i]);
residuals[i] = yValues[i] - predictedValue;
}
return BasicStatistics.studentizedResiduals(xValues,
residuals, leverages);
}
/**
*
* @param features
* @param xValues
* @param yValues
* @param leverageNumerator
* @param maxStudentizedResidual
* @param modalRegression
* @param degree
* @param showCharts
* @param assumePositiveCorr Assume positive correlation between x and y? If so, we can try to adjust for bias
* @return
*/
public static Feature[] selectFeaturesWithLowLeverageAndStudentizedResidual(
Feature[] features, double[] xValues, double[] yValues,
double leverageNumerator, double maxStudentizedResidual,
boolean modalRegression, int degree, boolean showCharts, boolean assumePositiveCorr)
{
int[] indexes = RegressionUtilities.selectIndexesWithLowLeverageAndStudentizedResidual(
xValues, yValues, leverageNumerator, maxStudentizedResidual,
modalRegression, degree, showCharts, assumePositiveCorr);
Feature[] result = new Feature[indexes.length];
for (int i=0; i<result.length; i++)
result[i] = features[indexes[i]];
return result;
}
}